1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <os/atomic_private.h>
68
69 #include <string.h>
70
71 #include <sys/mcache.h>
72
/*
 * Bytes needed for an mcache_t that embeds `n' per-CPU structures;
 * computed as the offset of element `n' of the trailing mc_cpu array.
 */
#define MCACHE_SIZE(n) \
	__builtin_offsetof(mcache_t, mc_cpu[n])

/* Allocate extra in case we need to manually align the pointer */
#define MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

/* Per-CPU structure of cache `c' for the CPU we are currently running on */
#define MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define MCACHE_LIST_LOCK() {                                    \
	lck_mtx_lock(&mcache_llock);                            \
	mcache_llock_owner = current_thread();                  \
}

#define MCACHE_LIST_UNLOCK() {                                  \
	mcache_llock_owner = NULL;                              \
	lck_mtx_unlock(&mcache_llock);                          \
}

/* Shorthands for the per-cache bucket and per-CPU mutex operations */
#define MCACHE_LOCK(l)          lck_mtx_lock(l)
#define MCACHE_UNLOCK(l)        lck_mtx_unlock(l)
#define MCACHE_LOCK_TRY(l)      lck_mtx_try_lock(l)
103
/* Number of CPUs; set once in mcache_init() from ml_wait_max_cpus() */
static unsigned int ncpu;
/* CPU cache line size; probed lazily by mcache_cache_line_size() */
static unsigned int cache_line_size;
/* Thread currently inside MCACHE_LIST_LOCK(), or NULL */
static struct thread *mcache_llock_owner;
static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
/* Zone backing the mcache_t structures themselves */
static struct zone *mcache_zone;
/* Reap timer interval and leeway (consumed by mcache_reap_timeout) */
static const uint32_t mcache_reap_interval = 15;
static const uint32_t mcache_reap_interval_leeway = 2;
/* Guard against concurrent reaps (manipulated by reap start/done paths) */
static UInt32 mcache_reaping;
/* Set once mcache_init() completes; gates bucket enabling at create time */
static int mcache_ready;
/* Set once the periodic cache update has been armed */
static int mcache_updating;

/* Bucket-lock contention threshold that triggers bucket resizing */
static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

/* Transaction history depth per audit record; VERIFY'd >= 2 at init */
int mca_trn_max = MCA_TRN_MAX;

/*
 * Bucket size table.  Each row is { bt_bktsize, bt_minbuf, bt_maxbuf,
 * bt_cache }:
 *  - bt_bktsize: number of objects a bucket of this type holds; the
 *    cache backing the bucket structure itself is created in
 *    mcache_init() with room for bt_bktsize + 1 pointers.
 *  - bt_minbuf: selection threshold; mcache_create_common() walks the
 *    table and picks the first row whose bt_minbuf is below the
 *    cache's chunk size, so larger objects get smaller buckets.
 *  - bt_maxbuf: presumably the bound used when resizing buckets to a
 *    different type (resize logic is outside this chunk) — TODO confirm.
 *  - bt_cache: filled in at init with the cache supplying bucket
 *    structures of this size.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1, 4096, 32768, NULL },
	{ 3, 2048, 16384, NULL },
	{ 7, 1024, 12288, NULL },
	{ 15, 256, 8192, NULL },
	{ 31, 64, 4096, NULL },
	{ 47, 0, 2048, NULL },
	{ 63, 0, 1024, NULL },
	{ 95, 0, 512, NULL },
	{ 143, 0, 256, NULL },
	{ 165, 0, 0, NULL },
};

/* Forward declarations; definitions appear below and later in the file */
static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static void mcache_cpu_batch_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static uint32_t mcache_bkt_batch_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkt_t **, uint32_t);
static void mcache_bkt_batch_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_zero(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);
__attribute__((noreturn))
static void mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got);

/* Global list of all caches, protected by MCACHE_LIST_LOCK() */
static LIST_HEAD(, mcache) mcache_head;
/* Cache for mcache_audit_t structures; created in mcache_init() */
mcache_t *mcache_audit_cache;

/* Deferred-work thread calls for reaping and the periodic update */
static thread_call_t mcache_reap_tcall;
static thread_call_t mcache_update_tcall;
178
179 /*
180 * Initialize the framework; this is currently called as part of BSD init.
181 */
182 __private_extern__ void
mcache_init(void)183 mcache_init(void)
184 {
185 mcache_bkttype_t *btp;
186 unsigned int i;
187 char name[32];
188
189 VERIFY(mca_trn_max >= 2);
190
191 ncpu = ml_wait_max_cpus();
192 (void) mcache_cache_line_size(); /* prime it */
193
194 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
195 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
196 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
197 panic("mcache_init: thread_call_allocate failed");
198 /* NOTREACHED */
199 __builtin_unreachable();
200 }
201
202 mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE,
203 ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
204
205 LIST_INIT(&mcache_head);
206
207 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
208 btp = &mcache_bkttype[i];
209 (void) snprintf(name, sizeof(name), "bkt_%d",
210 btp->bt_bktsize);
211 btp->bt_cache = mcache_create(name,
212 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
213 }
214
215 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
216 mcache_flags &= MCF_FLAGS_MASK;
217
218 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
219 0, 0, MCR_SLEEP);
220
221 mcache_applyall(mcache_cache_bkt_enable);
222 mcache_ready = 1;
223
224 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
225 ncpu, CPU_CACHE_LINE_SIZE);
226 }
227
228 /*
229 * Return the global mcache flags.
230 */
231 __private_extern__ unsigned int
mcache_getflags(void)232 mcache_getflags(void)
233 {
234 return mcache_flags;
235 }
236
237 /*
238 * Return the CPU cache line size.
239 */
240 __private_extern__ unsigned int
mcache_cache_line_size(void)241 mcache_cache_line_size(void)
242 {
243 if (cache_line_size == 0) {
244 ml_cpu_info_t cpu_info;
245 ml_cpu_get_info(&cpu_info);
246 cache_line_size = (unsigned int)cpu_info.cache_line_size;
247 }
248 return cache_line_size;
249 }
250
251 /*
252 * Create a cache using the zone allocator as the backend slab allocator.
253 * The caller may specify any alignment for the object; if it specifies 0
254 * the default alignment (MCACHE_ALIGN) will be used.
255 */
256 __private_extern__ mcache_t *
mcache_create(const char * name,size_t bufsize,size_t align,u_int32_t flags,int wait __unused)257 mcache_create(const char *name, size_t bufsize, size_t align,
258 u_int32_t flags, int wait __unused)
259 {
260 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
261 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
262 }
263
264 /*
265 * Create a cache using a custom backend slab allocator. Since the caller
266 * is responsible for allocation, no alignment guarantee will be provided
267 * by this framework.
268 */
269 __private_extern__ mcache_t *
mcache_create_ext(const char * name,size_t bufsize,mcache_allocfn_t allocfn,mcache_freefn_t freefn,mcache_auditfn_t auditfn,mcache_logfn_t logfn,mcache_notifyfn_t notifyfn,void * arg,u_int32_t flags,int wait __unused)270 mcache_create_ext(const char *name, size_t bufsize,
271 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
272 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
273 u_int32_t flags, int wait __unused)
274 {
275 return mcache_create_common(name, bufsize, 0, allocfn,
276 freefn, auditfn, logfn, notifyfn, arg, flags, 0);
277 }
278
279 /*
280 * Common cache creation routine.
281 */
282 static mcache_t *
mcache_create_common(const char * name,size_t bufsize,size_t align,mcache_allocfn_t allocfn,mcache_freefn_t freefn,mcache_auditfn_t auditfn,mcache_logfn_t logfn,mcache_notifyfn_t notifyfn,void * arg,u_int32_t flags,int need_zone)283 mcache_create_common(const char *name, size_t bufsize, size_t align,
284 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
285 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
286 u_int32_t flags, int need_zone)
287 {
288 mcache_bkttype_t *btp;
289 mcache_t *cp = NULL;
290 size_t chunksize;
291 void *buf, **pbuf;
292 unsigned int c;
293 char lck_name[64];
294
295 buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
296
297 /*
298 * In case we didn't get a cache-aligned memory, round it up
299 * accordingly. This is needed in order to get the rest of
300 * structure members aligned properly. It also means that
301 * the memory span gets shifted due to the round up, but it
302 * is okay since we've allocated extra space for this.
303 */
304 cp = (mcache_t *)
305 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
306 pbuf = (void **)((intptr_t)cp - sizeof(void *));
307 *pbuf = buf;
308
309 /*
310 * Guaranteed alignment is valid only when we use the internal
311 * slab allocator (currently set to use the zone allocator).
312 */
313 if (!need_zone) {
314 align = 1;
315 } else {
316 /* Enforce 64-bit minimum alignment for zone-based buffers */
317 if (align == 0) {
318 align = MCACHE_ALIGN;
319 }
320 align = P2ROUNDUP(align, MCACHE_ALIGN);
321 }
322
323 if ((align & (align - 1)) != 0) {
324 panic("mcache_create: bad alignment %lu", align);
325 /* NOTREACHED */
326 __builtin_unreachable();
327 }
328
329 cp->mc_align = align;
330 cp->mc_slab_alloc = allocfn;
331 cp->mc_slab_free = freefn;
332 cp->mc_slab_audit = auditfn;
333 cp->mc_slab_log = logfn;
334 cp->mc_slab_notify = notifyfn;
335 cp->mc_private = need_zone ? cp : arg;
336 cp->mc_bufsize = bufsize;
337 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
338
339 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
340
341 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
342 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);
343
344 /*
345 * Allocation chunk size is the object's size plus any extra size
346 * needed to satisfy the object's alignment. It is enforced to be
347 * at least the size of an LP64 pointer to simplify auditing and to
348 * handle multiple-element allocation requests, where the elements
349 * returned are linked together in a list.
350 */
351 chunksize = MAX(bufsize, sizeof(u_int64_t));
352 if (need_zone) {
353 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
354 chunksize += sizeof(uint64_t) + align;
355 chunksize = P2ROUNDUP(chunksize, align);
356 cp->mc_slab_zone = zone_create(cp->mc_name, chunksize,
357 ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
358 }
359 cp->mc_chunksize = chunksize;
360
361 /*
362 * Initialize the bucket layer.
363 */
364 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
365 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
366 LCK_GRP_ATTR_NULL);
367 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);
368
369 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
370 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
371 LCK_GRP_ATTR_NULL);
372 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);
373
374 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
375 continue;
376 }
377
378 cp->cache_bkttype = btp;
379
380 /*
381 * Initialize the CPU layer. Each per-CPU structure is aligned
382 * on the CPU cache line boundary to prevent false sharing.
383 */
384 for (c = 0; c < ncpu; c++) {
385 mcache_cpu_t *ccp = &cp->mc_cpu[c];
386
387 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
388 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
389 ccp->cc_objs = -1;
390 ccp->cc_pobjs = -1;
391 }
392
393 if (mcache_ready) {
394 mcache_cache_bkt_enable(cp);
395 }
396
397 /* TODO: dynamically create sysctl for stats */
398
399 MCACHE_LIST_LOCK();
400 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
401 MCACHE_LIST_UNLOCK();
402
403 /*
404 * If cache buckets are enabled and this is the first cache
405 * created, start the periodic cache update.
406 */
407 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
408 mcache_updating = 1;
409 mcache_update_timeout(NULL);
410 }
411 if (cp->mc_flags & MCF_DEBUG) {
412 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
413 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
414 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
415 }
416 return cp;
417 }
418
419 /*
420 * Allocate one or more objects from a cache.
421 */
422 __private_extern__ unsigned int
mcache_alloc_ext(mcache_t * cp,mcache_obj_t ** list,unsigned int num,int wait)423 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
424 {
425 mcache_cpu_t *ccp;
426 mcache_obj_t **top = &(*list);
427 mcache_bkt_t *bkt;
428 unsigned int need = num;
429 boolean_t nwretry = FALSE;
430
431 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
432 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
433
434 ASSERT(list != NULL);
435 *list = NULL;
436
437 if (num == 0) {
438 return 0;
439 }
440
441 retry_alloc:
442 /* We may not always be running in the same CPU in case of retries */
443 ccp = MCACHE_CPU(cp);
444
445 MCACHE_LOCK(&ccp->cc_lock);
446 for (;;) {
447 /*
448 * If we have an object in the current CPU's filled bucket,
449 * chain the object to any previous objects and return if
450 * we've satisfied the number of requested objects.
451 */
452 if (ccp->cc_objs > 0) {
453 mcache_obj_t *tail;
454 int objs;
455
456 /*
457 * Objects in the bucket are already linked together
458 * with the most recently freed object at the head of
459 * the list; grab as many objects as we can.
460 */
461 objs = MIN((unsigned int)ccp->cc_objs, need);
462 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
463 ccp->cc_objs -= objs;
464 ccp->cc_alloc += objs;
465
466 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
467 list = &tail->obj_next;
468 *list = NULL;
469
470 /* If we got them all, return to caller */
471 if ((need -= objs) == 0) {
472 MCACHE_UNLOCK(&ccp->cc_lock);
473
474 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
475 cp->mc_slab_log != NULL) {
476 (*cp->mc_slab_log)(num, *top, TRUE);
477 }
478
479 if (cp->mc_flags & MCF_DEBUG) {
480 goto debug_alloc;
481 }
482
483 return num;
484 }
485 }
486
487 /*
488 * The CPU's filled bucket is empty. If the previous filled
489 * bucket was full, exchange and try again.
490 */
491 if (ccp->cc_pobjs > 0) {
492 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
493 continue;
494 }
495
496 /*
497 * If the bucket layer is disabled, allocate from slab. This
498 * can happen either because MCF_NOCPUCACHE is set, or because
499 * the bucket layer is currently being resized.
500 */
501 if (ccp->cc_bktsize == 0) {
502 break;
503 }
504
505 /*
506 * Both of the CPU's buckets are empty; try to get full
507 * bucket(s) from the bucket layer. Upon success, refill
508 * this CPU and place any empty bucket into the empty list.
509 * To prevent potential thrashing, replace both empty buckets
510 * only if the requested count exceeds a bucket's worth of
511 * objects.
512 */
513 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full,
514 &bkt, (need <= ccp->cc_bktsize) ? 1 : 2);
515 if (bkt != NULL) {
516 mcache_bkt_t *bkt_list = NULL;
517
518 if (ccp->cc_pfilled != NULL) {
519 ccp->cc_pfilled->bkt_next = bkt_list;
520 bkt_list = ccp->cc_pfilled;
521 }
522 if (bkt->bkt_next == NULL) {
523 /*
524 * Bucket layer allocation returns only 1
525 * magazine; retain current empty magazine.
526 */
527 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
528 } else {
529 /*
530 * We got 2 full buckets from the bucket
531 * layer; release the current empty bucket
532 * back to the bucket layer.
533 */
534 if (ccp->cc_filled != NULL) {
535 ccp->cc_filled->bkt_next = bkt_list;
536 bkt_list = ccp->cc_filled;
537 }
538 mcache_cpu_batch_refill(ccp, bkt,
539 ccp->cc_bktsize);
540 }
541 mcache_bkt_batch_free(cp, &cp->mc_empty, bkt_list);
542 continue;
543 }
544
545 /*
546 * The bucket layer has no full buckets; allocate the
547 * object(s) directly from the slab layer.
548 */
549 break;
550 }
551 MCACHE_UNLOCK(&ccp->cc_lock);
552
553 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
554
555 /*
556 * If this is a blocking allocation, or if it is non-blocking and
557 * the cache's full bucket is non-empty, then retry the allocation.
558 */
559 if (need > 0) {
560 if (!(wait & MCR_NONBLOCKING)) {
561 os_atomic_inc(&cp->mc_wretry_cnt, relaxed);
562 goto retry_alloc;
563 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
564 !mcache_bkt_isempty(cp)) {
565 if (!nwretry) {
566 nwretry = TRUE;
567 }
568 os_atomic_inc(&cp->mc_nwretry_cnt, relaxed);
569 goto retry_alloc;
570 } else if (nwretry) {
571 os_atomic_inc(&cp->mc_nwfail_cnt, relaxed);
572 }
573 }
574
575 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
576 (*cp->mc_slab_log)((num - need), *top, TRUE);
577 }
578
579 if (!(cp->mc_flags & MCF_DEBUG)) {
580 return num - need;
581 }
582
583 debug_alloc:
584 if (cp->mc_flags & MCF_DEBUG) {
585 mcache_obj_t **o = top;
586 unsigned int n;
587
588 n = 0;
589 /*
590 * Verify that the chain of objects have the same count as
591 * what we are about to report to the caller. Any mismatch
592 * here means that the object list is insanely broken and
593 * therefore we must panic.
594 */
595 while (*o != NULL) {
596 o = &(*o)->obj_next;
597 ++n;
598 }
599 if (n != (num - need)) {
600 panic("mcache_alloc_ext: %s cp %p corrupted list "
601 "(got %d actual %d)\n", cp->mc_name,
602 (void *)cp, num - need, n);
603 /* NOTREACHED */
604 __builtin_unreachable();
605 }
606 }
607
608 /* Invoke the slab layer audit callback if auditing is enabled */
609 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
610 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
611 }
612
613 return num - need;
614 }
615
616 /*
617 * Allocate a single object from a cache.
618 */
619 __private_extern__ void *
mcache_alloc(mcache_t * cp,int wait)620 mcache_alloc(mcache_t *cp, int wait)
621 {
622 mcache_obj_t *buf;
623
624 (void) mcache_alloc_ext(cp, &buf, 1, wait);
625 return buf;
626 }
627
628 __private_extern__ void
mcache_waiter_inc(mcache_t * cp)629 mcache_waiter_inc(mcache_t *cp)
630 {
631 os_atomic_inc(&cp->mc_waiter_cnt, relaxed);
632 }
633
634 __private_extern__ void
mcache_waiter_dec(mcache_t * cp)635 mcache_waiter_dec(mcache_t *cp)
636 {
637 os_atomic_dec(&cp->mc_waiter_cnt, relaxed);
638 }
639
640 __private_extern__ boolean_t
mcache_bkt_isempty(mcache_t * cp)641 mcache_bkt_isempty(mcache_t *cp)
642 {
643 /*
644 * This isn't meant to accurately tell whether there are
645 * any full buckets in the cache; it is simply a way to
646 * obtain "hints" about the state of the cache.
647 */
648 return cp->mc_full.bl_total == 0;
649 }
650
651 /*
652 * Notify the slab layer about an event.
653 */
654 static void
mcache_notify(mcache_t * cp,u_int32_t event)655 mcache_notify(mcache_t *cp, u_int32_t event)
656 {
657 if (cp->mc_slab_notify != NULL) {
658 (*cp->mc_slab_notify)(cp->mc_private, event);
659 }
660 }
661
662 /*
663 * Purge the cache and disable its buckets.
664 */
665 static void
mcache_purge(void * arg)666 mcache_purge(void *arg)
667 {
668 mcache_t *cp = arg;
669
670 mcache_bkt_purge(cp);
671 /*
672 * We cannot simply call mcache_cache_bkt_enable() from here as
673 * a bucket resize may be in flight and we would cause the CPU
674 * layers of the cache to point to different sizes. Therefore,
675 * we simply increment the enable count so that during the next
676 * periodic cache update the buckets can be reenabled.
677 */
678 lck_mtx_lock_spin(&cp->mc_sync_lock);
679 cp->mc_enable_cnt++;
680 lck_mtx_unlock(&cp->mc_sync_lock);
681 }
682
683 __private_extern__ boolean_t
mcache_purge_cache(mcache_t * cp,boolean_t async)684 mcache_purge_cache(mcache_t *cp, boolean_t async)
685 {
686 /*
687 * Purging a cache that has no per-CPU caches or is already
688 * in the process of being purged is rather pointless.
689 */
690 if (cp->mc_flags & MCF_NOCPUCACHE) {
691 return FALSE;
692 }
693
694 lck_mtx_lock_spin(&cp->mc_sync_lock);
695 if (cp->mc_purge_cnt > 0) {
696 lck_mtx_unlock(&cp->mc_sync_lock);
697 return FALSE;
698 }
699 cp->mc_purge_cnt++;
700 lck_mtx_unlock(&cp->mc_sync_lock);
701
702 if (async) {
703 mcache_dispatch(mcache_purge, cp);
704 } else {
705 mcache_purge(cp);
706 }
707
708 return TRUE;
709 }
710
711 /*
712 * Free a single object to a cache.
713 */
714 __private_extern__ void
mcache_free(mcache_t * cp,void * buf)715 mcache_free(mcache_t *cp, void *buf)
716 {
717 ((mcache_obj_t *)buf)->obj_next = NULL;
718 mcache_free_ext(cp, (mcache_obj_t *)buf);
719 }
720
721 /*
722 * Free one or more objects to a cache.
723 */
724 __private_extern__ void
mcache_free_ext(mcache_t * cp,mcache_obj_t * list)725 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
726 {
727 mcache_cpu_t *ccp = MCACHE_CPU(cp);
728 mcache_bkttype_t *btp;
729 mcache_obj_t *nlist;
730 mcache_bkt_t *bkt;
731
732 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
733 (*cp->mc_slab_log)(0, list, FALSE);
734 }
735
736 /* Invoke the slab layer audit callback if auditing is enabled */
737 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
738 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
739 }
740
741 MCACHE_LOCK(&ccp->cc_lock);
742 for (;;) {
743 /*
744 * If there is space in the current CPU's filled bucket, put
745 * the object there and return once all objects are freed.
746 * Note the cast to unsigned integer takes care of the case
747 * where the bucket layer is disabled (when cc_objs is -1).
748 */
749 if ((unsigned int)ccp->cc_objs <
750 (unsigned int)ccp->cc_bktsize) {
751 /*
752 * Reverse the list while we place the object into the
753 * bucket; this effectively causes the most recently
754 * freed object(s) to be reused during allocation.
755 */
756 nlist = list->obj_next;
757 list->obj_next = (ccp->cc_objs == 0) ? NULL :
758 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
759 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
760 ccp->cc_free++;
761
762 if ((list = nlist) != NULL) {
763 continue;
764 }
765
766 /* We are done; return to caller */
767 MCACHE_UNLOCK(&ccp->cc_lock);
768
769 /* If there is a waiter below, notify it */
770 if (cp->mc_waiter_cnt > 0) {
771 mcache_notify(cp, MCN_RETRYALLOC);
772 }
773 return;
774 }
775
776 /*
777 * The CPU's filled bucket is full. If the previous filled
778 * bucket was empty, exchange and try again.
779 */
780 if (ccp->cc_pobjs == 0) {
781 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
782 continue;
783 }
784
785 /*
786 * If the bucket layer is disabled, free to slab. This can
787 * happen either because MCF_NOCPUCACHE is set, or because
788 * the bucket layer is currently being resized.
789 */
790 if (ccp->cc_bktsize == 0) {
791 break;
792 }
793
794 /*
795 * Both of the CPU's buckets are full; try to get empty
796 * buckets from the bucket layer. Upon success, empty this
797 * CPU and place any full bucket into the full list.
798 *
799 * TODO: Because the caller currently doesn't indicate
800 * the number of objects in the list, we choose the more
801 * conservative approach of allocating only 1 empty
802 * bucket (to prevent potential thrashing). Once we
803 * have the object count, we can replace 1 with similar
804 * logic as used in mcache_alloc_ext().
805 */
806 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, 1);
807 if (bkt != NULL) {
808 mcache_bkt_t *bkt_list = NULL;
809
810 if (ccp->cc_pfilled != NULL) {
811 ccp->cc_pfilled->bkt_next = bkt_list;
812 bkt_list = ccp->cc_pfilled;
813 }
814 if (bkt->bkt_next == NULL) {
815 /*
816 * Bucket layer allocation returns only 1
817 * bucket; retain current full bucket.
818 */
819 mcache_cpu_refill(ccp, bkt, 0);
820 } else {
821 /*
822 * We got 2 empty buckets from the bucket
823 * layer; release the current full bucket
824 * back to the bucket layer.
825 */
826 if (ccp->cc_filled != NULL) {
827 ccp->cc_filled->bkt_next = bkt_list;
828 bkt_list = ccp->cc_filled;
829 }
830 mcache_cpu_batch_refill(ccp, bkt, 0);
831 }
832 mcache_bkt_batch_free(cp, &cp->mc_full, bkt_list);
833 continue;
834 }
835 btp = cp->cache_bkttype;
836
837 /*
838 * We need an empty bucket to put our freed objects into
839 * but couldn't get an empty bucket from the bucket layer;
840 * attempt to allocate one. We do not want to block for
841 * allocation here, and if the bucket allocation fails
842 * we will simply fall through to the slab layer.
843 */
844 MCACHE_UNLOCK(&ccp->cc_lock);
845 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
846 MCACHE_LOCK(&ccp->cc_lock);
847
848 if (bkt != NULL) {
849 /*
850 * We have an empty bucket, but since we drop the
851 * CPU lock above, the cache's bucket size may have
852 * changed. If so, free the bucket and try again.
853 */
854 if (ccp->cc_bktsize != btp->bt_bktsize) {
855 MCACHE_UNLOCK(&ccp->cc_lock);
856 mcache_free(btp->bt_cache, bkt);
857 MCACHE_LOCK(&ccp->cc_lock);
858 continue;
859 }
860
861 /*
862 * Store it in the bucket object since we'll
863 * need to refer to it during bucket destroy;
864 * we can't safely refer to cache_bkttype as
865 * the bucket lock may not be acquired then.
866 */
867 bkt->bkt_type = btp;
868
869 /*
870 * We have an empty bucket of the right size;
871 * add it to the bucket layer and try again.
872 */
873 ASSERT(bkt->bkt_next == NULL);
874 mcache_bkt_batch_free(cp, &cp->mc_empty, bkt);
875 continue;
876 }
877
878 /*
879 * The bucket layer has no empty buckets; free the
880 * object(s) directly to the slab layer.
881 */
882 break;
883 }
884 MCACHE_UNLOCK(&ccp->cc_lock);
885
886 /* If there is a waiter below, notify it */
887 if (cp->mc_waiter_cnt > 0) {
888 mcache_notify(cp, MCN_RETRYALLOC);
889 }
890
891 /* Advise the slab layer to purge the object(s) */
892 (*cp->mc_slab_free)(cp->mc_private, list,
893 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
894 }
895
896 /*
897 * Cache destruction routine.
898 */
899 __private_extern__ void
mcache_destroy(mcache_t * cp)900 mcache_destroy(mcache_t *cp)
901 {
902 void **pbuf;
903
904 MCACHE_LIST_LOCK();
905 LIST_REMOVE(cp, mc_list);
906 MCACHE_LIST_UNLOCK();
907
908 mcache_bkt_purge(cp);
909
910 /*
911 * This cache is dead; there should be no further transaction.
912 * If it's still invoked, make sure that it induces a fault.
913 */
914 cp->mc_slab_alloc = NULL;
915 cp->mc_slab_free = NULL;
916 cp->mc_slab_audit = NULL;
917
918 lck_grp_free(cp->mc_bkt_lock_grp);
919 lck_grp_free(cp->mc_cpu_lock_grp);
920 lck_grp_free(cp->mc_sync_lock_grp);
921
922 /*
923 * TODO: We need to destroy the zone here, but cannot do it
924 * because there is no such way to achieve that. Until then
925 * the memory allocated for the zone structure is leaked.
926 * Once it is achievable, uncomment these lines:
927 *
928 * if (cp->mc_slab_zone != NULL) {
929 * zdestroy(cp->mc_slab_zone);
930 * cp->mc_slab_zone = NULL;
931 * }
932 */
933
934 /* Get the original address since we're about to free it */
935 pbuf = (void **)((intptr_t)cp - sizeof(void *));
936
937 zfree(mcache_zone, *pbuf);
938 }
939
940 /*
941 * Internal slab allocator used as a backend for simple caches. The current
942 * implementation uses the zone allocator for simplicity reasons.
943 */
944 static unsigned int
mcache_slab_alloc(void * arg,mcache_obj_t *** plist,unsigned int num,int wait)945 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
946 int wait)
947 {
948 #pragma unused(wait)
949 mcache_t *cp = arg;
950 unsigned int need = num;
951 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
952 u_int32_t flags = cp->mc_flags;
953 void *buf, *base, **pbuf;
954 mcache_obj_t **list = *plist;
955
956 *list = NULL;
957
958 for (;;) {
959 buf = zalloc_flags(cp->mc_slab_zone, Z_WAITOK | Z_NOFAIL);
960
961 /* Get the aligned base address for this object */
962 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
963 cp->mc_align);
964
965 /*
966 * Wind back a pointer size from the aligned base and
967 * save the original address so we can free it later.
968 */
969 pbuf = (void **)((intptr_t)base - sizeof(void *));
970 *pbuf = buf;
971
972 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
973 ((intptr_t)buf + cp->mc_chunksize));
974
975 /*
976 * If auditing is enabled, patternize the contents of
977 * the buffer starting from the 64-bit aligned base to
978 * the end of the buffer; the length is rounded up to
979 * the nearest 64-bit multiply; this is because we use
980 * 64-bit memory access to set/check the pattern.
981 */
982 if (flags & MCF_DEBUG) {
983 VERIFY(((intptr_t)base + rsize) <=
984 ((intptr_t)buf + cp->mc_chunksize));
985 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
986 }
987
988 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
989 *list = (mcache_obj_t *)base;
990
991 (*list)->obj_next = NULL;
992 list = *plist = &(*list)->obj_next;
993
994 /* If we got them all, return to mcache */
995 if (--need == 0) {
996 break;
997 }
998 }
999
1000 return num - need;
1001 }
1002
1003 /*
1004 * Internal slab deallocator used as a backend for simple caches.
1005 */
1006 static void
mcache_slab_free(void * arg,mcache_obj_t * list,__unused boolean_t purged)1007 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1008 {
1009 mcache_t *cp = arg;
1010 mcache_obj_t *nlist;
1011 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1012 u_int32_t flags = cp->mc_flags;
1013 void *base;
1014 void **pbuf;
1015
1016 for (;;) {
1017 nlist = list->obj_next;
1018 list->obj_next = NULL;
1019
1020 base = list;
1021 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1022
1023 /* Get the original address since we're about to free it */
1024 pbuf = (void **)((intptr_t)base - sizeof(void *));
1025
1026 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1027 ((intptr_t)*pbuf + cp->mc_chunksize));
1028
1029 if (flags & MCF_DEBUG) {
1030 VERIFY(((intptr_t)base + rsize) <=
1031 ((intptr_t)*pbuf + cp->mc_chunksize));
1032 mcache_audit_free_verify(NULL, base, 0, rsize);
1033 }
1034
1035 /* Free it to zone */
1036 zfree(cp->mc_slab_zone, *pbuf);
1037
1038 /* No more objects to free; return to mcache */
1039 if ((list = nlist) == NULL) {
1040 break;
1041 }
1042 }
1043 }
1044
1045 /*
1046 * Internal slab auditor for simple caches.
1047 */
1048 static void
mcache_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)1049 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1050 {
1051 mcache_t *cp = arg;
1052 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1053 void *base, **pbuf;
1054
1055 while (list != NULL) {
1056 mcache_obj_t *next = list->obj_next;
1057
1058 base = list;
1059 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1060
1061 /* Get the original address */
1062 pbuf = (void **)((intptr_t)base - sizeof(void *));
1063
1064 VERIFY(((intptr_t)base + rsize) <=
1065 ((intptr_t)*pbuf + cp->mc_chunksize));
1066
1067 if (!alloc) {
1068 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1069 } else {
1070 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1071 }
1072
1073 list = list->obj_next = next;
1074 }
1075 }
1076
1077 /*
1078 * Refill the CPU's buckets with bkt and its follower (if any).
1079 */
1080 static void
mcache_cpu_batch_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1081 mcache_cpu_batch_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1082 {
1083 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1084 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1085 ASSERT(ccp->cc_bktsize > 0);
1086
1087 ccp->cc_filled = bkt;
1088 ccp->cc_objs = objs;
1089 if (__probable(bkt->bkt_next != NULL)) {
1090 ccp->cc_pfilled = bkt->bkt_next;
1091 ccp->cc_pobjs = objs;
1092 bkt->bkt_next = NULL;
1093 } else {
1094 ASSERT(bkt->bkt_next == NULL);
1095 ccp->cc_pfilled = NULL;
1096 ccp->cc_pobjs = -1;
1097 }
1098 }
1099
1100 /*
1101 * Refill the CPU's filled bucket with bkt and save the previous one.
1102 */
1103 static void
mcache_cpu_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1104 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1105 {
1106 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1107 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1108 ASSERT(ccp->cc_bktsize > 0);
1109
1110 ccp->cc_pfilled = ccp->cc_filled;
1111 ccp->cc_pobjs = ccp->cc_objs;
1112 ccp->cc_filled = bkt;
1113 ccp->cc_objs = objs;
1114 }
1115
1116 /*
1117 * Get one or more buckets from the bucket layer.
1118 */
1119 static uint32_t
mcache_bkt_batch_alloc(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t ** list,uint32_t num)1120 mcache_bkt_batch_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t **list,
1121 uint32_t num)
1122 {
1123 mcache_bkt_t *bkt_list = NULL;
1124 mcache_bkt_t *bkt;
1125 uint32_t need = num;
1126
1127 ASSERT(list != NULL && need > 0);
1128
1129 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1130 /*
1131 * The bucket layer lock is held by another CPU; increase
1132 * the contention count so that we can later resize the
1133 * bucket size accordingly.
1134 */
1135 MCACHE_LOCK(&cp->mc_bkt_lock);
1136 cp->mc_bkt_contention++;
1137 }
1138
1139 while ((bkt = blp->bl_list) != NULL) {
1140 blp->bl_list = bkt->bkt_next;
1141 bkt->bkt_next = bkt_list;
1142 bkt_list = bkt;
1143 if (--blp->bl_total < blp->bl_min) {
1144 blp->bl_min = blp->bl_total;
1145 }
1146 blp->bl_alloc++;
1147 if (--need == 0) {
1148 break;
1149 }
1150 }
1151
1152 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1153
1154 *list = bkt_list;
1155
1156 return num - need;
1157 }
1158
1159 /*
1160 * Return one or more buckets to the bucket layer.
1161 */
1162 static void
mcache_bkt_batch_free(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t * bkt)1163 mcache_bkt_batch_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1164 {
1165 mcache_bkt_t *nbkt;
1166
1167 MCACHE_LOCK(&cp->mc_bkt_lock);
1168 while (bkt != NULL) {
1169 nbkt = bkt->bkt_next;
1170 bkt->bkt_next = blp->bl_list;
1171 blp->bl_list = bkt;
1172 blp->bl_total++;
1173 bkt = nbkt;
1174 }
1175 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1176 }
1177
1178 /*
1179 * Enable the bucket layer of a cache.
1180 */
1181 static void
mcache_cache_bkt_enable(mcache_t * cp)1182 mcache_cache_bkt_enable(mcache_t *cp)
1183 {
1184 mcache_cpu_t *ccp;
1185 unsigned int cpu;
1186
1187 if (cp->mc_flags & MCF_NOCPUCACHE) {
1188 return;
1189 }
1190
1191 for (cpu = 0; cpu < ncpu; cpu++) {
1192 ccp = &cp->mc_cpu[cpu];
1193 MCACHE_LOCK(&ccp->cc_lock);
1194 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1195 MCACHE_UNLOCK(&ccp->cc_lock);
1196 }
1197 }
1198
1199 /*
1200 * Purge all buckets from a cache and disable its bucket layer.
1201 */
1202 static void
mcache_bkt_purge(mcache_t * cp)1203 mcache_bkt_purge(mcache_t *cp)
1204 {
1205 mcache_cpu_t *ccp;
1206 mcache_bkt_t *bp, *pbp;
1207 int objs, pobjs;
1208 unsigned int cpu;
1209
1210 for (cpu = 0; cpu < ncpu; cpu++) {
1211 ccp = &cp->mc_cpu[cpu];
1212
1213 MCACHE_LOCK(&ccp->cc_lock);
1214
1215 bp = ccp->cc_filled;
1216 pbp = ccp->cc_pfilled;
1217 objs = ccp->cc_objs;
1218 pobjs = ccp->cc_pobjs;
1219 ccp->cc_filled = NULL;
1220 ccp->cc_pfilled = NULL;
1221 ccp->cc_objs = -1;
1222 ccp->cc_pobjs = -1;
1223 ccp->cc_bktsize = 0;
1224
1225 MCACHE_UNLOCK(&ccp->cc_lock);
1226
1227 if (bp != NULL) {
1228 mcache_bkt_destroy(cp, bp, objs);
1229 }
1230 if (pbp != NULL) {
1231 mcache_bkt_destroy(cp, pbp, pobjs);
1232 }
1233 }
1234
1235 mcache_bkt_ws_zero(cp);
1236 mcache_bkt_ws_reap(cp);
1237 }
1238
1239 /*
1240 * Free one or more objects in the bucket to the slab layer,
1241 * and also free the bucket itself.
1242 */
1243 static void
mcache_bkt_destroy(mcache_t * cp,mcache_bkt_t * bkt,int nobjs)1244 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1245 {
1246 if (nobjs > 0) {
1247 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1248
1249 if (cp->mc_flags & MCF_DEBUG) {
1250 mcache_obj_t *o = top;
1251 int cnt = 0;
1252
1253 /*
1254 * Verify that the chain of objects in the bucket is
1255 * valid. Any mismatch here means a mistake when the
1256 * object(s) were freed to the CPU layer, so we panic.
1257 */
1258 while (o != NULL) {
1259 o = o->obj_next;
1260 ++cnt;
1261 }
1262 if (cnt != nobjs) {
1263 panic("mcache_bkt_destroy: %s cp %p corrupted "
1264 "list in bkt %p (nobjs %d actual %d)\n",
1265 cp->mc_name, (void *)cp, (void *)bkt,
1266 nobjs, cnt);
1267 /* NOTREACHED */
1268 __builtin_unreachable();
1269 }
1270 }
1271
1272 /* Advise the slab layer to purge the object(s) */
1273 (*cp->mc_slab_free)(cp->mc_private, top,
1274 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1275 }
1276 mcache_free(bkt->bkt_type->bt_cache, bkt);
1277 }
1278
1279 /*
1280 * Update the bucket layer working set statistics.
1281 */
1282 static void
mcache_bkt_ws_update(mcache_t * cp)1283 mcache_bkt_ws_update(mcache_t *cp)
1284 {
1285 MCACHE_LOCK(&cp->mc_bkt_lock);
1286
1287 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1288 cp->mc_full.bl_min = cp->mc_full.bl_total;
1289 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1290 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1291
1292 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1293 }
1294
1295 /*
1296 * Mark everything as eligible for reaping (working set is zero).
1297 */
1298 static void
mcache_bkt_ws_zero(mcache_t * cp)1299 mcache_bkt_ws_zero(mcache_t *cp)
1300 {
1301 MCACHE_LOCK(&cp->mc_bkt_lock);
1302
1303 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1304 cp->mc_full.bl_min = cp->mc_full.bl_total;
1305 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1306 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1307
1308 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1309 }
1310
1311 /*
1312 * Reap all buckets that are beyond the working set.
1313 */
1314 static void
mcache_bkt_ws_reap(mcache_t * cp)1315 mcache_bkt_ws_reap(mcache_t *cp)
1316 {
1317 mcache_bkt_t *bkt, *nbkt;
1318 uint32_t reap;
1319
1320 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1321 if (reap != 0) {
1322 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full, &bkt, reap);
1323 while (bkt != NULL) {
1324 nbkt = bkt->bkt_next;
1325 bkt->bkt_next = NULL;
1326 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1327 bkt = nbkt;
1328 }
1329 }
1330
1331 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1332 if (reap != 0) {
1333 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, reap);
1334 while (bkt != NULL) {
1335 nbkt = bkt->bkt_next;
1336 bkt->bkt_next = NULL;
1337 mcache_bkt_destroy(cp, bkt, 0);
1338 bkt = nbkt;
1339 }
1340 }
1341 }
1342
1343 static void
mcache_reap_timeout(thread_call_param_t dummy __unused,thread_call_param_t arg)1344 mcache_reap_timeout(thread_call_param_t dummy __unused,
1345 thread_call_param_t arg)
1346 {
1347 volatile UInt32 *flag = arg;
1348
1349 ASSERT(flag == &mcache_reaping);
1350
1351 *flag = 0;
1352 }
1353
1354 static void
mcache_reap_done(void * flag)1355 mcache_reap_done(void *flag)
1356 {
1357 uint64_t deadline, leeway;
1358
1359 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1360 &deadline);
1361 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1362 NSEC_PER_SEC, &leeway);
1363 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1364 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1365 }
1366
1367 static void
mcache_reap_start(void * arg)1368 mcache_reap_start(void *arg)
1369 {
1370 UInt32 *flag = arg;
1371
1372 ASSERT(flag == &mcache_reaping);
1373
1374 mcache_applyall(mcache_cache_reap);
1375 mcache_dispatch(mcache_reap_done, flag);
1376 }
1377
1378 __private_extern__ void
mcache_reap(void)1379 mcache_reap(void)
1380 {
1381 UInt32 *flag = &mcache_reaping;
1382
1383 if (mcache_llock_owner == current_thread() ||
1384 !OSCompareAndSwap(0, 1, flag)) {
1385 return;
1386 }
1387
1388 mcache_dispatch(mcache_reap_start, flag);
1389 }
1390
1391 __private_extern__ void
mcache_reap_now(mcache_t * cp,boolean_t purge)1392 mcache_reap_now(mcache_t *cp, boolean_t purge)
1393 {
1394 if (purge) {
1395 mcache_bkt_purge(cp);
1396 mcache_cache_bkt_enable(cp);
1397 } else {
1398 mcache_bkt_ws_zero(cp);
1399 mcache_bkt_ws_reap(cp);
1400 }
1401 }
1402
1403 static void
mcache_cache_reap(mcache_t * cp)1404 mcache_cache_reap(mcache_t *cp)
1405 {
1406 mcache_bkt_ws_reap(cp);
1407 }
1408
1409 /*
 * Performs periodic maintenance on a cache.
1411 */
1412 static void
mcache_cache_update(mcache_t * cp)1413 mcache_cache_update(mcache_t *cp)
1414 {
1415 int need_bkt_resize = 0;
1416 int need_bkt_reenable = 0;
1417
1418 lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);
1419
1420 mcache_bkt_ws_update(cp);
1421
1422 /*
1423 * Cache resize and post-purge reenable are mutually exclusive.
1424 * If the cache was previously purged, there is no point of
1425 * increasing the bucket size as there was an indication of
1426 * memory pressure on the system.
1427 */
1428 lck_mtx_lock_spin(&cp->mc_sync_lock);
1429 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1430 need_bkt_reenable = 1;
1431 }
1432 lck_mtx_unlock(&cp->mc_sync_lock);
1433
1434 MCACHE_LOCK(&cp->mc_bkt_lock);
1435 /*
1436 * If the contention count is greater than the threshold, and if
1437 * we are not already at the maximum bucket size, increase it.
1438 * Otherwise, if this cache was previously purged by the user
1439 * then we simply reenable it.
1440 */
1441 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1442 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1443 mcache_bkt_contention && !need_bkt_reenable) {
1444 need_bkt_resize = 1;
1445 }
1446
1447 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1448 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1449
1450 if (need_bkt_resize) {
1451 mcache_dispatch(mcache_cache_bkt_resize, cp);
1452 } else if (need_bkt_reenable) {
1453 mcache_dispatch(mcache_cache_enable, cp);
1454 }
1455 }
1456
1457 /*
1458 * Recompute a cache's bucket size. This is an expensive operation
1459 * and should not be done frequently; larger buckets provide for a
1460 * higher transfer rate with the bucket while smaller buckets reduce
1461 * the memory consumption.
1462 */
1463 static void
mcache_cache_bkt_resize(void * arg)1464 mcache_cache_bkt_resize(void *arg)
1465 {
1466 mcache_t *cp = arg;
1467 mcache_bkttype_t *btp = cp->cache_bkttype;
1468
1469 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1470 mcache_bkt_purge(cp);
1471
1472 /*
1473 * Upgrade to the next bucket type with larger bucket size;
1474 * temporarily set the previous contention snapshot to a
1475 * negative number to prevent unnecessary resize request.
1476 */
1477 MCACHE_LOCK(&cp->mc_bkt_lock);
1478 cp->cache_bkttype = ++btp;
1479 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1480 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1481
1482 mcache_cache_enable(cp);
1483 }
1484 }
1485
1486 /*
1487 * Reenable a previously disabled cache due to purge.
1488 */
1489 static void
mcache_cache_enable(void * arg)1490 mcache_cache_enable(void *arg)
1491 {
1492 mcache_t *cp = arg;
1493
1494 lck_mtx_lock_spin(&cp->mc_sync_lock);
1495 cp->mc_purge_cnt = 0;
1496 cp->mc_enable_cnt = 0;
1497 lck_mtx_unlock(&cp->mc_sync_lock);
1498
1499 mcache_cache_bkt_enable(cp);
1500 }
1501
1502 static void
mcache_update_timeout(__unused void * arg)1503 mcache_update_timeout(__unused void *arg)
1504 {
1505 uint64_t deadline, leeway;
1506
1507 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1508 &deadline);
1509 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1510 NSEC_PER_SEC, &leeway);
1511 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1512 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1513 }
1514
1515 static void
mcache_update(thread_call_param_t arg __unused,thread_call_param_t dummy __unused)1516 mcache_update(thread_call_param_t arg __unused,
1517 thread_call_param_t dummy __unused)
1518 {
1519 mcache_applyall(mcache_cache_update);
1520 mcache_update_timeout(NULL);
1521 }
1522
1523 static void
mcache_applyall(void (* func)(mcache_t *))1524 mcache_applyall(void (*func)(mcache_t *))
1525 {
1526 mcache_t *cp;
1527
1528 MCACHE_LIST_LOCK();
1529 LIST_FOREACH(cp, &mcache_head, mc_list) {
1530 func(cp);
1531 }
1532 MCACHE_LIST_UNLOCK();
1533 }
1534
1535 static void
mcache_dispatch(void (* func)(void *),void * arg)1536 mcache_dispatch(void (*func)(void *), void *arg)
1537 {
1538 ASSERT(func != NULL);
1539 timeout(func, arg, hz / 1000);
1540 }
1541
1542 __private_extern__ void
mcache_buffer_log(mcache_audit_t * mca,void * addr,mcache_t * cp,struct timeval * base_ts)1543 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1544 struct timeval *base_ts)
1545 {
1546 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1547 void *stack[MCACHE_STACK_DEPTH + 1];
1548 struct mca_trn *transaction;
1549
1550 transaction = &mca->mca_trns[mca->mca_next_trn];
1551
1552 mca->mca_addr = addr;
1553 mca->mca_cache = cp;
1554
1555 transaction->mca_thread = current_thread();
1556
1557 bzero(stack, sizeof(stack));
1558 transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1559 bcopy(&stack[1], transaction->mca_stack,
1560 sizeof(transaction->mca_stack));
1561
1562 microuptime(&now);
1563 if (base_ts != NULL) {
1564 base = *base_ts;
1565 }
1566 /* tstamp is in ms relative to base_ts */
1567 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1568 if ((now.tv_sec - base.tv_sec) > 0) {
1569 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1570 }
1571
1572 mca->mca_next_trn =
1573 (mca->mca_next_trn + 1) % mca_trn_max;
1574 }
1575
1576 /*
1577 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1578 * mcache_verify_set_pattern() are marked as noinline to prevent the
1579 * compiler from aliasing pointers when they are inlined inside the callers
1580 * (e.g. mcache_audit_free_verify_set()) which would be undefined behavior.
1581 */
1582 __private_extern__ OS_NOINLINE void
mcache_set_pattern(u_int64_t pattern,void * buf_arg,size_t size)1583 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1584 {
1585 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1586 u_int64_t *buf = (u_int64_t *)buf_arg;
1587
1588 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1589 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1590
1591 while (buf < buf_end) {
1592 *buf++ = pattern;
1593 }
1594 }
1595
1596 __private_extern__ OS_NOINLINE void *
mcache_verify_pattern(u_int64_t pattern,void * buf_arg,size_t size)1597 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1598 {
1599 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1600 u_int64_t *buf;
1601
1602 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1603 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1604
1605 for (buf = buf_arg; buf < buf_end; buf++) {
1606 if (*buf != pattern) {
1607 return buf;
1608 }
1609 }
1610 return NULL;
1611 }
1612
1613 OS_NOINLINE static void *
mcache_verify_set_pattern(u_int64_t old,u_int64_t new,void * buf_arg,size_t size)1614 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1615 size_t size)
1616 {
1617 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1618 u_int64_t *buf;
1619
1620 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1621 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1622
1623 for (buf = buf_arg; buf < buf_end; buf++) {
1624 if (*buf != old) {
1625 mcache_set_pattern(old, buf_arg,
1626 (uintptr_t)buf - (uintptr_t)buf_arg);
1627 return buf;
1628 }
1629 *buf = new;
1630 }
1631 return NULL;
1632 }
1633
/*
 * Verify that a free buffer still carries the free pattern, panicking
 * (via mcache_audit_panic) on any modified-after-free word.  The
 * "obj_next" linkage stored inside the buffer is legitimately not the
 * pattern, so it is saved, its word is overwritten with the pattern,
 * the whole buffer is verified, and the linkage is restored afterwards.
 */
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		/* Reports the offending offset and value; never returns */
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Put the saved linkage back in place */
	((mcache_obj_t *)addr)->obj_next = next;
}
1657
/*
 * Like mcache_audit_free_verify(), but additionally rewrites the buffer
 * with MCACHE_UNINITIALIZED_PATTERN as it verifies (used on the alloc
 * path).  The "obj_next" linkage word is saved, patternized for the
 * check, and restored once verification succeeds.
 */
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		/* Reports the offending offset and value; never returns */
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Put the saved linkage back in place */
	((mcache_obj_t *)addr)->obj_next = next;
}
1681
/* From here on, call the real panic(), not any wrapper macro */
#undef panic

/* printf format for one logged transaction: thread, depth, 16 saved PCs */
#define DUMP_TRN_FMT() \
    "%s transaction thread %p saved PC stack (%d deep):\n" \
    "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"

/* Argument list matching DUMP_TRN_FMT() for mca->mca_trns[x], labeled s */
#define DUMP_TRN_FIELDS(s, x) \
	s, \
	mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
	mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
	mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
	mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
	mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
	mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
	mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
	mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
	mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]

/*
 * Ring indices of the two transactions dumped by mcache_dump_mca():
 * the slots labeled "last" and "previous" relative to mca_next_trn.
 */
#define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
#define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1703
/*
 * Format the audit record "mca" into the caller-supplied buffer: the
 * audited address, its owning cache, the next ring index, and the two
 * transactions at MCA_TRN_LAST/MCA_TRN_PREV (thread plus saved PC
 * stack each).  Returns buf for convenient use in panic messages.
 */
__private_extern__ char *
mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
{
	snprintf(buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
	    DUMP_TRN_FMT()
	    DUMP_TRN_FMT(),

	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_next_trn,

	    DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
	    DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));

	return buf;
}
1721
/*
 * Panic with a modified-after-free report for the given address.
 * "offset" is where the bad word was found, "expected"/"got" are the
 * pattern and the corrupted value.  When an audit record is available
 * its transaction history is appended; with mca == NULL only the bare
 * report is printed.  Never returns.
 */
__attribute__((noreturn))
static void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	char buf[DUMP_MCA_BUF_SIZE];

	if (mca == NULL) {
		/* No audit record: report without the transaction dump */
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
	__builtin_unreachable();
}
1743