1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
/* Size of an mcache_t carrying n per-CPU structures (mc_cpu is trailing) */
#define MCACHE_SIZE(n) \
	__builtin_offsetof(mcache_t, mc_cpu[n])

/* Allocate extra in case we need to manually align the pointer */
#define MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

/* Per-CPU structure of cache c for the CPU this thread is running on */
#define MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define MCACHE_LIST_LOCK() { \
	lck_mtx_lock(&mcache_llock); \
	mcache_llock_owner = current_thread(); \
}

#define MCACHE_LIST_UNLOCK() { \
	mcache_llock_owner = NULL; \
	lck_mtx_unlock(&mcache_llock); \
}

/* Shorthands for the per-cache mutexes */
#define MCACHE_LOCK(l) lck_mtx_lock(l)
#define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
#define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
static unsigned int ncpu;			/* number of CPUs (set in mcache_init) */
static unsigned int cache_line_size;		/* memoized CPU cache line size */
static struct thread *mcache_llock_owner;	/* holder of mcache_llock, if any */
static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
static struct zone *mcache_zone;		/* zone backing mcache_t allocations */
static const uint32_t mcache_reap_interval = 15;	/* seconds between reaps */
static const uint32_t mcache_reap_interval_leeway = 2;
static UInt32 mcache_reaping;			/* nonzero while a reap is in flight */
static int mcache_ready;			/* set once mcache_init() completes */
static int mcache_updating;			/* periodic update timer armed */

/* Contention threshold that triggers a bucket resize */
static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

int mca_trn_max = MCA_TRN_MAX;

/*
 * Bucket types, from largest to smallest bucket size (first column,
 * bt_bktsize).  The second column (bt_minbuf) is compared against the
 * cache's chunk size to pick a type in mcache_create_common(); the
 * third is presumably the corresponding upper bound used for resizing
 * (TODO: confirm field names against mcache.h), and the last is the
 * cache supplying bucket structures of this type, filled in at init.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1, 4096, 32768, NULL },
	{ 3, 2048, 16384, NULL },
	{ 7, 1024, 12288, NULL },
	{ 15, 256, 8192, NULL },
	{ 31, 64, 4096, NULL },
	{ 47, 0, 2048, NULL },
	{ 63, 0, 1024, NULL },
	{ 95, 0, 512, NULL },
	{ 143, 0, 256, NULL },
	{ 165, 0, 0, NULL },
};
135
/* Cache creation and the default (zone-backed) slab layer */
static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
/* CPU layer refill and bucket (depot) layer management */
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static void mcache_cpu_batch_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static uint32_t mcache_bkt_batch_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkt_t **, uint32_t);
static void mcache_bkt_batch_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_zero(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
/* Periodic update/reap machinery (runs via thread calls) */
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

/* Global list of all caches, protected by MCACHE_LIST_LOCK() */
static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;	/* cache of mcache_audit_t records */

static thread_call_t mcache_reap_tcall;
static thread_call_t mcache_update_tcall;
173
174 /*
175 * Initialize the framework; this is currently called as part of BSD init.
176 */
__private_extern__ void
mcache_init(void)
{
	mcache_bkttype_t *btp;
	unsigned int i;
	char name[32];

	/* Transaction history depth; the framework requires at least 2 */
	VERIFY(mca_trn_max >= 2);

	ncpu = ml_wait_max_cpus();
	(void) mcache_cache_line_size(); /* prime it */

	/* Thread calls driving the periodic reap and update machinery */
	mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
	mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
	if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
		panic("mcache_init: thread_call_allocate failed");
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* Zone backing the mcache_t structures themselves */
	mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE,
	    ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);

	LIST_INIT(&mcache_head);

	/*
	 * Create one cache per bucket type; these caches supply the
	 * bucket structures used by every other cache's bucket layer.
	 * A bucket holds bt_bktsize object slots plus one extra pointer.
	 */
	for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
		btp = &mcache_bkttype[i];
		(void) snprintf(name, sizeof(name), "bkt_%d",
		    btp->bt_bktsize);
		btp->bt_cache = mcache_create(name,
		    (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
	}

	/* Allow boot-args to override the compiled-in debug flags */
	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
	mcache_flags &= MCF_FLAGS_MASK;

	mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
	    0, 0, MCR_SLEEP);

	/* Buckets were disabled during bootstrap; enable them everywhere */
	mcache_applyall(mcache_cache_bkt_enable);
	mcache_ready = 1;

	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	    ncpu, CPU_CACHE_LINE_SIZE);
}
222
223 /*
224 * Return the global mcache flags.
225 */
226 __private_extern__ unsigned int
mcache_getflags(void)227 mcache_getflags(void)
228 {
229 return mcache_flags;
230 }
231
232 /*
233 * Return the CPU cache line size.
234 */
235 __private_extern__ unsigned int
mcache_cache_line_size(void)236 mcache_cache_line_size(void)
237 {
238 if (cache_line_size == 0) {
239 ml_cpu_info_t cpu_info;
240 ml_cpu_get_info(&cpu_info);
241 cache_line_size = (unsigned int)cpu_info.cache_line_size;
242 }
243 return cache_line_size;
244 }
245
246 /*
247 * Create a cache using the zone allocator as the backend slab allocator.
248 * The caller may specify any alignment for the object; if it specifies 0
249 * the default alignment (MCACHE_ALIGN) will be used.
250 */
251 __private_extern__ mcache_t *
mcache_create(const char * name,size_t bufsize,size_t align,u_int32_t flags,int wait __unused)252 mcache_create(const char *name, size_t bufsize, size_t align,
253 u_int32_t flags, int wait __unused)
254 {
255 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
256 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
257 }
258
259 /*
260 * Create a cache using a custom backend slab allocator. Since the caller
261 * is responsible for allocation, no alignment guarantee will be provided
262 * by this framework.
263 */
264 __private_extern__ mcache_t *
mcache_create_ext(const char * name,size_t bufsize,mcache_allocfn_t allocfn,mcache_freefn_t freefn,mcache_auditfn_t auditfn,mcache_logfn_t logfn,mcache_notifyfn_t notifyfn,void * arg,u_int32_t flags,int wait __unused)265 mcache_create_ext(const char *name, size_t bufsize,
266 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
267 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
268 u_int32_t flags, int wait __unused)
269 {
270 return mcache_create_common(name, bufsize, 0, allocfn,
271 freefn, auditfn, logfn, notifyfn, arg, flags, 0);
272 }
273
274 /*
275 * Common cache creation routine.
276 */
/*
 * Common cache creation routine: allocates and cache-line-aligns the
 * mcache_t, records the slab-layer callbacks, optionally creates a
 * backing zone, sizes the bucket layer, initializes the per-CPU layer,
 * and links the cache onto the global list.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	unsigned int c;
	char lck_name[64];

	buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly. This is needed in order to get the rest of
	 * structure members aligned properly. It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
	/* Stash the original (unaligned) address just below cp for zfree */
	pbuf = (void **)((intptr_t)cp - sizeof(void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone) {
		align = 1;
	} else {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		if (align == 0) {
			align = MCACHE_ALIGN;
		}
		align = P2ROUNDUP(align, MCACHE_ALIGN);
	}

	/* Alignment must be a power of two */
	if ((align & (align - 1)) != 0) {
		panic("mcache_create: bad alignment %lu", align);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	/* Internal slab layer gets the cache itself as its argument */
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment. It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof(u_int64_t));
	if (need_zone) {
		VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
		/* Room for the backpointer word plus alignment slack */
		chunksize += sizeof(uint64_t) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		cp->mc_slab_zone = zone_create(cp->mc_name, chunksize,
		    ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
	}
	cp->mc_chunksize = chunksize;

	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);

	/* Pick the first bucket type whose bt_minbuf the chunk size exceeds */
	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
		continue;
	}

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer. Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
		/* -1 marks the bucket layer as disabled for this CPU */
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready) {
		mcache_cache_bkt_enable(cp);
	}

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return cp;
}
413
414 /*
415 * Allocate one or more objects from a cache.
416 */
/*
 * Allocate up to 'num' objects from cache 'cp' into '*list' (linked via
 * obj_next), trying the CPU layer, then the bucket layer, then the slab
 * layer.  Returns the number of objects actually allocated; 'wait'
 * carries the MCR_* blocking/retry semantics.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0) {
		return 0;
	}

retry_alloc:
	/* We may not always be running in the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			/* Terminate the chain after the last object taken */
			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL) {
					(*cp->mc_slab_log)(num, *top, TRUE);
				}

				if (cp->mc_flags & MCF_DEBUG) {
					goto debug_alloc;
				}

				return num;
			}
		}

		/*
		 * The CPU's filled bucket is empty. If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab. This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are empty; try to get full
		 * bucket(s) from the bucket layer. Upon success, refill
		 * this CPU and place any empty bucket into the empty list.
		 * To prevent potential thrashing, replace both empty buckets
		 * only if the requested count exceeds a bucket's worth of
		 * objects.
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_full,
		    &bkt, (need <= ccp->cc_bktsize) ? 1 : 2);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced empty buckets onto bkt_list */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * magazine; retain current empty magazine.
				 */
				mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			} else {
				/*
				 * We got 2 full buckets from the bucket
				 * layer; release the current empty bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt,
				    ccp->cc_bktsize);
			}
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt_list);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			atomic_add_32(&cp->mc_wretry_cnt, 1);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry) {
				nwretry = TRUE;
			}
			atomic_add_32(&cp->mc_nwretry_cnt, 1);
			goto retry_alloc;
		} else if (nwretry) {
			/* Non-blocking retries were attempted but failed */
			atomic_add_32(&cp->mc_nwfail_cnt, 1);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)((num - need), *top, TRUE);
	}

	if (!(cp->mc_flags & MCF_DEBUG)) {
		return num - need;
	}

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n;

		n = 0;
		/*
		 * Verify that the chain of objects have the same count as
		 * what we are about to report to the caller. Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
	}

	return num - need;
}
610
611 /*
612 * Allocate a single object from a cache.
613 */
614 __private_extern__ void *
mcache_alloc(mcache_t * cp,int wait)615 mcache_alloc(mcache_t *cp, int wait)
616 {
617 mcache_obj_t *buf;
618
619 (void) mcache_alloc_ext(cp, &buf, 1, wait);
620 return buf;
621 }
622
623 __private_extern__ void
mcache_waiter_inc(mcache_t * cp)624 mcache_waiter_inc(mcache_t *cp)
625 {
626 atomic_add_32(&cp->mc_waiter_cnt, 1);
627 }
628
629 __private_extern__ void
mcache_waiter_dec(mcache_t * cp)630 mcache_waiter_dec(mcache_t *cp)
631 {
632 atomic_add_32(&cp->mc_waiter_cnt, -1);
633 }
634
635 __private_extern__ boolean_t
mcache_bkt_isempty(mcache_t * cp)636 mcache_bkt_isempty(mcache_t *cp)
637 {
638 /*
639 * This isn't meant to accurately tell whether there are
640 * any full buckets in the cache; it is simply a way to
641 * obtain "hints" about the state of the cache.
642 */
643 return cp->mc_full.bl_total == 0;
644 }
645
646 /*
647 * Notify the slab layer about an event.
648 */
649 static void
mcache_notify(mcache_t * cp,u_int32_t event)650 mcache_notify(mcache_t *cp, u_int32_t event)
651 {
652 if (cp->mc_slab_notify != NULL) {
653 (*cp->mc_slab_notify)(cp->mc_private, event);
654 }
655 }
656
657 /*
658 * Purge the cache and disable its buckets.
659 */
660 static void
mcache_purge(void * arg)661 mcache_purge(void *arg)
662 {
663 mcache_t *cp = arg;
664
665 mcache_bkt_purge(cp);
666 /*
667 * We cannot simply call mcache_cache_bkt_enable() from here as
668 * a bucket resize may be in flight and we would cause the CPU
669 * layers of the cache to point to different sizes. Therefore,
670 * we simply increment the enable count so that during the next
671 * periodic cache update the buckets can be reenabled.
672 */
673 lck_mtx_lock_spin(&cp->mc_sync_lock);
674 cp->mc_enable_cnt++;
675 lck_mtx_unlock(&cp->mc_sync_lock);
676 }
677
678 __private_extern__ boolean_t
mcache_purge_cache(mcache_t * cp,boolean_t async)679 mcache_purge_cache(mcache_t *cp, boolean_t async)
680 {
681 /*
682 * Purging a cache that has no per-CPU caches or is already
683 * in the process of being purged is rather pointless.
684 */
685 if (cp->mc_flags & MCF_NOCPUCACHE) {
686 return FALSE;
687 }
688
689 lck_mtx_lock_spin(&cp->mc_sync_lock);
690 if (cp->mc_purge_cnt > 0) {
691 lck_mtx_unlock(&cp->mc_sync_lock);
692 return FALSE;
693 }
694 cp->mc_purge_cnt++;
695 lck_mtx_unlock(&cp->mc_sync_lock);
696
697 if (async) {
698 mcache_dispatch(mcache_purge, cp);
699 } else {
700 mcache_purge(cp);
701 }
702
703 return TRUE;
704 }
705
706 /*
707 * Free a single object to a cache.
708 */
709 __private_extern__ void
mcache_free(mcache_t * cp,void * buf)710 mcache_free(mcache_t *cp, void *buf)
711 {
712 ((mcache_obj_t *)buf)->obj_next = NULL;
713 mcache_free_ext(cp, (mcache_obj_t *)buf);
714 }
715
716 /*
717 * Free one or more objects to a cache.
718 */
/*
 * Free a list of objects (linked via obj_next) back to cache 'cp',
 * preferring the CPU layer, then the bucket layer, and finally the
 * slab layer if no bucket space can be found.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)(0, list, FALSE);
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
	}

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL) {
				continue;
			}

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0) {
				mcache_notify(cp, MCN_RETRYALLOC);
			}
			return;
		}

		/*
		 * The CPU's filled bucket is full. If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab. This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are full; try to get empty
		 * buckets from the bucket layer. Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * bucket (to prevent potential thrashing). Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in mcache_alloc_ext().
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, 1);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced full buckets onto bkt_list */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * bucket; retain current full bucket.
				 */
				mcache_cpu_refill(ccp, bkt, 0);
			} else {
				/*
				 * We got 2 empty buckets from the bucket
				 * layer; release the current full bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt, 0);
			}
			mcache_bkt_batch_free(cp, &cp->mc_full, bkt_list);
			continue;
		}
		btp = cp->cache_bkttype;

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one. We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we drop the
			 * CPU lock above, the cache's bucket size may have
			 * changed. If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * Store it in the bucket object since we'll
			 * need to refer to it during bucket destroy;
			 * we can't safely refer to cache_bkttype as
			 * the bucket lock may not be acquired then.
			 */
			bkt->bkt_type = btp;

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			ASSERT(bkt->bkt_next == NULL);
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0) {
		mcache_notify(cp, MCN_RETRYALLOC);
	}

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}
890
891 /*
892 * Cache destruction routine.
893 */
/*
 * Tear down cache 'cp': unlink it from the global list, drain its CPU
 * and bucket layers, defuse its callbacks, and release its locks and
 * backing memory.  The caller must guarantee no further use of cp.
 */
__private_extern__ void
mcache_destroy(mcache_t *cp)
{
	void **pbuf;

	MCACHE_LIST_LOCK();
	LIST_REMOVE(cp, mc_list);
	MCACHE_LIST_UNLOCK();

	mcache_bkt_purge(cp);

	/*
	 * This cache is dead; there should be no further transaction.
	 * If it's still invoked, make sure that it induces a fault.
	 */
	cp->mc_slab_alloc = NULL;
	cp->mc_slab_free = NULL;
	cp->mc_slab_audit = NULL;

	lck_grp_free(cp->mc_bkt_lock_grp);
	lck_grp_free(cp->mc_cpu_lock_grp);
	lck_grp_free(cp->mc_sync_lock_grp);

	/*
	 * TODO: We need to destroy the zone here, but cannot do it
	 * because there is no such way to achieve that. Until then
	 * the memory allocated for the zone structure is leaked.
	 * Once it is achievable, uncomment these lines:
	 *
	 * if (cp->mc_slab_zone != NULL) {
	 *	zdestroy(cp->mc_slab_zone);
	 *	cp->mc_slab_zone = NULL;
	 * }
	 */

	/* Get the original address since we're about to free it */
	pbuf = (void **)((intptr_t)cp - sizeof(void *));

	zfree(mcache_zone, *pbuf);
}
934
935 /*
936 * Internal slab allocator used as a backend for simple caches. The current
937 * implementation uses the zone allocator for simplicity reasons.
938 */
/*
 * Internal slab allocator used as a backend for simple caches. The current
 * implementation uses the zone allocator for simplicity reasons.  Each
 * object is carved out of a zone chunk: the chunk holds a backpointer to
 * its own start (just below the aligned base) so the free path can hand
 * the original address back to zfree().
 */
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
    int wait)
{
#pragma unused(wait)
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	*list = NULL;

	for (;;) {
		/* Z_NOFAIL: this loop never sees a NULL chunk */
		buf = zalloc_flags(cp->mc_slab_zone, Z_WAITOK | Z_NOFAIL);

		/* Get the aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    cp->mc_align);

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;

		/* The object must fit within the chunk after alignment */
		VERIFY(((intptr_t)base + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiply; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		VERIFY(IS_P2ALIGNED(base, cp->mc_align));
		*list = (mcache_obj_t *)base;

		/* Append to the caller's chain and advance the tail */
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0) {
			break;
		}
	}

	return num - need;
}
997
998 /*
999 * Internal slab deallocator used as a backend for simple caches.
1000 */
static void
mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
{
	/*
	 * Slab-layer deallocator backend: walks the object chain and
	 * returns each underlying zone chunk (recovered from the stashed
	 * pointer below the aligned base) to the backing zone.
	 */
	mcache_t *cp = arg;
	mcache_obj_t *nlist;
	/* Length that was patternized at slab-alloc time (64-bit rounded) */
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *base;
	void **pbuf;

	for (;;) {
		/* Detach before the zone reclaims the memory */
		nlist = list->obj_next;
		list->obj_next = NULL;

		base = list;
		VERIFY(IS_P2ALIGNED(base, cp->mc_align));

		/* Get the original address since we're about to free it */
		pbuf = (void **)((intptr_t)base - sizeof(void *));

		VERIFY(((intptr_t)base + cp->mc_bufsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)*pbuf + cp->mc_chunksize));
			/* Check the free pattern is intact (no write-after-free) */
			mcache_audit_free_verify(NULL, base, 0, rsize);
		}

		/* Free it to zone */
		zfree(cp->mc_slab_zone, *pbuf);

		/* No more objects to free; return to mcache */
		if ((list = nlist) == NULL) {
			break;
		}
	}
}
1039
1040 /*
1041 * Internal slab auditor for simple caches.
1042 */
static void
mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mcache_t *cp = arg;
	/* Patternized region length (64-bit rounded, set at slab alloc) */
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	void *base, **pbuf;

	while (list != NULL) {
		mcache_obj_t *next = list->obj_next;

		base = list;
		VERIFY(IS_P2ALIGNED(base, cp->mc_align));

		/* Get the original address */
		pbuf = (void **)((intptr_t)base - sizeof(void *));

		VERIFY(((intptr_t)base + rsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (!alloc) {
			/* Object going free: fill it with the free pattern */
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		} else {
			/*
			 * Object being allocated: verify the free pattern
			 * is intact while replacing it with the
			 * uninitialized pattern.
			 */
			mcache_audit_free_verify_set(NULL, base, 0, rsize);
		}

		/*
		 * The pattern write above clobbered obj_next (it lives
		 * inside the buffer); restore the link as we advance.
		 */
		list = list->obj_next = next;
	}
}
1071
1072 /*
1073 * Refill the CPU's buckets with bkt and its follower (if any).
1074 */
1075 static void
mcache_cpu_batch_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1076 mcache_cpu_batch_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1077 {
1078 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1079 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1080 ASSERT(ccp->cc_bktsize > 0);
1081
1082 ccp->cc_filled = bkt;
1083 ccp->cc_objs = objs;
1084 if (__probable(bkt->bkt_next != NULL)) {
1085 ccp->cc_pfilled = bkt->bkt_next;
1086 ccp->cc_pobjs = objs;
1087 bkt->bkt_next = NULL;
1088 } else {
1089 ASSERT(bkt->bkt_next == NULL);
1090 ccp->cc_pfilled = NULL;
1091 ccp->cc_pobjs = -1;
1092 }
1093 }
1094
1095 /*
1096 * Refill the CPU's filled bucket with bkt and save the previous one.
1097 */
1098 static void
mcache_cpu_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1099 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1100 {
1101 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1102 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1103 ASSERT(ccp->cc_bktsize > 0);
1104
1105 ccp->cc_pfilled = ccp->cc_filled;
1106 ccp->cc_pobjs = ccp->cc_objs;
1107 ccp->cc_filled = bkt;
1108 ccp->cc_objs = objs;
1109 }
1110
1111 /*
1112 * Get one or more buckets from the bucket layer.
1113 */
1114 static uint32_t
mcache_bkt_batch_alloc(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t ** list,uint32_t num)1115 mcache_bkt_batch_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t **list,
1116 uint32_t num)
1117 {
1118 mcache_bkt_t *bkt_list = NULL;
1119 mcache_bkt_t *bkt;
1120 uint32_t need = num;
1121
1122 ASSERT(list != NULL && need > 0);
1123
1124 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1125 /*
1126 * The bucket layer lock is held by another CPU; increase
1127 * the contention count so that we can later resize the
1128 * bucket size accordingly.
1129 */
1130 MCACHE_LOCK(&cp->mc_bkt_lock);
1131 cp->mc_bkt_contention++;
1132 }
1133
1134 while ((bkt = blp->bl_list) != NULL) {
1135 blp->bl_list = bkt->bkt_next;
1136 bkt->bkt_next = bkt_list;
1137 bkt_list = bkt;
1138 if (--blp->bl_total < blp->bl_min) {
1139 blp->bl_min = blp->bl_total;
1140 }
1141 blp->bl_alloc++;
1142 if (--need == 0) {
1143 break;
1144 }
1145 }
1146
1147 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1148
1149 *list = bkt_list;
1150
1151 return num - need;
1152 }
1153
1154 /*
1155 * Return one or more buckets to the bucket layer.
1156 */
1157 static void
mcache_bkt_batch_free(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t * bkt)1158 mcache_bkt_batch_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1159 {
1160 mcache_bkt_t *nbkt;
1161
1162 MCACHE_LOCK(&cp->mc_bkt_lock);
1163 while (bkt != NULL) {
1164 nbkt = bkt->bkt_next;
1165 bkt->bkt_next = blp->bl_list;
1166 blp->bl_list = bkt;
1167 blp->bl_total++;
1168 bkt = nbkt;
1169 }
1170 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1171 }
1172
1173 /*
1174 * Enable the bucket layer of a cache.
1175 */
1176 static void
mcache_cache_bkt_enable(mcache_t * cp)1177 mcache_cache_bkt_enable(mcache_t *cp)
1178 {
1179 mcache_cpu_t *ccp;
1180 unsigned int cpu;
1181
1182 if (cp->mc_flags & MCF_NOCPUCACHE) {
1183 return;
1184 }
1185
1186 for (cpu = 0; cpu < ncpu; cpu++) {
1187 ccp = &cp->mc_cpu[cpu];
1188 MCACHE_LOCK(&ccp->cc_lock);
1189 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1190 MCACHE_UNLOCK(&ccp->cc_lock);
1191 }
1192 }
1193
1194 /*
1195 * Purge all buckets from a cache and disable its bucket layer.
1196 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	int objs, pobjs;
	unsigned int cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		/* Detach both CPU buckets and disable this CPU's layer */
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		/* Zero bucket size forces alloc/free to bypass the CPU layer */
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		/* Destroy the detached buckets outside the CPU lock */
		if (bp != NULL) {
			mcache_bkt_destroy(cp, bp, objs);
		}
		if (pbp != NULL) {
			mcache_bkt_destroy(cp, pbp, pobjs);
		}
	}

	/* Zero the working set, then reap everything in the bucket layer */
	mcache_bkt_ws_zero(cp);
	mcache_bkt_ws_reap(cp);
}
1233
1234 /*
1235 * Free one or more objects in the bucket to the slab layer,
1236 * and also free the bucket itself.
1237 */
static void
mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
{
	if (nobjs > 0) {
		/* bkt_obj[] fills bottom-up; the top entry heads the chain */
		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];

		if (cp->mc_flags & MCF_DEBUG) {
			mcache_obj_t *o = top;
			int cnt = 0;

			/*
			 * Verify that the chain of objects in the bucket is
			 * valid. Any mismatch here means a mistake when the
			 * object(s) were freed to the CPU layer, so we panic.
			 */
			while (o != NULL) {
				o = o->obj_next;
				++cnt;
			}
			if (cnt != nobjs) {
				panic("mcache_bkt_destroy: %s cp %p corrupted "
				    "list in bkt %p (nobjs %d actual %d)\n",
				    cp->mc_name, (void *)cp, (void *)bkt,
				    nobjs, cnt);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}

		/* Advise the slab layer to purge the object(s) */
		(*cp->mc_slab_free)(cp->mc_private, top,
		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
	}
	/* Return the (now empty) bucket itself to its bucket cache */
	mcache_free(bkt->bkt_type->bt_cache, bkt);
}
1273
1274 /*
1275 * Update the bucket layer working set statistics.
1276 */
1277 static void
mcache_bkt_ws_update(mcache_t * cp)1278 mcache_bkt_ws_update(mcache_t *cp)
1279 {
1280 MCACHE_LOCK(&cp->mc_bkt_lock);
1281
1282 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1283 cp->mc_full.bl_min = cp->mc_full.bl_total;
1284 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1285 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1286
1287 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1288 }
1289
1290 /*
1291 * Mark everything as eligible for reaping (working set is zero).
1292 */
1293 static void
mcache_bkt_ws_zero(mcache_t * cp)1294 mcache_bkt_ws_zero(mcache_t *cp)
1295 {
1296 MCACHE_LOCK(&cp->mc_bkt_lock);
1297
1298 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1299 cp->mc_full.bl_min = cp->mc_full.bl_total;
1300 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1301 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1302
1303 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1304 }
1305
1306 /*
1307 * Reap all buckets that are beyond the working set.
1308 */
1309 static void
mcache_bkt_ws_reap(mcache_t * cp)1310 mcache_bkt_ws_reap(mcache_t *cp)
1311 {
1312 mcache_bkt_t *bkt, *nbkt;
1313 uint32_t reap;
1314
1315 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1316 if (reap != 0) {
1317 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full, &bkt, reap);
1318 while (bkt != NULL) {
1319 nbkt = bkt->bkt_next;
1320 bkt->bkt_next = NULL;
1321 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1322 bkt = nbkt;
1323 }
1324 }
1325
1326 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1327 if (reap != 0) {
1328 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, reap);
1329 while (bkt != NULL) {
1330 nbkt = bkt->bkt_next;
1331 bkt->bkt_next = NULL;
1332 mcache_bkt_destroy(cp, bkt, 0);
1333 bkt = nbkt;
1334 }
1335 }
1336 }
1337
static void
mcache_reap_timeout(thread_call_param_t dummy __unused,
    thread_call_param_t arg)
{
	volatile UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	/* Cool-off interval elapsed; allow a new reap cycle to start */
	*flag = 0;
}
1348
1349 static void
mcache_reap_done(void * flag)1350 mcache_reap_done(void *flag)
1351 {
1352 uint64_t deadline, leeway;
1353
1354 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1355 &deadline);
1356 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1357 NSEC_PER_SEC, &leeway);
1358 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1359 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1360 }
1361
static void
mcache_reap_start(void *arg)
{
	UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	/* Reap every cache, then schedule clearing of the reaping flag */
	mcache_applyall(mcache_cache_reap);
	mcache_dispatch(mcache_reap_done, flag);
}
1372
__private_extern__ void
mcache_reap(void)
{
	UInt32 *flag = &mcache_reaping;

	/*
	 * Skip if this thread already holds the cache-list lock (the reap
	 * walk would deadlock) or if a reap is already in flight (the
	 * 0->1 compare-and-swap on the flag fails).
	 */
	if (mcache_llock_owner == current_thread() ||
	    !OSCompareAndSwap(0, 1, flag)) {
		return;
	}

	/* Run asynchronously; reaping may be slow */
	mcache_dispatch(mcache_reap_start, flag);
}
1385
1386 __private_extern__ void
mcache_reap_now(mcache_t * cp,boolean_t purge)1387 mcache_reap_now(mcache_t *cp, boolean_t purge)
1388 {
1389 if (purge) {
1390 mcache_bkt_purge(cp);
1391 mcache_cache_bkt_enable(cp);
1392 } else {
1393 mcache_bkt_ws_zero(cp);
1394 mcache_bkt_ws_reap(cp);
1395 }
1396 }
1397
static void
mcache_cache_reap(mcache_t *cp)
{
	/* Per-cache callback applied by mcache_applyall() during a reap */
	mcache_bkt_ws_reap(cp);
}
1403
/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);

	/* Advance the working-set window on both bucket lists */
	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point of
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
		need_bkt_reenable = 1;
	}
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable) {
		need_bkt_resize = 1;
	}

	/* Snapshot contention for the next interval's delta */
	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	/* Dispatch asynchronously; both operations may block */
	if (need_bkt_resize) {
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	} else if (need_bkt_reenable) {
		mcache_dispatch(mcache_cache_enable, cp);
	}
}
1451
1452 /*
1453 * Recompute a cache's bucket size. This is an expensive operation
1454 * and should not be done frequently; larger buckets provide for a
1455 * higher transfer rate with the bucket while smaller buckets reduce
1456 * the memory consumption.
1457 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		/*
		 * Purge first so no buckets of the old size linger once
		 * the cache switches to the larger bucket type.
		 */
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize request.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		/* ++btp: advance to the next entry of the bucket-type table */
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		mcache_cache_enable(cp);
	}
}
1480
1481 /*
1482 * Reenable a previously disabled cache due to purge.
1483 */
1484 static void
mcache_cache_enable(void * arg)1485 mcache_cache_enable(void *arg)
1486 {
1487 mcache_t *cp = arg;
1488
1489 lck_mtx_lock_spin(&cp->mc_sync_lock);
1490 cp->mc_purge_cnt = 0;
1491 cp->mc_enable_cnt = 0;
1492 lck_mtx_unlock(&cp->mc_sync_lock);
1493
1494 mcache_cache_bkt_enable(cp);
1495 }
1496
1497 static void
mcache_update_timeout(__unused void * arg)1498 mcache_update_timeout(__unused void *arg)
1499 {
1500 uint64_t deadline, leeway;
1501
1502 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1503 &deadline);
1504 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1505 NSEC_PER_SEC, &leeway);
1506 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1507 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1508 }
1509
static void
mcache_update(thread_call_param_t arg __unused,
    thread_call_param_t dummy __unused)
{
	/* Periodic maintenance pass over every cache, then re-arm */
	mcache_applyall(mcache_cache_update);
	mcache_update_timeout(NULL);
}
1517
1518 static void
mcache_applyall(void (* func)(mcache_t *))1519 mcache_applyall(void (*func)(mcache_t *))
1520 {
1521 mcache_t *cp;
1522
1523 MCACHE_LIST_LOCK();
1524 LIST_FOREACH(cp, &mcache_head, mc_list) {
1525 func(cp);
1526 }
1527 MCACHE_LIST_UNLOCK();
1528 }
1529
static void
mcache_dispatch(void (*func)(void *), void *arg)
{
	ASSERT(func != NULL);
	/* Run func(arg) asynchronously, ~1ms from now, off this context */
	timeout(func, arg, hz / 1000);
}
1536
__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
    struct timeval *base_ts)
{
	struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
	void *stack[MCACHE_STACK_DEPTH + 1];
	struct mca_trn *transaction;

	/* Record into the next slot of the circular transaction log */
	transaction = &mca->mca_trns[mca->mca_next_trn];

	mca->mca_addr = addr;
	mca->mca_cache = cp;

	transaction->mca_thread = current_thread();

	bzero(stack, sizeof(stack));
	/* Capture one extra frame, then drop it (this function itself) */
	transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], transaction->mca_stack,
	    sizeof(transaction->mca_stack));

	microuptime(&now);
	if (base_ts != NULL) {
		base = *base_ts;
	}
	/* tstamp is in ms relative to base_ts */
	transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	if ((now.tv_sec - base.tv_sec) > 0) {
		transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
	}

	/* Advance the circular log index */
	mca->mca_next_trn =
	    (mca->mca_next_trn + 1) % mca_trn_max;
}
1570
1571 /*
1572 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1573 * mcache_verify_set_pattern() are marked as noinline to prevent the
1574 * compiler from aliasing pointers when they are inlined inside the callers
1575 * (e.g. mcache_audit_free_verify_set()) which would be undefined behavior.
1576 */
1577 __private_extern__ OS_NOINLINE void
mcache_set_pattern(u_int64_t pattern,void * buf_arg,size_t size)1578 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1579 {
1580 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1581 u_int64_t *buf = (u_int64_t *)buf_arg;
1582
1583 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1584 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1585
1586 while (buf < buf_end) {
1587 *buf++ = pattern;
1588 }
1589 }
1590
1591 __private_extern__ OS_NOINLINE void *
mcache_verify_pattern(u_int64_t pattern,void * buf_arg,size_t size)1592 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1593 {
1594 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1595 u_int64_t *buf;
1596
1597 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1598 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1599
1600 for (buf = buf_arg; buf < buf_end; buf++) {
1601 if (*buf != pattern) {
1602 return buf;
1603 }
1604 }
1605 return NULL;
1606 }
1607
OS_NOINLINE static void *
mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
    size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != old) {
			/*
			 * Mismatch: roll back the words already rewritten
			 * so the caller can report the buffer in its
			 * pre-verification state.
			 */
			mcache_set_pattern(old, buf_arg,
			    (uintptr_t)buf - (uintptr_t)buf_arg);
			return buf;
		}
		*buf = new;
	}
	return NULL;
}
1628
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/*
	 * For the "obj_next" pointer in the buffer: it legitimately
	 * differs from the free pattern (it holds the freelist link),
	 * so stamp the pattern over its word before verifying, and
	 * restore the saved link once verification passes.
	 */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		/* Buffer modified after free; report the offending word */
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
1652
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/*
	 * For the "obj_next" pointer in the buffer: it legitimately
	 * differs from the free pattern (it holds the freelist link),
	 * so stamp the pattern over its word before verifying, and
	 * restore the saved link afterwards.
	 */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	/* Verify the free pattern while replacing it with "uninitialized" */
	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
1676
/*
 * From this point on we need the real kernel panic(); undo any
 * assert-oriented remapping of the symbol that may be in effect.
 */
#undef panic

/* Format for one logged transaction: thread, depth and saved PC stack */
#define DUMP_TRN_FMT() \
	"%s transaction thread %p saved PC stack (%d deep):\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"

/* Argument list matching DUMP_TRN_FMT() for transaction slot x */
#define DUMP_TRN_FIELDS(s, x) \
	s, \
	mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
	mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
	mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
	mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
	mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
	mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
	mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
	mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
	mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]

/* Indices of the most recent and the preceding slot in the circular log */
#define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
#define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1698
__private_extern__ char *
mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
{
	/*
	 * Render the audit record plus its two most recent transactions
	 * into the caller-supplied buffer and return that buffer.
	 */
	snprintf(buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
	    DUMP_TRN_FMT()
	    DUMP_TRN_FMT(),

	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_next_trn,

	    DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
	    DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));

	return buf;
}
1716
__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	char buf[DUMP_MCA_BUF_SIZE];

	/* No audit record: report only the raw corruption details */
	if (mca == NULL) {
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* Include the decoded audit record (last two transactions) */
	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
	__builtin_unreachable();
}
1737
/* Backend for the ASSERT()/VERIFY() macros; panics and never returns */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
1746