xref: /xnu-11215.81.4/bsd/skywalk/mem/skmem_slab.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <vm/vm_memtag.h>
35 
36 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
37 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
38 
39 /*
40  * Too big a value will cause overflow and thus trip the assertion; the
41  * idea here is to set an upper limit for the time that a particular
42  * thread is allowed to perform retries before we give up and panic.
43  */
44 #define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* seconds */
45 
46 /*
47  * Threshold (in msec) after which we reset the exponential backoff value
48  * back to its (random) initial value.  Note that we allow the actual delay
49  * to be at most twice this value.
50  */
51 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
52 
53 /*
54  * To reduce the likelihood of global synchronization between threads,
55  * we use some random value to start the exponential backoff.
56  */
57 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
58 
/*
 * Create a slab.
 *
 * Allocates one segment from the cache's backing region and carves it
 * into sl_chunks raw objects of skm_objsize bytes each, attaching one
 * skmem_bufctl to every chunk.  On any allocation failure the partial
 * construction is unwound (in reverse order), the cache's
 * skm_sl_alloc_fail counter is bumped, and NULL is returned.
 */
static struct skmem_slab *
skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_region *skr = skm->skm_region;
	uint32_t objsize, chunks;
	size_t slabsize = skm->skm_slabsize;
	struct skmem_slab *__single sl;
	struct sksegment *__single sg, *__single sgm;
	char *buf, *__indexable slab;
	char *__indexable bufm;		/* cursor into the mirrored (slave) buffer */
	uint32_t slabm_size;
	void *__sized_by(slabm_size) slabm;

	/*
	 * Allocate a segment (a slab at our layer) from the region.
	 */
	slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
	    skr->skr_params.srp_c_seg_size, &slabm_size);
	if (slab == NULL) {
		goto rg_alloc_failure;
	}

	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
		goto slab_alloc_failure;
	}

	/* a mirrored segment, when present, shares the master's index */
	ASSERT(sg != NULL);
	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);

	bzero(sl, sizeof(*sl));
	sl->sl_cache = skm;
	sl->sl_base = buf = slab;
	bufm = slabm;
	objsize = (uint32_t)skr->skr_c_obj_size;
	sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
	ASSERT(skm->skm_objsize == objsize);
	ASSERT((slabsize / objsize) <= UINT32_MAX);
	/* number of whole objects that fit in the segment */
	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
	sl->sl_seg = sg;
	sl->sl_segm = sgm;

	/*
	 * Create one or more buffer control structures for the slab,
	 * each one tracking a chunk of raw object from the segment,
	 * and insert these into the slab's list of buffer controls.
	 */
	ASSERT(chunks > 0);
	while (chunks != 0) {
		struct skmem_bufctl *__indexable bc;
		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
		if (bc == NULL) {
			goto bufctl_alloc_failure;
		}

		/*
		 * NOTE(review): bc_size is defined outside this view —
		 * presumably the effective bufctl size (audit-aware);
		 * confirm against the file's globals.
		 */
		bzero(bc, bc_size);
		bc->bc_lim = objsize;
		bc->bc_addr = buf;
		bc->bc_addrm = bufm;
		bc->bc_slab = sl;
		bc->bc_idx = (sl->sl_chunks - chunks);	/* 0-based chunk index */
		if (skr->skr_mode & SKR_MODE_SHAREOK) {
			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
		}
		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
		buf += objsize;
		if (bufm != NULL) {
			/* XXX -fbounds-safety */
			bufm = (char *)bufm + objsize;
		}
		--chunks;
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA(slab + objsize));

	return sl;

	/* unwind in reverse order of construction */
bufctl_alloc_failure:
	skmem_slab_destroy(skm, sl);

slab_alloc_failure:
	skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
	    slabm, skr->skr_c_obj_size));

rg_alloc_failure:
	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);

	return NULL;
}
154 
/*
 * Destroy a slab.
 *
 * Frees every buffer control on the slab's freelist, the slab header
 * itself, and finally returns the backing segment to its region.
 * The slab must have no outstanding objects (sl_refcnt == 0).
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	void *__single slab = sl->sl_base;
	void *__single slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
185 
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Called with the slab lock held; the lock may be dropped and
 * re-acquired while a new slab is created.  On success, fills in 'oi'
 * for the master object (and 'oim' for the mirrored/slave object when
 * the caller supplies one) and returns 0.  ENOMEM is returned only in
 * non-sleeping mode; in sleeping mode the routine retries with
 * exponential backoff and panics if it cannot make progress within
 * SKMEM_SLAB_MAX_BACKOFF.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;                /* in usec */
	uint64_t boff = 0;                      /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_offset_t tagged_address;             /* address tagging */
	struct skmem_region *region;            /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec);
			 * NSEC_PER_USEC (1000) doubles as the msec-to-usec
			 * conversion factor here.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				/* sleeping mode ran out of backoff budget */
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track the high-watermark of raw objects in use */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * If this region is configured to be tagged, we generate a
		 * unique tag for the object address, and return this tagged
		 * address to the caller. vm_memtag_assign_tag generates a
		 * unique tag for the given address and size, and
		 * vm_memtag_set_tag commits the tag to the backing memory
		 * metadata. This tagged address is returned back to the client,
		 * and when the client frees the address, we "re-tag" the
		 * address to prevent against use-after-free attacks (more on
		 * this in skmem_cache_batch_free).
		 */
		tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_address, skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have to use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_lim (they're both
		 * set to skr->skr_c_obj_size)
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index: slabs are uniform, so scale by chunks */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* in batch mode the object doubles as a list node; clear its link */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
440 
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * Objects come straight from the region's backing zone (skr_zreg)
 * instead of a slab/segment; mirrored regions and batching are not
 * supported here.  Returns 0 on success with 'oi' filled in, or
 * ENOMEM if the zone allocation fails.
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* the aligned object must still fit inside the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* no slave object in the pseudo variant */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
535 
536 /*
537  * Allocate a raw object from the slab layer.
538  */
539 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)540 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
541     struct skmem_obj_info *oim, uint32_t skmflag)
542 {
543 	int err;
544 
545 	SKM_SLAB_LOCK(skm);
546 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
547 	SKM_SLAB_UNLOCK(skm);
548 
549 	return err;
550 }
551 
552 /*
553  * Allocate raw object(s) from the slab layer.
554  */
555 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)556 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
557     uint32_t num, uint32_t skmflag)
558 {
559 	uint32_t need = num;
560 
561 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
562 	*list = NULL;
563 
564 	SKM_SLAB_LOCK(skm);
565 	for (;;) {
566 		struct skmem_obj_info oi, oim;
567 
568 		/*
569 		 * Get a single raw object from the slab layer.
570 		 */
571 		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
572 			break;
573 		}
574 
575 		*list = SKMEM_OBJ_ADDR(&oi);
576 		ASSERT((*list)->mo_next == NULL);
577 		/* store these inside the object itself */
578 		(*list)->mo_info = oi;
579 		(*list)->mo_minfo = oim;
580 		list = &(*list)->mo_next;
581 
582 		ASSERT(need != 0);
583 		if (--need == 0) {
584 			break;
585 		}
586 	}
587 	SKM_SLAB_UNLOCK(skm);
588 
589 	return num - need;
590 }
591 
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Looks the object up in the allocated-address hash, returns its
 * buffer control to the owning slab's freelist, and — when the slab's
 * last outstanding object comes back — destroys the slab (dropping
 * the slab lock around the destroy).  Panics on a bogus or
 * already-freed address.
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_offset_t tagged_addr;
	/*
	 * If buf is tagged, then addr would have the canonicalized address.
	 * If buf is untagged, then addr is same as buf.
	 */
	void *addr = __unsafe_forge_bidi_indexable(void *,
	    vm_memtag_canonicalize_address((vm_offset_t)buf), skm->skm_objsize);
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, then buf is a
	 * tagged address. When we search for the buffer control from the hash
	 * table, we need to use the untagged address, because buffer control
	 * maintains untagged address (bc_addr). vm_memtag_canonicalize_address
	 * returns the untagged address.
	 */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == addr) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#else /* !CONFIG_KERNEL_TAGGING */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* bc is NULL here only if the hash-chain walk found no match */
	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);

#if CONFIG_KERNEL_TAGGING
	/*
	 * We use untagged address here, because SKMEM_SLAB_MEMBER compares the
	 * address against sl_base, which is untagged.
	 */
	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
#else /* !CONFIG_KERNEL_TAGGING */
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
#endif /* CONFIG_KERNEL_TAGGING */

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		/* XXX -fbounds-safety: forge a bounded view over 'buf' */
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, we re-tag this
	 * address as the object is freed. We do the re-tagging in the magazine
	 * layer too, but in case we need to free raw objects to the slab layer
	 * (either because SKM_MODE_NOMAGAZINES is set, or the magazine layer
	 * was not able to allocate empty magazines), we re-tag the addresses
	 * here in the slab layer. Freeing to the slab layer is symmetrical to
	 * allocating from the slab layer - when we allocate from slab layer, we
	 * tag the address, and then construct the object; when we free to the
	 * slab layer, we destruct the object, and retag the address.
	 * We do the re-tagging here, because this is right after the last usage
	 * of the buf variable (which is tagged).
	 */
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 * The slab lock is dropped around skmem_slab_destroy()
		 * and re-acquired before returning.
		 */
		if (sl->sl_chunks == 1) {
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
756 
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * Returns the object directly to the region's backing zone.  In KASAN
 * builds, recovers the original (unaligned) zone element address that
 * skmem_slab_alloc_pseudo_locked() stashed just before the aligned
 * buffer, and frees that instead.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* the aligned buffer must lie within the recovered element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
794 
795 /*
796  * Free a raw object to the slab layer.
797  */
798 void
skmem_slab_free(struct skmem_cache * skm,void * buf)799 skmem_slab_free(struct skmem_cache *skm, void *buf)
800 {
801 	if (skm->skm_mode & SKM_MODE_BATCH) {
802 		((struct skmem_obj *)buf)->mo_next = NULL;
803 	}
804 
805 	SKM_SLAB_LOCK(skm);
806 	skm->skm_slab_free(skm, buf);
807 	SKM_SLAB_UNLOCK(skm);
808 }
809 
810 /*
811  * Free raw object(s) to the slab layer.
812  */
813 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)814 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
815 {
816 	struct skmem_obj *listn;
817 
818 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
819 
820 	SKM_SLAB_LOCK(skm);
821 	for (;;) {
822 		listn = list->mo_next;
823 		list->mo_next = NULL;
824 
825 		/*
826 		 * Free a single object to the slab layer.
827 		 */
828 		skm->skm_slab_free(skm, (void *)list);
829 
830 		/* if no more objects to free, we're done */
831 		if ((list = listn) == NULL) {
832 			break;
833 		}
834 	}
835 	SKM_SLAB_UNLOCK(skm);
836 }
837 
838 
839 /*
840  * Given a buffer control, record the current transaction.
841  */
842 __attribute__((noinline, cold, not_tail_called))
843 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)844 skmem_audit_bufctl(struct skmem_bufctl *bc)
845 {
846 	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
847 	struct timeval tv;
848 
849 	microuptime(&tv);
850 	bca->bc_thread = current_thread();
851 	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
852 	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
853 }
854