xref: /xnu-11417.140.69/bsd/skywalk/mem/skmem_slab.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <vm/vm_memtag.h>
35 
36 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
37 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
38 
39 /*
40  * Too big a value will cause overflow and thus trip the assertion; the
41  * idea here is to set an upper limit for the time that a particular
42  * thread is allowed to perform retries before we give up and panic.
43  */
#define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* usec (20 sec) */
45 
46 /*
47  * Threshold (in msec) after which we reset the exponential backoff value
48  * back to its (random) initial value.  Note that we allow the actual delay
49  * to be at most twice this value.
50  */
51 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
52 
53 /*
54  * To reduce the likelihood of global synchronization between threads,
55  * we use some random value to start the exponential backoff.
56  */
57 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
58 
59 /*
60  * Create a slab.
61  */
62 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)63 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
64 {
65 	struct skmem_region *skr = skm->skm_region;
66 	uint32_t objsize, chunks;
67 	size_t slabsize = skm->skm_slabsize;
68 	struct skmem_slab *__single sl;
69 	struct sksegment *__single sg, *__single sgm;
70 	char *buf, *__indexable slab;
71 	char *__indexable bufm;
72 	uint32_t slabm_size;
73 	void *__sized_by(slabm_size) slabm;
74 
75 	/*
76 	 * Allocate a segment (a slab at our layer) from the region.
77 	 */
78 	slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
79 	    skr->skr_params.srp_c_seg_size, &slabm_size);
80 	if (slab == NULL) {
81 		goto rg_alloc_failure;
82 	}
83 
84 	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
85 		goto slab_alloc_failure;
86 	}
87 
88 	ASSERT(sg != NULL);
89 	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
90 
91 	bzero(sl, sizeof(*sl));
92 	sl->sl_cache = skm;
93 	sl->sl_base = buf = slab;
94 	bufm = slabm;
95 	objsize = (uint32_t)skr->skr_c_obj_size;
96 	sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
97 	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
98 	ASSERT(skm->skm_objsize == objsize);
99 	ASSERT((slabsize / objsize) <= UINT32_MAX);
100 	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
101 	sl->sl_seg = sg;
102 	sl->sl_segm = sgm;
103 
104 	/*
105 	 * Create one or more buffer control structures for the slab,
106 	 * each one tracking a chunk of raw object from the segment,
107 	 * and insert these into the slab's list of buffer controls.
108 	 */
109 	ASSERT(chunks > 0);
110 	while (chunks != 0) {
111 		struct skmem_bufctl *__indexable bc;
112 		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
113 		if (bc == NULL) {
114 			goto bufctl_alloc_failure;
115 		}
116 
117 		bzero(bc, bc_size);
118 		bc->bc_lim = objsize;
119 		bc->bc_addr = buf;
120 		bc->bc_addrm = bufm;
121 		bc->bc_slab = sl;
122 		bc->bc_idx = (sl->sl_chunks - chunks);
123 		if (skr->skr_mode & SKR_MODE_SHAREOK) {
124 			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
125 		}
126 		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
127 		buf += objsize;
128 		if (bufm != NULL) {
129 			/* XXX -fbounds-safety */
130 			bufm = (char *)bufm + objsize;
131 		}
132 		--chunks;
133 	}
134 
135 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
136 	    SK_KVA(skm), SK_KVA(sl));
137 	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
138 	    SK_KVA(slab), SK_KVA(slab + objsize));
139 
140 	return sl;
141 
142 bufctl_alloc_failure:
143 	skmem_slab_destroy(skm, sl);
144 
145 slab_alloc_failure:
146 	skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
147 	    slabm, skr->skr_c_obj_size));
148 
149 rg_alloc_failure:
150 	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
151 
152 	return NULL;
153 }
154 
155 /*
156  * Destroy a slab.
157  */
/*
 * Destroy a slab.
 *
 * Frees every buffer control on the slab's freelist, frees the slab
 * structure back to its cache, and finally returns the backing segment
 * to the region.  Must only be called when no objects are outstanding
 * (sl_refcnt == 0), i.e. all bufctls are on sl_head.
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	/* snapshot base addresses; sl is freed before they are used below */
	void *__single slab = sl->sl_base;
	void *__single slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	/* sl must not be dereferenced past this point */
	skmem_cache_free(skmem_slab_cache, sl);

	/*
	 * Restore original tag before freeing back to system. sl->sl_base should
	 * have the original tag.
	 */
	if (skm->skm_region->skr_bufspec.memtag) {
		vm_memtag_store_tag(slab, skm->skm_slabsize);
	}

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
193 
194 /*
195  * Allocate a raw object from the (locked) slab layer.  Normal region variant.
196  */
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Called with the slab lock held.  Picks a free buffer control off the
 * first partially-allocated slab (creating a fresh slab from the region
 * if none exists), moves the bufctl onto the allocated-address hash
 * chain, and fills in the caller's object info (oi) and, for mirrored
 * regions, the slave object info (oim).  Returns 0 on success, ENOMEM
 * on failure.
 *
 * In sleeping mode (no SKMEM_NOSLEEP), a failed slab creation — e.g. a
 * race where another thread grabbed the last segment — is retried with
 * randomized exponential backoff; if the cumulative backoff exceeds
 * SKMEM_SLAB_MAX_BACKOFF we panic rather than hang silently.  The lock
 * is dropped and reacquired around slab creation.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;                /* in usec */
	uint64_t boff = 0;                      /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_map_address_t tagged_address;        /* address tagging */
	struct skmem_region *region;            /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec).
			 * NSEC_PER_USEC (1000) happens to be the right
			 * msec->usec multiplier, despite its name.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* let skmem_slab_create() fail instead of sleeping */
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				/* sleeping-mode caller: exhausted backoff budget */
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* delay() takes usec; boff is in msec */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track high-watermark of objects brought into play */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/* generate a fresh tag and hand the tagged address out */
		tagged_address = (vm_map_address_t)vm_memtag_generate_and_store_tag(bc->bc_addr,
		    skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have to use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_lim (they're both
		 * set to skr->skr_c_obj_size)
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index: segments are uniformly sized */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			/* mirrored object shares size and indices with master */
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* batched caches chain objects through mo_next; start unlinked */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
436 
437 /*
438  * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
439  */
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * Pseudo regions are backed by a zalloc zone rather than sksegments:
 * each "slab object" is a zone element.  Called with the slab lock held;
 * fills in oi (and zeroes oim, since mirroring does not apply here).
 * Returns 0 on success, ENOMEM when the zone allocation fails.
 *
 * Under KASAN the zone element is over-sized and the aligned object is
 * placed inside it, with the original element address stashed one
 * pointer-size before the object so the free path can recover it.
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* pseudo regions have a zone (skr_zreg), never a real region map */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* the aligned object must fit entirely inside the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	/* report the usable size, excluding the alignment slack */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	/* bookkeeping: allocation count and in-use high-watermark */
	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
531 
532 /*
533  * Allocate a raw object from the slab layer.
534  */
535 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)536 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
537     struct skmem_obj_info *oim, uint32_t skmflag)
538 {
539 	int err;
540 
541 	SKM_SLAB_LOCK(skm);
542 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
543 	SKM_SLAB_UNLOCK(skm);
544 
545 	return err;
546 }
547 
548 /*
549  * Allocate raw object(s) from the slab layer.
550  */
551 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)552 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
553     uint32_t num, uint32_t skmflag)
554 {
555 	uint32_t need = num;
556 
557 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
558 	*list = NULL;
559 
560 	SKM_SLAB_LOCK(skm);
561 	for (;;) {
562 		struct skmem_obj_info oi, oim;
563 
564 		/*
565 		 * Get a single raw object from the slab layer.
566 		 */
567 		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
568 			break;
569 		}
570 
571 		*list = SKMEM_OBJ_ADDR(&oi);
572 		ASSERT((*list)->mo_next == NULL);
573 		/* store these inside the object itself */
574 		(*list)->mo_info = oi;
575 		(*list)->mo_minfo = oim;
576 		list = &(*list)->mo_next;
577 
578 		ASSERT(need != 0);
579 		if (--need == 0) {
580 			break;
581 		}
582 	}
583 	SKM_SLAB_UNLOCK(skm);
584 
585 	return num - need;
586 }
587 
588 /*
589  * Free a raw object to the (locked) slab layer.  Normal region variant.
590  */
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Called with the slab lock held.  Looks up the object's buffer control
 * on the allocated-address hash chain (panicking on a bogus or already
 * freed address), returns the bufctl to its slab's freelist, and updates
 * the slab's position on the partial/empty lists.  When the last
 * outstanding object of a slab comes back, the slab is destroyed and its
 * segment returned to the region (the lock is dropped around that).
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_map_address_t tagged_addr;
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

	/* compare canonical (tag-stripped) addresses for the memtag case */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (SKMEM_COMPARE_CANONICAL_ADDR(bc->bc_addr, buf, skm->skm_objsize)) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, SKMEM_MEMTAG_STRIP_TAG(buf, skm->skm_objsize)));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		/* forge a bounded view of the caller's raw pointer for bzero */
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * Re-tag on free so stale (dangling) pointers fault; the
		 * side effect of storing the new tag is what matters here,
		 * the returned tagged address itself is unused.
		 */
		tagged_addr = (vm_map_address_t)vm_memtag_generate_and_store_tag(buf, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			/* single-chunk slabs live on the empty list when in use */
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock across the (potentially expensive) teardown */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	/* bc was just pushed at the head above */
	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
705 
706 /*
707  * Free a raw object to the (locked) slab layer.  Pseudo region variant.
708  */
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * Returns a zone-backed object to its zalloc zone.  Under KASAN the
 * original zone element address was stashed one pointer-size before the
 * aligned object at allocation time; recover it before freeing.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	/* pseudo regions are zone-backed, never map-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* the aligned object must lie entirely within the zone element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	/* bookkeeping: free count and in-use count */
	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
743 
744 /*
745  * Free a raw object to the slab layer.
746  */
747 void
skmem_slab_free(struct skmem_cache * skm,void * buf)748 skmem_slab_free(struct skmem_cache *skm, void *buf)
749 {
750 	if (skm->skm_mode & SKM_MODE_BATCH) {
751 		((struct skmem_obj *)buf)->mo_next = NULL;
752 	}
753 
754 	SKM_SLAB_LOCK(skm);
755 	skm->skm_slab_free(skm, buf);
756 	SKM_SLAB_UNLOCK(skm);
757 }
758 
759 /*
760  * Free raw object(s) to the slab layer.
761  */
762 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)763 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
764 {
765 	struct skmem_obj *listn;
766 
767 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
768 
769 	SKM_SLAB_LOCK(skm);
770 	for (;;) {
771 		listn = list->mo_next;
772 		list->mo_next = NULL;
773 
774 		/*
775 		 * Free a single object to the slab layer.
776 		 */
777 		skm->skm_slab_free(skm, (void *)list);
778 
779 		/* if no more objects to free, we're done */
780 		if ((list = listn) == NULL) {
781 			break;
782 		}
783 	}
784 	SKM_SLAB_UNLOCK(skm);
785 }
786 
787 
788 /*
789  * Given a buffer control, record the current transaction.
790  */
791 __attribute__((noinline, cold, not_tail_called))
792 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)793 skmem_audit_bufctl(struct skmem_bufctl *bc)
794 {
795 	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
796 	struct timeval tv;
797 
798 	microuptime(&tv);
799 	bca->bc_thread = current_thread();
800 	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
801 	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
802 }
803