xref: /xnu-12377.81.4/bsd/skywalk/mem/skmem_slab.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <kern/uipc_domain.h>
35 #include <vm/vm_memtag.h>
36 
37 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
38 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
39 
40 /*
41  * Too big a value will cause overflow and thus trip the assertion; the
42  * idea here is to set an upper limit for the time that a particular
43  * thread is allowed to perform retries before we give up and panic.
44  */
#define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* 20 seconds, in usec */
46 
47 /*
48  * Threshold (in msec) after which we reset the exponential backoff value
49  * back to its (random) initial value.  Note that we allow the actual delay
50  * to be at most twice this value.
51  */
52 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
53 
54 /*
55  * To reduce the likelihood of global synchronization between threads,
56  * we use some random value to start the exponential backoff.
57  */
58 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
59 
60 /*
61  * Create a slab.
62  */
/*
 * Carve a new slab out of the cache's backing region.  Allocates one
 * segment from the region, wraps it in a struct skmem_slab, and builds
 * one bufctl per object-sized chunk on the slab's freelist.  Returns
 * NULL on failure (allocation-failure counter is bumped); on success
 * the caller owns the slab and is responsible for inserting it into
 * the cache's slab lists.
 */
static struct skmem_slab *
skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_region *skr = skm->skm_region;
	uint32_t objsize, chunks;
	size_t slabsize = skm->skm_slabsize;
	struct skmem_slab *__single sl;
	struct sksegment *__single sg, *__single sgm;
	char *buf, *__indexable slab;
	char *__indexable bufm;
	uint32_t slabm_size;
	void *__sized_by(slabm_size) slabm;

	/*
	 * Allocate a segment (a slab at our layer) from the region.
	 * 'slabm'/'sgm' are the mirrored counterparts, if any.
	 */
	slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
	    skr->skr_params.srp_c_seg_size, &slabm_size);
	if (slab == NULL) {
		goto rg_alloc_failure;
	}

	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
		goto slab_alloc_failure;
	}

	ASSERT(sg != NULL);
	/* a mirrored segment, when present, tracks the master's index */
	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);

	bzero(sl, sizeof(*sl));
	sl->sl_cache = skm;
	sl->sl_base = buf = slab;
	bufm = slabm;
	objsize = (uint32_t)skr->skr_c_obj_size;
	sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
	ASSERT(skm->skm_objsize == objsize);
	ASSERT((slabsize / objsize) <= UINT32_MAX);
	/* number of object-sized chunks this segment can hold */
	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
	sl->sl_seg = sg;
	sl->sl_segm = sgm;

	/*
	 * Create one or more buffer control structures for the slab,
	 * each one tracking a chunk of raw object from the segment,
	 * and insert these into the slab's list of buffer controls.
	 */
	ASSERT(chunks > 0);
	while (chunks != 0) {
		struct skmem_bufctl *__indexable bc;
		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
		if (bc == NULL) {
			goto bufctl_alloc_failure;
		}

		/*
		 * NOTE(review): bc_size is not declared in this file
		 * chunk; presumably a file-scope bufctl size (audit vs.
		 * non-audit) -- confirm against the rest of the file.
		 */
		bzero(bc, bc_size);
		bc->bc_lim = objsize;
		bc->bc_addr = buf;
		bc->bc_addrm = bufm;
		bc->bc_slab = sl;
		/* chunk index within the slab, counting up from 0 */
		bc->bc_idx = (sl->sl_chunks - chunks);
		if (skr->skr_mode & SKR_MODE_SHAREOK) {
			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
		}
		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
		buf += objsize;
		if (bufm != NULL) {
			/* XXX -fbounds-safety */
			bufm = (char *)bufm + objsize;
		}
		--chunks;
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [%p-%p)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA(slab + objsize));

	return sl;

bufctl_alloc_failure:
	/* frees any bufctls created so far, plus the slab and segment */
	skmem_slab_destroy(skm, sl);

slab_alloc_failure:
	/*
	 * NOTE(review): the mirrored size passed here is skr_c_obj_size,
	 * not the slabm_size returned by skmem_region_alloc -- looks
	 * intentional for the forge annotation only; verify.
	 */
	skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
	    slabm, skr->skr_c_obj_size));

rg_alloc_failure:
	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);

	return NULL;
}
155 
156 /*
157  * Destroy a slab.
158  */
/*
 * Destroy a slab: free all of its buffer controls and the slab
 * structure back to their caches, then release the backing segment
 * to the region.  The slab must have no outstanding objects
 * (sl_refcnt == 0) and must already be off the cache's slab lists.
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	void *__single slab = sl->sl_base;
	void *__single slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [%p-%p)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/*
	 * Restore original tag before freeing back to system. sl->sl_base should
	 * have the original tag.
	 */
	if (skm->skm_region->skr_bufspec.memtag) {
		vm_memtag_store_tag(slab, skm->skm_slabsize);
	}

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
194 
195 /*
196  * Allocate a raw object from the (locked) slab layer.  Normal region variant.
197  */
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region
 * variant.  Called and returns with the slab lock held, though the
 * lock is dropped internally while creating a new slab or sleeping in
 * the backoff path.  On success, fills in 'oi' (master object info)
 * and optionally 'oim' (slave/mirrored object info) and returns 0;
 * returns ENOMEM only for SKMEM_NOSLEEP callers -- sleeping callers
 * retry with exponential backoff and eventually panic if the region
 * stays exhausted past SKMEM_SLAB_MAX_BACKOFF.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;                /* in usec */
	uint64_t boff = 0;                      /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_map_address_t tagged_address;        /* address tagging */
	struct skmem_region *region;            /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		/* drop the lock across slab creation / backoff sleep */
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec);
			 * NSEC_PER_USEC is numerically the msec->usec
			 * multiplier (1000).
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* allow slab creation to fail without sleeping */
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				/* backoff budget exhausted: give up loudly */
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* boff msec expressed in usec for delay() */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track the high-water mark of in-use buffers */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/* hand out a freshly-tagged address for this object */
		tagged_address = (vm_map_address_t)vm_memtag_generate_and_store_tag(bc->bc_addr,
		    skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have to use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_addr (they're both
		 * set to skr->skr_c_obj_size)
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index: segment index scaled by chunks/slab */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	if (skm->skm_mode & SKM_MODE_BATCH) {
		/* caller expects a clean linkage field on batched caches */
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
437 
438 /*
439  * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
440  */
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region
 * variant: objects come straight from a zalloc zone rather than a
 * skmem segment, so there are no slabs, bufctls, or mirrored objects.
 * Returns 0 and fills 'oi' on success; ENOMEM on zone exhaustion
 * (only possible for SKMEM_NOSLEEP callers, since Z_WAITOK blocks).
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* the aligned object must still fit within the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	/* report the usable size net of the alignment slop */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* no slave object in the pseudo variant */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
532 
533 /*
534  * Allocate a raw object from the slab layer.
535  */
536 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)537 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
538     struct skmem_obj_info *oim, uint32_t skmflag)
539 {
540 	int err;
541 
542 	SKM_SLAB_LOCK(skm);
543 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
544 	SKM_SLAB_UNLOCK(skm);
545 
546 	return err;
547 }
548 
549 /*
550  * Allocate raw object(s) from the slab layer.
551  */
552 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)553 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
554     uint32_t num, uint32_t skmflag)
555 {
556 	uint32_t need = num;
557 
558 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
559 	*list = NULL;
560 
561 	SKM_SLAB_LOCK(skm);
562 	for (;;) {
563 		struct skmem_obj_info oi, oim;
564 
565 		/*
566 		 * Get a single raw object from the slab layer.
567 		 */
568 		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
569 			break;
570 		}
571 
572 		*list = SKMEM_OBJ_ADDR(&oi);
573 		ASSERT((*list)->mo_next == NULL);
574 		/* store these inside the object itself */
575 		(*list)->mo_info = oi;
576 		(*list)->mo_minfo = oim;
577 		list = &(*list)->mo_next;
578 
579 		ASSERT(need != 0);
580 		if (--need == 0) {
581 			break;
582 		}
583 	}
584 	SKM_SLAB_UNLOCK(skm);
585 
586 	return num - need;
587 }
588 
589 /*
590  * Free a raw object to the (locked) slab layer.  Normal region variant.
591  */
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 * Looks the object up in the allocated-address hash, returns its bufctl
 * to the owning slab's freelist, and migrates the slab between the
 * empty/partial lists (or destroys it outright when the last object
 * comes back).  Panics on a bogus or already-freed address.  Called
 * and returns with the slab lock held; the lock is dropped around
 * skmem_slab_destroy().
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_map_address_t tagged_addr;
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

	/* compare canonical (tag-stripped) addresses to tolerate memtag */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (SKMEM_COMPARE_CANONICAL_ADDR(bc->bc_addr, buf, skm->skm_objsize)) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, SKMEM_MEMTAG_STRIP_TAG(buf, skm->skm_objsize)));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	/* retag the freed object so stale (dangling) pointers fault */
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		tagged_addr = (vm_map_address_t)vm_memtag_generate_and_store_tag(buf, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			/* single-chunk slabs live on the empty list */
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock across the (potentially slow) teardown */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
706 
707 /*
708  * Free a raw object to the (locked) slab layer.  Pseudo region variant.
709  */
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant:
 * the object goes straight back to the backing zalloc zone.  Under
 * KASAN the original (unaligned) zone element address was stashed one
 * pointer-size below the aligned buffer at allocation time; recover it
 * before freeing.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* sanity: the aligned object must lie within the zone element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
744 
745 /*
746  * Free a raw object to the slab layer.
747  */
748 void
skmem_slab_free(struct skmem_cache * skm,void * buf)749 skmem_slab_free(struct skmem_cache *skm, void *buf)
750 {
751 	if (skm->skm_mode & SKM_MODE_BATCH) {
752 		((struct skmem_obj *)buf)->mo_next = NULL;
753 	}
754 
755 	SKM_SLAB_LOCK(skm);
756 	skm->skm_slab_free(skm, buf);
757 	SKM_SLAB_UNLOCK(skm);
758 }
759 
760 /*
761  * Free raw object(s) to the slab layer.
762  */
763 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)764 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
765 {
766 	struct skmem_obj *listn;
767 
768 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
769 
770 	SKM_SLAB_LOCK(skm);
771 	for (;;) {
772 		listn = list->mo_next;
773 		list->mo_next = NULL;
774 
775 		/*
776 		 * Free a single object to the slab layer.
777 		 */
778 		skm->skm_slab_free(skm, (void *)list);
779 
780 		/* if no more objects to free, we're done */
781 		if ((list = listn) == NULL) {
782 			break;
783 		}
784 	}
785 	SKM_SLAB_UNLOCK(skm);
786 }
787 
788 
789 /*
790  * Given a buffer control, record the current transaction.
791  */
792 __attribute__((noinline, cold, not_tail_called))
793 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)794 skmem_audit_bufctl(struct skmem_bufctl *bc)
795 {
796 	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
797 	struct timeval tv;
798 
799 	microuptime(&tv);
800 	bca->bc_thread = current_thread();
801 	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
802 	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
803 }
804