xref: /xnu-11215.81.4/bsd/skywalk/mem/skmem_slab.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <vm/vm_memtag.h>
35 
36 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
37 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
38 
39 /*
40  * Too big a value will cause overflow and thus trip the assertion; the
41  * idea here is to set an upper limit for the time that a particular
42  * thread is allowed to perform retries before we give up and panic.
43  */
44 #define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* seconds */
45 
46 /*
47  * Threshold (in msec) after which we reset the exponential backoff value
48  * back to its (random) initial value.  Note that we allow the actual delay
49  * to be at most twice this value.
50  */
51 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
52 
53 /*
54  * To reduce the likelihood of global synchronization between threads,
55  * we use some random value to start the exponential backoff.
56  */
57 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
58 
/*
 * Create a slab.
 *
 * Allocates one segment from the cache's backing region and carves it
 * into sl_chunks raw objects of skm_objsize bytes each, attaching one
 * skmem_bufctl to every chunk.  On any allocation failure the partial
 * construction is unwound (in reverse order), the cache's
 * skm_sl_alloc_fail counter is bumped, and NULL is returned.
 */
static struct skmem_slab *
skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_region *skr = skm->skm_region;
	uint32_t objsize, chunks;
	size_t slabsize = skm->skm_slabsize;
	struct skmem_slab *__single sl;
	struct sksegment *__single sg, *__single sgm;
	char *buf, *__indexable slab;
	char *__indexable bufm;		/* cursor into the mirrored (slave) buffer */
	uint32_t slabm_size;
	void *__sized_by(slabm_size) slabm;

	/*
	 * Allocate a segment (a slab at our layer) from the region.
	 */
	slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
	    skr->skr_params.srp_c_seg_size, &slabm_size);
	if (slab == NULL) {
		goto rg_alloc_failure;
	}

	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
		goto slab_alloc_failure;
	}

	/* a mirrored segment, when present, shares the master's index */
	ASSERT(sg != NULL);
	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);

	bzero(sl, sizeof(*sl));
	sl->sl_cache = skm;
	sl->sl_base = buf = slab;
	bufm = slabm;
	objsize = (uint32_t)skr->skr_c_obj_size;
	sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
	ASSERT(skm->skm_objsize == objsize);
	ASSERT((slabsize / objsize) <= UINT32_MAX);
	/* number of whole objects that fit in the segment */
	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
	sl->sl_seg = sg;
	sl->sl_segm = sgm;

	/*
	 * Create one or more buffer control structures for the slab,
	 * each one tracking a chunk of raw object from the segment,
	 * and insert these into the slab's list of buffer controls.
	 */
	ASSERT(chunks > 0);
	while (chunks != 0) {
		struct skmem_bufctl *__indexable bc;
		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
		if (bc == NULL) {
			goto bufctl_alloc_failure;
		}

		/*
		 * NOTE(review): bc_size is defined outside this view —
		 * presumably the effective bufctl size (audit-aware);
		 * confirm against the file's globals.
		 */
		bzero(bc, bc_size);
		bc->bc_lim = objsize;
		bc->bc_addr = buf;
		bc->bc_addrm = bufm;
		bc->bc_slab = sl;
		bc->bc_idx = (sl->sl_chunks - chunks);	/* 0-based chunk index */
		if (skr->skr_mode & SKR_MODE_SHAREOK) {
			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
		}
		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
		buf += objsize;
		if (bufm != NULL) {
			/* XXX -fbounds-safety */
			bufm = (char *)bufm + objsize;
		}
		--chunks;
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA(slab + objsize));

	return sl;

	/* unwind in reverse order of construction */
bufctl_alloc_failure:
	skmem_slab_destroy(skm, sl);

slab_alloc_failure:
	skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
	    slabm, skr->skr_c_obj_size));

rg_alloc_failure:
	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);

	return NULL;
}
154 
/*
 * Destroy a slab.
 *
 * Frees every buffer control on the slab's freelist, the slab header
 * itself, and finally returns the backing segment to its region.
 * The slab must have no outstanding objects (sl_refcnt == 0).
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	void *__single slab = sl->sl_base;
	void *__single slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
185 
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Called with the slab lock held; the lock may be dropped and
 * re-acquired while a new slab is created.  On success, fills in 'oi'
 * for the master object (and 'oim' for the mirrored/slave object when
 * the caller supplies one) and returns 0.  ENOMEM is returned only in
 * non-sleeping mode; in sleeping mode the routine retries with
 * exponential backoff and panics if it cannot make progress within
 * SKMEM_SLAB_MAX_BACKOFF.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;                /* in usec */
	uint64_t boff = 0;                      /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_offset_t tagged_address;             /* address tagging */
	struct skmem_region *region;            /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec);
			 * NSEC_PER_USEC (1000) doubles as the msec-to-usec
			 * conversion factor here.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				/* sleeping mode ran out of backoff budget */
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track the high-watermark of raw objects in use */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * If this region is configured to be tagged, we generate a
		 * unique tag for the object address, and return this tagged
		 * address to the caller. vm_memtag_assign_tag generates a
		 * unique tag for the given address and size, and
		 * vm_memtag_set_tag commits the tag to the backing memory
		 * metadata. This tagged address is returned back to the client,
		 * and when the client frees the address, we "re-tag" the
		 * address to prevent against use-after-free attacks (more on
		 * this in skmem_cache_batch_free).
		 */
		tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_address, skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have to use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_lim (they're both
		 * set to skr->skr_c_obj_size)
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index: slabs are uniform, so scale by chunks */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* in batch mode the object doubles as a list node; clear its link */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
440 
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * Objects come straight from the region's backing zone (skr_zreg)
 * instead of a slab/segment; mirrored regions and batching are not
 * supported here.  Returns 0 on success with 'oi' filled in, or
 * ENOMEM if the zone allocation fails.
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* the aligned object must still fit inside the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* no slave object in the pseudo variant */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
535 
536 /*
537  * Allocate a raw object from the slab layer.
538  */
539 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)540 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
541     struct skmem_obj_info *oim, uint32_t skmflag)
542 {
543 	int err;
544 
545 	SKM_SLAB_LOCK(skm);
546 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
547 	SKM_SLAB_UNLOCK(skm);
548 
549 	return err;
550 }
551 
552 /*
553  * Allocate raw object(s) from the slab layer.
554  */
555 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)556 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
557     uint32_t num, uint32_t skmflag)
558 {
559 	uint32_t need = num;
560 
561 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
562 	*list = NULL;
563 
564 	SKM_SLAB_LOCK(skm);
565 	for (;;) {
566 		struct skmem_obj_info oi, oim;
567 
568 		/*
569 		 * Get a single raw object from the slab layer.
570 		 */
571 		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
572 			break;
573 		}
574 
575 		*list = SKMEM_OBJ_ADDR(&oi);
576 		ASSERT((*list)->mo_next == NULL);
577 		/* store these inside the object itself */
578 		(*list)->mo_info = oi;
579 		(*list)->mo_minfo = oim;
580 		list = &(*list)->mo_next;
581 
582 		ASSERT(need != 0);
583 		if (--need == 0) {
584 			break;
585 		}
586 	}
587 	SKM_SLAB_UNLOCK(skm);
588 
589 	return num - need;
590 }
591 
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Looks the object up in the allocated-address hash, returns its
 * buffer control to the owning slab's freelist, and — when the slab's
 * last outstanding object comes back — destroys the slab (dropping
 * the slab lock around the destroy).  Panics on a bogus or
 * already-freed address.
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_offset_t tagged_addr;
	/*
	 * If buf is tagged, then addr would have the canonicalized address.
	 * If buf is untagged, then addr is same as buf.
	 */
	void *addr = __unsafe_forge_bidi_indexable(void *,
	    vm_memtag_canonicalize_address((vm_offset_t)buf), skm->skm_objsize);
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, then buf is a
	 * tagged address. When we search for the buffer control from the hash
	 * table, we need to use the untagged address, because buffer control
	 * maintains untagged address (bc_addr). vm_memtag_canonicalize_address
	 * returns the untagged address.
	 */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == addr) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#else /* !CONFIG_KERNEL_TAGGING */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* bc is NULL here only if the hash-chain walk found no match */
	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);

#if CONFIG_KERNEL_TAGGING
	/*
	 * We use untagged address here, because SKMEM_SLAB_MEMBER compares the
	 * address against sl_base, which is untagged.
	 */
	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
#else /* !CONFIG_KERNEL_TAGGING */
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
#endif /* CONFIG_KERNEL_TAGGING */

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		/* XXX -fbounds-safety: forge a bounded view over 'buf' */
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, we re-tag this
	 * address as the object is freed. We do the re-tagging in the magazine
	 * layer too, but in case we need to free raw objects to the slab layer
	 * (either because SKM_MODE_NOMAGAZINES is set, or the magazine layer
	 * was not able to allocate empty magazines), we re-tag the addresses
	 * here in the slab layer. Freeing to the slab layer is symmetrical to
	 * allocating from the slab layer - when we allocate from slab layer, we
	 * tag the address, and then construct the object; when we free to the
	 * slab layer, we destruct the object, and retag the address.
	 * We do the re-tagging here, because this is right after the last usage
	 * of the buf variable (which is tagged).
	 */
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 * The slab lock is dropped around skmem_slab_destroy()
		 * and re-acquired before returning.
		 */
		if (sl->sl_chunks == 1) {
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
756 
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * Returns the object directly to the region's backing zone.  In KASAN
 * builds, recovers the original (unaligned) zone element address that
 * skmem_slab_alloc_pseudo_locked() stashed just before the aligned
 * buffer, and frees that instead.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* the aligned buffer must lie within the recovered element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
794 
795 /*
796  * Free a raw object to the slab layer.
797  */
798 void
skmem_slab_free(struct skmem_cache * skm,void * buf)799 skmem_slab_free(struct skmem_cache *skm, void *buf)
800 {
801 	if (skm->skm_mode & SKM_MODE_BATCH) {
802 		((struct skmem_obj *)buf)->mo_next = NULL;
803 	}
804 
805 	SKM_SLAB_LOCK(skm);
806 	skm->skm_slab_free(skm, buf);
807 	SKM_SLAB_UNLOCK(skm);
808 }
809 
810 /*
811  * Free raw object(s) to the slab layer.
812  */
813 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)814 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
815 {
816 	struct skmem_obj *listn;
817 
818 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
819 
820 	SKM_SLAB_LOCK(skm);
821 	for (;;) {
822 		listn = list->mo_next;
823 		list->mo_next = NULL;
824 
825 		/*
826 		 * Free a single object to the slab layer.
827 		 */
828 		skm->skm_slab_free(skm, (void *)list);
829 
830 		/* if no more objects to free, we're done */
831 		if ((list = listn) == NULL) {
832 			break;
833 		}
834 	}
835 	SKM_SLAB_UNLOCK(skm);
836 }
837 
838 
839 /*
840  * Given a buffer control, record the current transaction.
841  */
842 __attribute__((noinline, cold, not_tail_called))
843 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)844 skmem_audit_bufctl(struct skmem_bufctl *bc)
845 {
846 	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
847 	struct timeval tv;
848 
849 	microuptime(&tv);
850 	bca->bc_thread = current_thread();
851 	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
852 	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
853 }
854