1 /*
2 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h> /* for OSBacktrace */
33 #include <kern/sched_prim.h> /* for assert_wait */
34 #include <vm/vm_memtag.h>
35
36 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
37 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
38
39 /*
40 * Too big a value will cause overflow and thus trip the assertion; the
41 * idea here is to set an upper limit for the time that a particular
42 * thread is allowed to perform retries before we give up and panic.
43 */
#define SKMEM_SLAB_MAX_BACKOFF (20 * USEC_PER_SEC) /* 20 sec, expressed in usec */
45
46 /*
47 * Threshold (in msec) after which we reset the exponential backoff value
48 * back to its (random) initial value. Note that we allow the actual delay
49 * to be at most twice this value.
50 */
51 #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */
52
53 /*
54 * To reduce the likelihood of global synchronization between threads,
55 * we use some random value to start the exponential backoff.
56 */
57 #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */
58
59 /*
60 * Create a slab.
61 */
62 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)63 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
64 {
65 struct skmem_region *skr = skm->skm_region;
66 uint32_t objsize, chunks;
67 size_t slabsize = skm->skm_slabsize;
68 struct skmem_slab *__single sl;
69 struct sksegment *__single sg, *__single sgm;
70 char *buf, *__indexable slab;
71 char *__indexable bufm;
72 uint32_t slabm_size;
73 void *__sized_by(slabm_size) slabm;
74
75 /*
76 * Allocate a segment (a slab at our layer) from the region.
77 */
78 slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
79 skr->skr_params.srp_c_seg_size, &slabm_size);
80 if (slab == NULL) {
81 goto rg_alloc_failure;
82 }
83
84 if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
85 goto slab_alloc_failure;
86 }
87
88 ASSERT(sg != NULL);
89 ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
90
91 bzero(sl, sizeof(*sl));
92 sl->sl_cache = skm;
93 sl->sl_base = buf = slab;
94 bufm = slabm;
95 objsize = (uint32_t)skr->skr_c_obj_size;
96 sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
97 ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
98 ASSERT(skm->skm_objsize == objsize);
99 ASSERT((slabsize / objsize) <= UINT32_MAX);
100 sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
101 sl->sl_seg = sg;
102 sl->sl_segm = sgm;
103
104 /*
105 * Create one or more buffer control structures for the slab,
106 * each one tracking a chunk of raw object from the segment,
107 * and insert these into the slab's list of buffer controls.
108 */
109 ASSERT(chunks > 0);
110 while (chunks != 0) {
111 struct skmem_bufctl *__indexable bc;
112 bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
113 if (bc == NULL) {
114 goto bufctl_alloc_failure;
115 }
116
117 bzero(bc, bc_size);
118 bc->bc_lim = objsize;
119 bc->bc_addr = buf;
120 bc->bc_addrm = bufm;
121 bc->bc_slab = sl;
122 bc->bc_idx = (sl->sl_chunks - chunks);
123 if (skr->skr_mode & SKR_MODE_SHAREOK) {
124 bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
125 }
126 SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
127 buf += objsize;
128 if (bufm != NULL) {
129 /* XXX -fbounds-safety */
130 bufm = (char *)bufm + objsize;
131 }
132 --chunks;
133 }
134
135 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
136 SK_KVA(skm), SK_KVA(sl));
137 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
138 SK_KVA(slab), SK_KVA(slab + objsize));
139
140 return sl;
141
142 bufctl_alloc_failure:
143 skmem_slab_destroy(skm, sl);
144
145 slab_alloc_failure:
146 skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
147 slabm, skr->skr_c_obj_size));
148
149 rg_alloc_failure:
150 os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
151
152 return NULL;
153 }
154
155 /*
156 * Destroy a slab.
157 */
158 static void
skmem_slab_destroy(struct skmem_cache * skm,struct skmem_slab * sl)159 skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
160 {
161 struct skmem_bufctl *bc, *tbc;
162 void *__single slab = sl->sl_base;
163 void *__single slabm = sl->sl_basem;
164
165 ASSERT(sl->sl_refcnt == 0);
166
167 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
168 SK_KVA(skm), SK_KVA(sl));
169 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
170 SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
171
172 /*
173 * Go through the slab's list of buffer controls and free
174 * them, and then free the slab itself back to its cache.
175 */
176 SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
177 SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
178 skmem_cache_free(skmem_bufctl_cache, bc);
179 }
180 skmem_cache_free(skmem_slab_cache, sl);
181
182 /* and finally free the segment back to the backing region */
183 skmem_region_free(skm->skm_region, slab, slabm);
184 }
185
/*
 * Allocate a raw object from the (locked) slab layer. Normal region variant.
 *
 * Called with the slab lock held; the lock may be dropped and re-acquired
 * while creating a new slab or backing off on a transient failure.
 *
 * oi  - filled in with the master object's address, size, bufctl and
 *       region/segment indices.
 * oim - if non-NULL, filled in with the slave (mirrored) object's info,
 *       or zeroed when there is no mirrored address.
 * Returns 0 on success, ENOMEM if no slab could be obtained.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0; /* in usec */
	uint64_t boff = 0; /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_offset_t tagged_address; /* address tagging */
	struct skmem_region *region; /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.) If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		/* drop the lock across slab creation / backoff below */
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold. Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * add this value (in msec) to the total (in usec);
			 * NOTE(review): NSEC_PER_USEC (1000) is used here as
			 * the msec-to-usec multiplier — numerically correct,
			 * though the constant name is misleading.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit. The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* allow slab creation to fail so we can back off */
			flags |= SKMEM_FAILOK;
		} else {
			/* sleeping callers panic once the budget is spent */
			if (!(flags & SKMEM_NOSLEEP)) {
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* delay() takes usec; boff is in msec */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* account for the new slab's objects; track the high-water mark */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * If this region is configured to be tagged, we generate a
		 * unique tag for the object address, and return this tagged
		 * address to the caller. vm_memtag_assign_tag generates a
		 * unique tag for the given address and size, and
		 * vm_memtag_set_tag commits the tag to the backing memory
		 * metadata. This tagged address is returned back to the client,
		 * and when the client frees the address, we "re-tag" the
		 * address to prevent against use-after-free attacks (more on
		 * this in skmem_cache_batch_free).
		 */
		tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_address, skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have ot use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_addr (they're both
		 * set to skr->skr_c_obj_size)
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		/* untagged region: hand out the raw chunk address */
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index: segment index scaled by chunks per slab */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			/* mirrored object shares size and indices with master */
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* batched caches chain objects via mo_next; start unlinked */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list. If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
440
/*
 * Allocate a raw object from the (locked) slab layer. Pseudo region variant.
 *
 * Pseudo regions are backed directly by a zalloc zone (skr_zreg) rather
 * than by skmem segments, so there is no slab/bufctl bookkeeping here.
 *
 * oi  - filled in with the object's address and effective size.
 * oim - if non-NULL, zeroed (mirroring is not applicable here).
 * Returns 0 on success, ENOMEM if the zone allocation fails.
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* pseudo regions are zone-backed, never segment-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	/* Z_ZERO: the zone hands back already-zeroed memory */
	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary. The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* the aligned object must still fit within the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	/* report the usable size, excluding the alignment slack above */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* no slave object for pseudo regions; hand back zeroed info */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	/* update allocation stats and the in-use high-water mark */
	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
535
536 /*
537 * Allocate a raw object from the slab layer.
538 */
539 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)540 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
541 struct skmem_obj_info *oim, uint32_t skmflag)
542 {
543 int err;
544
545 SKM_SLAB_LOCK(skm);
546 err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
547 SKM_SLAB_UNLOCK(skm);
548
549 return err;
550 }
551
552 /*
553 * Allocate raw object(s) from the slab layer.
554 */
555 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)556 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
557 uint32_t num, uint32_t skmflag)
558 {
559 uint32_t need = num;
560
561 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
562 *list = NULL;
563
564 SKM_SLAB_LOCK(skm);
565 for (;;) {
566 struct skmem_obj_info oi, oim;
567
568 /*
569 * Get a single raw object from the slab layer.
570 */
571 if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
572 break;
573 }
574
575 *list = SKMEM_OBJ_ADDR(&oi);
576 ASSERT((*list)->mo_next == NULL);
577 /* store these inside the object itself */
578 (*list)->mo_info = oi;
579 (*list)->mo_minfo = oim;
580 list = &(*list)->mo_next;
581
582 ASSERT(need != 0);
583 if (--need == 0) {
584 break;
585 }
586 }
587 SKM_SLAB_UNLOCK(skm);
588
589 return num - need;
590 }
591
/*
 * Free a raw object to the (locked) slab layer. Normal region variant.
 *
 * Looks up the object's buffer control in the allocated-address hash,
 * returns it to its slab's freelist, and destroys the slab (releasing
 * its segment) once the last outstanding object comes back.  Called
 * with the slab lock held; the lock is briefly dropped while a fully
 * free slab is destroyed.  Panics if 'buf' does not map to a live
 * allocation in this cache.
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_offset_t tagged_addr;
	/*
	 * If buf is tagged, then addr would have the canonicalized address.
	 * If buf is untagged, then addr is same as buf.
	 */
	void *addr = __unsafe_forge_bidi_indexable(void *,
	    vm_memtag_canonicalize_address((vm_offset_t)buf), skm->skm_objsize);
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address. If found, remove the buffer control from
	 * the hash chain and insert it into the freelist. Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, then buf is a
	 * tagged address. When we search for the buffer control from the hash
	 * table, we need to use the untagged address, because buffer control
	 * maintains untagged address (bc_addr). vm_memtag_canonicalize_address
	 * returns the untagged address.
	 */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == addr) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#else /* !CONFIG_KERNEL_TAGGING */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}
#endif /* CONFIG_KERNEL_TAGGING */

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);

#if CONFIG_KERNEL_TAGGING
	/*
	 * We use untagged address here, because SKMEM_SLAB_MEMBER compares the
	 * address against sl_base, which is untagged.
	 */
	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
#else /* !CONFIG_KERNEL_TAGGING */
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
#endif /* CONFIG_KERNEL_TAGGING */

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		/* forge a bounded view of buf for -fbounds-safety's sake */
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	/*
	 * If this region is configured to tag memory addresses, we re-tag this
	 * address as the object is freed. We do the re-tagging in the magazine
	 * layer too, but in case we need to free raw objects to the slab layer
	 * (either because SKM_MODE_NOMAGAZINES is set, or the magazine layer
	 * was not able to allocate empty magazines), we re-tag the addresses
	 * here in the slab layer. Freeing to the slab layer is symmetrical to
	 * allocating from the slab layer - when we allocate from slab layer, we
	 * tag the address, and then construct the object; when we free to the
	 * slab layer, we destruct the object, and retag the address.
	 * We do the re-tagging here, because this is right after the last usage
	 * of the buf variable (which is tagged).
	 */
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
		    skm->skm_objsize);
		vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		/* single-chunk slabs live on the empty list once allocated */
		if (sl->sl_chunks == 1) {
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock: slab destruction frees back to the region */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
756
/*
 * Free a raw object to the (locked) slab layer. Pseudo region variant.
 *
 * Pseudo regions are zone-backed, so this simply returns the element to
 * its zalloc zone (recovering the original element address first in
 * KASAN builds) and updates the cache's free/in-use counters.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	/* pseudo regions are zone-backed, never segment-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* sanity: the aligned object must lie within the stashed element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	/* update free stats; in-use count must have been non-zero */
	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
794
795 /*
796 * Free a raw object to the slab layer.
797 */
798 void
skmem_slab_free(struct skmem_cache * skm,void * buf)799 skmem_slab_free(struct skmem_cache *skm, void *buf)
800 {
801 if (skm->skm_mode & SKM_MODE_BATCH) {
802 ((struct skmem_obj *)buf)->mo_next = NULL;
803 }
804
805 SKM_SLAB_LOCK(skm);
806 skm->skm_slab_free(skm, buf);
807 SKM_SLAB_UNLOCK(skm);
808 }
809
810 /*
811 * Free raw object(s) to the slab layer.
812 */
813 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)814 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
815 {
816 struct skmem_obj *listn;
817
818 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
819
820 SKM_SLAB_LOCK(skm);
821 for (;;) {
822 listn = list->mo_next;
823 list->mo_next = NULL;
824
825 /*
826 * Free a single object to the slab layer.
827 */
828 skm->skm_slab_free(skm, (void *)list);
829
830 /* if no more objects to free, we're done */
831 if ((list = listn) == NULL) {
832 break;
833 }
834 }
835 SKM_SLAB_UNLOCK(skm);
836 }
837
838
839 /*
840 * Given a buffer control, record the current transaction.
841 */
842 __attribute__((noinline, cold, not_tail_called))
843 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)844 skmem_audit_bufctl(struct skmem_bufctl *bc)
845 {
846 struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
847 struct timeval tv;
848
849 microuptime(&tv);
850 bca->bc_thread = current_thread();
851 bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
852 bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
853 }
854