1 /*
2 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h> /* for OSBacktrace */
33 #include <kern/sched_prim.h> /* for assert_wait */
34 #include <vm/vm_memtag.h>
35
36 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
37 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
38
39 /*
40 * Too big a value will cause overflow and thus trip the assertion; the
41 * idea here is to set an upper limit for the time that a particular
42 * thread is allowed to perform retries before we give up and panic.
43 */
44 #define SKMEM_SLAB_MAX_BACKOFF (20 * USEC_PER_SEC) /* seconds */
45
46 /*
47 * Threshold (in msec) after which we reset the exponential backoff value
48 * back to its (random) initial value. Note that we allow the actual delay
49 * to be at most twice this value.
50 */
51 #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */
52
53 /*
54 * To reduce the likelihood of global synchronization between threads,
55 * we use some random value to start the exponential backoff.
56 */
57 #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */
58
59 /*
60 * Create a slab.
61 */
62 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)63 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
64 {
65 struct skmem_region *skr = skm->skm_region;
66 uint32_t objsize, chunks;
67 size_t slabsize = skm->skm_slabsize;
68 struct skmem_slab *__single sl;
69 struct sksegment *__single sg, *__single sgm;
70 char *buf, *__indexable slab;
71 char *__indexable bufm;
72 uint32_t slabm_size;
73 void *__sized_by(slabm_size) slabm;
74
75 /*
76 * Allocate a segment (a slab at our layer) from the region.
77 */
78 slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
79 skr->skr_params.srp_c_seg_size, &slabm_size);
80 if (slab == NULL) {
81 goto rg_alloc_failure;
82 }
83
84 if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
85 goto slab_alloc_failure;
86 }
87
88 ASSERT(sg != NULL);
89 ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
90
91 bzero(sl, sizeof(*sl));
92 sl->sl_cache = skm;
93 sl->sl_base = buf = slab;
94 bufm = slabm;
95 objsize = (uint32_t)skr->skr_c_obj_size;
96 sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
97 ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
98 ASSERT(skm->skm_objsize == objsize);
99 ASSERT((slabsize / objsize) <= UINT32_MAX);
100 sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
101 sl->sl_seg = sg;
102 sl->sl_segm = sgm;
103
104 /*
105 * Create one or more buffer control structures for the slab,
106 * each one tracking a chunk of raw object from the segment,
107 * and insert these into the slab's list of buffer controls.
108 */
109 ASSERT(chunks > 0);
110 while (chunks != 0) {
111 struct skmem_bufctl *__indexable bc;
112 bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
113 if (bc == NULL) {
114 goto bufctl_alloc_failure;
115 }
116
117 bzero(bc, bc_size);
118 bc->bc_lim = objsize;
119 bc->bc_addr = buf;
120 bc->bc_addrm = bufm;
121 bc->bc_slab = sl;
122 bc->bc_idx = (sl->sl_chunks - chunks);
123 if (skr->skr_mode & SKR_MODE_SHAREOK) {
124 bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
125 }
126 SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
127 buf += objsize;
128 if (bufm != NULL) {
129 /* XXX -fbounds-safety */
130 bufm = (char *)bufm + objsize;
131 }
132 --chunks;
133 }
134
135 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
136 SK_KVA(skm), SK_KVA(sl));
137 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
138 SK_KVA(slab), SK_KVA(slab + objsize));
139
140 return sl;
141
142 bufctl_alloc_failure:
143 skmem_slab_destroy(skm, sl);
144
145 slab_alloc_failure:
146 skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
147 slabm, skr->skr_c_obj_size));
148
149 rg_alloc_failure:
150 os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
151
152 return NULL;
153 }
154
/*
 * Destroy a slab.
 *
 * Tears down a slab whose objects are all free: releases every buffer
 * control on the slab's freelist, frees the slab structure itself, and
 * finally returns the backing segment to the region.  Callers drop the
 * slab-layer lock around this call (see skmem_slab_free_locked()).
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	/* capture base addresses now; 'sl' itself is freed below */
	void *__single slab = sl->sl_base;
	void *__single slabm = sl->sl_basem;

	/* no outstanding objects may remain on this slab */
	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/*
	 * Restore original tag before freeing back to system.  sl->sl_base
	 * should have the original tag.
	 */
	if (skm->skm_region->skr_bufspec.memtag) {
		vm_memtag_store_tag(slab, skm->skm_slabsize);
	}

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
193
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Returns 0 with the master object's info filled into *oi (and, when
 * oim != NULL and the region is mirrored, the slave object's info into
 * *oim), or ENOMEM if no slab could be obtained in non-sleeping mode.
 * Entered and exited with the slab-layer lock held; the lock is dropped
 * and reacquired internally while creating a new slab or delaying
 * between retries.  In sleeping mode this routine does not return
 * failure: it retries with randomized exponential backoff and panics
 * once the cumulative backoff exceeds SKMEM_SLAB_MAX_BACKOFF.
 */
int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;        /* in usec */
	uint64_t boff = 0;              /* in msec */
	boolean_t new_slab;
	size_t bufsize;
	void *__sized_by(bufsize) buf;
#if CONFIG_KERNEL_TAGGING
	vm_map_address_t tagged_address; /* address tagging */
	struct skmem_region *region;     /* region source for this slab */
#endif /* CONFIG_KERNEL_TAGGING */

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec).
			 * NOTE(review): NSEC_PER_USEC (1000) is used here as
			 * the msec-to-usec conversion factor; the value is
			 * correct but the constant's name is misleading.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* allow skmem_slab_create() to fail so we can retry */
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/*
				 * NOTE(review): delay() takes usec; boff is
				 * in msec, so NSEC_PER_USEC (1000) again
				 * serves as the msec-to-usec multiplier.
				 */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track high-water mark of slab-backed objects */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check: the object must not be shared/in use by anyone */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
#if CONFIG_KERNEL_TAGGING
	region = sl->sl_cache->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/* generate a fresh tag for the object and use the tagged address */
		tagged_address = (vm_map_address_t)vm_memtag_generate_and_store_tag(bc->bc_addr,
		    skm->skm_objsize);
		/*
		 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
		 * which is unsafe, so we have to use __unsafe_forge here.
		 * Also, skm->skm_objsize is equal to bc->bc_lim (they're both
		 * set to skr->skr_c_obj_size).
		 */
		bufsize = skm->skm_objsize;
		/*
		 * XXX -fbounds-safety: Couldn't pass bufsize here, because
		 * compiler gives an error: cannot reference 'bufsize' after it
		 * is changed during consecutive assignments
		 */
		buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
		    skm->skm_objsize);
	} else {
		bufsize = bc->bc_lim;
		buf = bc->bc_addr;
	}
#else /* !CONFIG_KERNEL_TAGGING */
	bufsize = bc->bc_lim;
	buf = bc->bc_addr;
#endif /* CONFIG_KERNEL_TAGGING */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
	SKMEM_OBJ_ADDR(oi) = buf;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	/* region-wide object index (segments hold sl_chunks objects each) */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	if (skm->skm_mode & SKM_MODE_BATCH) {
		/* batching callers expect a clean linkage field */
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			/* a new slab can only go straight to empty if single-chunk */
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
436
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * Objects come straight from the region's backing zone (skr_zreg) rather
 * than from a region segment.  Returns 0 with the object's info in *oi
 * (oim, if supplied, is zeroed: pseudo regions have no slave objects),
 * or ENOMEM if the zone allocation fails.  Called with the slab-layer
 * lock held.
 */
int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* pseudo regions are zone-backed, never VM-region-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
	if (obj == NULL) {
		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' is the aligned address for this object.
	 */
	uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign) - (uintptr_t)obj;
	buf = (void *)((char *)obj + diff);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	/*
	 * XXX -fbounds-safety: Since this function is for generic alloc, we
	 * cannot modify the struct like we did for struct skmem_cache.
	 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
	 * choice.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    (intptr_t)buf - sizeof(void *), sizeof(void *));
	*pbuf = obj;

	/* aligned buffer plus payload must fit within the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
#if KASAN
	/* report the usable size, net of the alignment slack reserved above */
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
	    (uint32_t)skm->skm_bufalign;
#else
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
#endif
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
531
532 /*
533 * Allocate a raw object from the slab layer.
534 */
535 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)536 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
537 struct skmem_obj_info *oim, uint32_t skmflag)
538 {
539 int err;
540
541 SKM_SLAB_LOCK(skm);
542 err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
543 SKM_SLAB_UNLOCK(skm);
544
545 return err;
546 }
547
548 /*
549 * Allocate raw object(s) from the slab layer.
550 */
551 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)552 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
553 uint32_t num, uint32_t skmflag)
554 {
555 uint32_t need = num;
556
557 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
558 *list = NULL;
559
560 SKM_SLAB_LOCK(skm);
561 for (;;) {
562 struct skmem_obj_info oi, oim;
563
564 /*
565 * Get a single raw object from the slab layer.
566 */
567 if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
568 break;
569 }
570
571 *list = SKMEM_OBJ_ADDR(&oi);
572 ASSERT((*list)->mo_next == NULL);
573 /* store these inside the object itself */
574 (*list)->mo_info = oi;
575 (*list)->mo_minfo = oim;
576 list = &(*list)->mo_next;
577
578 ASSERT(need != 0);
579 if (--need == 0) {
580 break;
581 }
582 }
583 SKM_SLAB_UNLOCK(skm);
584
585 return num - need;
586 }
587
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Looks up the object's buffer control on the allocated-address hash
 * chain, returns it to its slab's freelist, and destroys the slab once
 * its last outstanding object comes back.  Panics if 'buf' is not a
 * currently-allocated object of this cache.  Called with the slab-layer
 * lock held; the lock is dropped and reacquired around slab destruction.
 */
void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;
#if CONFIG_KERNEL_TAGGING
	struct skmem_region *region;
	vm_map_address_t tagged_addr;
#endif /* CONFIG_KERNEL_TAGGING */

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);

	/* compare canonical (tag-stripped) addresses; buf may carry a tag */
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (SKMEM_COMPARE_CANONICAL_ADDR(bc->bc_addr, buf, skm->skm_objsize)) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, SKMEM_MEMTAG_STRIP_TAG(buf, skm->skm_objsize)));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		size_t size = skm->skm_objsize;
		/* XXX -fbounds-safety: forge a bounded view of buf for bzero */
		void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
		bzero(buf_cpy, size);
		buf_cpy = NULL;
		size = 0;
	}

#if CONFIG_KERNEL_TAGGING
	region = skm->skm_region;
	if (region->skr_mode & SKR_MODE_MEMTAG) {
		/*
		 * Retag the object on free.  The returned tagged address is
		 * intentionally unused; the side effect of storing the new
		 * tag (invalidating stale references) is what matters.
		 */
		tagged_addr = (vm_map_address_t)vm_memtag_generate_and_store_tag(buf, skm->skm_objsize);
	}
#endif /* CONFIG_KERNEL_TAGGING */

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			/* single-object slab: it lived on the empty list */
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the slab-layer lock across destruction */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list.  This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
705
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * Returns the object straight to the backing zone (skr_zreg).  In KASAN
 * builds the actual zone element address is recovered from the pointer
 * stash placed immediately before the aligned buffer by
 * skmem_slab_alloc_pseudo_locked().  Called with the slab-layer lock held.
 */
void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *__single obj = buf;

	/* pseudo regions are zone-backed, never VM-region-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* callers were handed out an aligned buffer address */
	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = __unsafe_forge_bidi_indexable(void **,
	    ((intptr_t)obj - sizeof(void *)), sizeof(void *));

	/* sanity: the buffer must lie entirely within the zone element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
743
744 /*
745 * Free a raw object to the slab layer.
746 */
747 void
skmem_slab_free(struct skmem_cache * skm,void * buf)748 skmem_slab_free(struct skmem_cache *skm, void *buf)
749 {
750 if (skm->skm_mode & SKM_MODE_BATCH) {
751 ((struct skmem_obj *)buf)->mo_next = NULL;
752 }
753
754 SKM_SLAB_LOCK(skm);
755 skm->skm_slab_free(skm, buf);
756 SKM_SLAB_UNLOCK(skm);
757 }
758
759 /*
760 * Free raw object(s) to the slab layer.
761 */
762 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)763 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
764 {
765 struct skmem_obj *listn;
766
767 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
768
769 SKM_SLAB_LOCK(skm);
770 for (;;) {
771 listn = list->mo_next;
772 list->mo_next = NULL;
773
774 /*
775 * Free a single object to the slab layer.
776 */
777 skm->skm_slab_free(skm, (void *)list);
778
779 /* if no more objects to free, we're done */
780 if ((list = listn) == NULL) {
781 break;
782 }
783 }
784 SKM_SLAB_UNLOCK(skm);
785 }
786
787
788 /*
789 * Given a buffer control, record the current transaction.
790 */
791 __attribute__((noinline, cold, not_tail_called))
792 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)793 skmem_audit_bufctl(struct skmem_bufctl *bc)
794 {
795 struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
796 struct timeval tv;
797
798 microuptime(&tv);
799 bca->bc_thread = current_thread();
800 bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
801 bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
802 }
803