1 /*
2 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h> /* for OSBacktrace */
33 #include <kern/sched_prim.h> /* for assert_wait */
34 #include <kern/uipc_domain.h>
35 #include <vm/vm_memtag.h>
36
37 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
38 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
39
40 /*
41 * Too big a value will cause overflow and thus trip the assertion; the
42 * idea here is to set an upper limit for the time that a particular
43 * thread is allowed to perform retries before we give up and panic.
44 */
#define SKMEM_SLAB_MAX_BACKOFF  (20 * USEC_PER_SEC)     /* usec (20 seconds' worth) */
46
47 /*
48 * Threshold (in msec) after which we reset the exponential backoff value
49 * back to its (random) initial value. Note that we allow the actual delay
50 * to be at most twice this value.
51 */
52 #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */
53
54 /*
55 * To reduce the likelihood of global synchronization between threads,
56 * we use some random value to start the exponential backoff.
57 */
58 #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */
59
60 /*
61 * Create a slab.
62 */
63 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)64 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
65 {
66 struct skmem_region *skr = skm->skm_region;
67 uint32_t objsize, chunks;
68 size_t slabsize = skm->skm_slabsize;
69 struct skmem_slab *__single sl;
70 struct sksegment *__single sg, *__single sgm;
71 char *buf, *__indexable slab;
72 char *__indexable bufm;
73 uint32_t slabm_size;
74 void *__sized_by(slabm_size) slabm;
75
76 /*
77 * Allocate a segment (a slab at our layer) from the region.
78 */
79 slab = skmem_region_alloc(skr, &slabm, &sg, &sgm, skmflag,
80 skr->skr_params.srp_c_seg_size, &slabm_size);
81 if (slab == NULL) {
82 goto rg_alloc_failure;
83 }
84
85 if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
86 goto slab_alloc_failure;
87 }
88
89 ASSERT(sg != NULL);
90 ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
91
92 bzero(sl, sizeof(*sl));
93 sl->sl_cache = skm;
94 sl->sl_base = buf = slab;
95 bufm = slabm;
96 objsize = (uint32_t)skr->skr_c_obj_size;
97 sl->sl_basem = __unsafe_forge_bidi_indexable(void *, bufm, objsize);
98 ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
99 ASSERT(skm->skm_objsize == objsize);
100 ASSERT((slabsize / objsize) <= UINT32_MAX);
101 sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
102 sl->sl_seg = sg;
103 sl->sl_segm = sgm;
104
105 /*
106 * Create one or more buffer control structures for the slab,
107 * each one tracking a chunk of raw object from the segment,
108 * and insert these into the slab's list of buffer controls.
109 */
110 ASSERT(chunks > 0);
111 while (chunks != 0) {
112 struct skmem_bufctl *__indexable bc;
113 bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
114 if (bc == NULL) {
115 goto bufctl_alloc_failure;
116 }
117
118 bzero(bc, bc_size);
119 bc->bc_lim = objsize;
120 bc->bc_addr = buf;
121 bc->bc_addrm = bufm;
122 bc->bc_slab = sl;
123 bc->bc_idx = (sl->sl_chunks - chunks);
124 if (skr->skr_mode & SKR_MODE_SHAREOK) {
125 bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
126 }
127 SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
128 buf += objsize;
129 if (bufm != NULL) {
130 /* XXX -fbounds-safety */
131 bufm = (char *)bufm + objsize;
132 }
133 --chunks;
134 }
135
136 SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p",
137 SK_KVA(skm), SK_KVA(sl));
138 SK_DF(SK_VERB_MEM_CACHE, " [%u] [%p-%p)", sl->sl_seg->sg_index,
139 SK_KVA(slab), SK_KVA(slab + objsize));
140
141 return sl;
142
143 bufctl_alloc_failure:
144 skmem_slab_destroy(skm, sl);
145
146 slab_alloc_failure:
147 skmem_region_free(skr, slab, __unsafe_forge_bidi_indexable(void *,
148 slabm, skr->skr_c_obj_size));
149
150 rg_alloc_failure:
151 os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
152
153 return NULL;
154 }
155
156 /*
157 * Destroy a slab.
158 */
159 static void
skmem_slab_destroy(struct skmem_cache * skm,struct skmem_slab * sl)160 skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
161 {
162 struct skmem_bufctl *bc, *tbc;
163 void *__single slab = sl->sl_base;
164 void *__single slabm = sl->sl_basem;
165
166 ASSERT(sl->sl_refcnt == 0);
167
168 SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p",
169 SK_KVA(skm), SK_KVA(sl));
170 SK_DF(SK_VERB_MEM_CACHE, " [%u] [%p-%p)", sl->sl_seg->sg_index,
171 SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
172
173 /*
174 * Go through the slab's list of buffer controls and free
175 * them, and then free the slab itself back to its cache.
176 */
177 SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
178 SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
179 skmem_cache_free(skmem_bufctl_cache, bc);
180 }
181 skmem_cache_free(skmem_slab_cache, sl);
182
183 /*
184 * Restore original tag before freeing back to system. sl->sl_base should
185 * have the original tag.
186 */
187 if (skm->skm_region->skr_bufspec.memtag) {
188 vm_memtag_store_tag(slab, skm->skm_slabsize);
189 }
190
191 /* and finally free the segment back to the backing region */
192 skmem_region_free(skm->skm_region, slab, slabm);
193 }
194
195 /*
196 * Allocate a raw object from the (locked) slab layer. Normal region variant.
197 */
198 int
skmem_slab_alloc_locked(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)199 skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
200 struct skmem_obj_info *oim, uint32_t skmflag)
201 {
202 struct skmem_bufctl_bkt *bcb;
203 struct skmem_bufctl *bc;
204 struct skmem_slab *sl;
205 uint32_t retries = 0;
206 uint64_t boff_total = 0; /* in usec */
207 uint64_t boff = 0; /* in msec */
208 boolean_t new_slab;
209 size_t bufsize;
210 void *__sized_by(bufsize) buf;
211 #if CONFIG_KERNEL_TAGGING
212 vm_map_address_t tagged_address; /* address tagging */
213 struct skmem_region *region; /* region source for this slab */
214 #endif /* CONFIG_KERNEL_TAGGING */
215
216 /* this flag is not for the caller to set */
217 VERIFY(!(skmflag & SKMEM_FAILOK));
218
219 /*
220 * A slab is either in a partially-allocated list (at least it has
221 * a free object available), or is in the empty list (everything
222 * has been allocated.) If we can't find a partially-allocated
223 * slab, then we need to allocate a slab (segment) from the region.
224 */
225 again:
226 SKM_SLAB_LOCK_ASSERT_HELD(skm);
227 sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
228 if (sl == NULL) {
229 uint32_t flags = skmflag;
230 boolean_t retry;
231
232 ASSERT(skm->skm_sl_partial == 0);
233 SKM_SLAB_UNLOCK(skm);
234 if (!(flags & SKMEM_NOSLEEP)) {
235 /*
236 * Pick up a random value to start the exponential
237 * backoff, if this is the first round, or if the
238 * current value is over the threshold. Otherwise,
239 * double the backoff value.
240 */
241 if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
242 read_frandom(&boff, sizeof(boff));
243 boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
244 ASSERT(boff > 0);
245 } else if (os_mul_overflow(boff, 2, &boff)) {
246 panic_plain("\"%s\": boff counter "
247 "overflows\n", skm->skm_name);
248 /* NOTREACHED */
249 __builtin_unreachable();
250 }
251 /* add this value (in msec) to the total (in usec) */
252 if (os_add_overflow(boff_total,
253 (boff * NSEC_PER_USEC), &boff_total)) {
254 panic_plain("\"%s\": boff_total counter "
255 "overflows\n", skm->skm_name);
256 /* NOTREACHED */
257 __builtin_unreachable();
258 }
259 }
260 /*
261 * In the event of a race between multiple threads trying
262 * to create the last remaining (or the only) slab, let the
263 * loser(s) attempt to retry after waiting a bit. The winner
264 * would have inserted the newly-created slab into the list.
265 */
266 if (!(flags & SKMEM_NOSLEEP) &&
267 boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
268 retry = TRUE;
269 ++retries;
270 flags |= SKMEM_FAILOK;
271 } else {
272 if (!(flags & SKMEM_NOSLEEP)) {
273 panic_plain("\"%s\": failed to allocate "
274 "slab (sleeping mode) after %llu "
275 "msec, %u retries\n\n%s", skm->skm_name,
276 (boff_total / NSEC_PER_USEC), retries,
277 skmem_dump(skm->skm_region));
278 /* NOTREACHED */
279 __builtin_unreachable();
280 }
281 retry = FALSE;
282 }
283
284 /*
285 * Create a new slab.
286 */
287 if ((sl = skmem_slab_create(skm, flags)) == NULL) {
288 if (retry) {
289 SK_ERR("\"%s\": failed to allocate "
290 "slab (%ssleeping mode): waiting for %llu "
291 "msec, total %llu msec, %u retries",
292 skm->skm_name,
293 (flags & SKMEM_NOSLEEP) ? "non-" : "",
294 boff, (boff_total / NSEC_PER_USEC), retries);
295 VERIFY(boff > 0 && ((uint32_t)boff <=
296 (SKMEM_SLAB_BACKOFF_THRES * 2)));
297 delay((uint32_t)boff * NSEC_PER_USEC);
298 SKM_SLAB_LOCK(skm);
299 goto again;
300 } else {
301 SK_RDERR(4, "\"%s\": failed to allocate slab "
302 "(%ssleeping mode)", skm->skm_name,
303 (flags & SKMEM_NOSLEEP) ? "non-" : "");
304 SKM_SLAB_LOCK(skm);
305 }
306 return ENOMEM;
307 }
308
309 SKM_SLAB_LOCK(skm);
310 skm->skm_sl_create++;
311 if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
312 skm->skm_sl_bufmax) {
313 skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
314 }
315 }
316 skm->skm_sl_alloc++;
317
318 new_slab = (sl->sl_refcnt == 0);
319 ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));
320
321 sl->sl_refcnt++;
322 ASSERT(sl->sl_refcnt <= sl->sl_chunks);
323
324 /*
325 * We either have a new slab, or a partially-allocated one.
326 * Remove a buffer control from the slab, and insert it to
327 * the allocated-address hash chain.
328 */
329 bc = SLIST_FIRST(&sl->sl_head);
330 ASSERT(bc != NULL);
331 SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
332
333 /* sanity check */
334 VERIFY(bc->bc_usecnt == 0);
335
336 /*
337 * Also store the master object's region info for the caller.
338 */
339 bzero(oi, sizeof(*oi));
340 #if CONFIG_KERNEL_TAGGING
341 region = sl->sl_cache->skm_region;
342 if (region->skr_mode & SKR_MODE_MEMTAG) {
343 tagged_address = (vm_map_address_t)vm_memtag_generate_and_store_tag(bc->bc_addr,
344 skm->skm_objsize);
345 /*
346 * XXX -fbounds-safety: tagged_address's type is vm_offset_t
347 * which is unsafe, so we have ot use __unsafe_forge here.
348 * Also, skm->skm_objsize is equal to bc->bc_addr (they're both
349 * set to skr->skr_c_obj_size)
350 */
351 bufsize = skm->skm_objsize;
352 /*
353 * XXX -fbounds-safety: Couldn't pass bufsize here, because
354 * compiler gives an error: cannot reference 'bufsize' after it
355 * is changed during consecutive assignments
356 */
357 buf = __unsafe_forge_bidi_indexable(void *, tagged_address,
358 skm->skm_objsize);
359 } else {
360 bufsize = bc->bc_lim;
361 buf = bc->bc_addr;
362 }
363 #else /* !CONFIG_KERNEL_TAGGING */
364 bufsize = bc->bc_lim;
365 buf = bc->bc_addr;
366 #endif /* CONFIG_KERNEL_TAGGING */
367 SKMEM_OBJ_SIZE(oi) = (uint32_t)bufsize;
368 SKMEM_OBJ_ADDR(oi) = buf;
369 SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */
370 ASSERT(skm->skm_objsize <= UINT32_MAX);
371 SKMEM_OBJ_IDX_REG(oi) =
372 ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
373 SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
374 /*
375 * And for slave object.
376 */
377 if (oim != NULL) {
378 bzero(oim, sizeof(*oim));
379 if (bc->bc_addrm != NULL) {
380 SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
381 void *, bc->bc_addrm, SKMEM_OBJ_SIZE(oi));
382 SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
383 SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
384 SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
385 }
386 }
387
388 if (skm->skm_mode & SKM_MODE_BATCH) {
389 ((struct skmem_obj *)buf)->mo_next = NULL;
390 }
391
392 /* insert to allocated-address hash chain */
393 bcb = SKMEM_CACHE_HASH(skm, buf);
394 SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);
395
396 if (SLIST_EMPTY(&sl->sl_head)) {
397 /*
398 * If that was the last buffer control from this slab,
399 * insert the slab into the empty list. If it was in
400 * the partially-allocated list, then remove the slab
401 * from there as well.
402 */
403 ASSERT(sl->sl_refcnt == sl->sl_chunks);
404 if (new_slab) {
405 ASSERT(sl->sl_chunks == 1);
406 } else {
407 ASSERT(sl->sl_chunks > 1);
408 ASSERT(skm->skm_sl_partial > 0);
409 skm->skm_sl_partial--;
410 TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
411 }
412 skm->skm_sl_empty++;
413 ASSERT(skm->skm_sl_empty != 0);
414 TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
415 } else {
416 /*
417 * The slab is not empty; if it was newly allocated
418 * above, then it's not in the partially-allocated
419 * list and so we insert it there.
420 */
421 ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
422 if (new_slab) {
423 skm->skm_sl_partial++;
424 ASSERT(skm->skm_sl_partial != 0);
425 TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
426 sl, sl_link);
427 }
428 }
429
430 /* if auditing is enabled, record this transaction */
431 if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
432 skmem_audit_bufctl(bc);
433 }
434
435 return 0;
436 }
437
438 /*
439 * Allocate a raw object from the (locked) slab layer. Pseudo region variant.
440 */
441 int
skmem_slab_alloc_pseudo_locked(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)442 skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
443 struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
444 {
445 zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
446 struct skmem_region *skr = skm->skm_region;
447 void *obj, *buf;
448
449 /* this flag is not for the caller to set */
450 VERIFY(!(skmflag & SKMEM_FAILOK));
451
452 SKM_SLAB_LOCK_ASSERT_HELD(skm);
453
454 ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
455 /* mirrored region is not applicable */
456 ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
457 /* batching is not yet supported */
458 ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));
459
460 obj = zalloc_flags_buf(skr->skr_zreg, zflags | Z_ZERO);
461 if (obj == NULL) {
462 os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
463 return ENOMEM;
464 }
465
466 #if KASAN
467 /*
468 * Perform some fix-ups since the zone element isn't guaranteed
469 * to be on the aligned boundary. The effective object size
470 * has been adjusted accordingly by skmem_region_create() earlier
471 * at cache creation time.
472 *
473 * 'buf' is the aligned address for this object.
474 */
475 uintptr_t diff = P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
476 skm->skm_bufalign) - (uintptr_t)obj;
477 buf = (void *)((char *)obj + diff);
478
479 /*
480 * Wind back a pointer size from the aligned address and
481 * save the original address so we can free it later.
482 */
483 /*
484 * XXX -fbounds-safety: Since this function is for generic alloc, we
485 * cannot modify the struct like we did for struct skmem_cache.
486 * Unfortunately, __unsafe_forge_bidi_indexable seems to be the only
487 * choice.
488 */
489 void **pbuf = __unsafe_forge_bidi_indexable(void **,
490 (intptr_t)buf - sizeof(void *), sizeof(void *));
491 *pbuf = obj;
492
493 VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
494 ((intptr_t)obj + skm->skm_objsize));
495 #else /* !KASAN */
496 /*
497 * We expect that the zone allocator would allocate elements
498 * rounded up to the requested alignment based on the effective
499 * object size computed in skmem_region_create() earlier, and
500 * 'buf' is therefore the element address itself.
501 */
502 buf = obj;
503 #endif /* !KASAN */
504
505 /* make sure the object is aligned */
506 VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));
507
508 /*
509 * Return the object's info to the caller.
510 */
511 bzero(oi, sizeof(*oi));
512 SKMEM_OBJ_ADDR(oi) = buf;
513 #if KASAN
514 SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize -
515 (uint32_t)skm->skm_bufalign;
516 #else
517 SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
518 #endif
519 ASSERT(skm->skm_objsize <= UINT32_MAX);
520 if (oim != NULL) {
521 bzero(oim, sizeof(*oim));
522 }
523
524 skm->skm_sl_alloc++;
525 skm->skm_sl_bufinuse++;
526 if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
527 skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
528 }
529
530 return 0;
531 }
532
533 /*
534 * Allocate a raw object from the slab layer.
535 */
536 int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)537 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
538 struct skmem_obj_info *oim, uint32_t skmflag)
539 {
540 int err;
541
542 SKM_SLAB_LOCK(skm);
543 err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
544 SKM_SLAB_UNLOCK(skm);
545
546 return err;
547 }
548
549 /*
550 * Allocate raw object(s) from the slab layer.
551 */
552 uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)553 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
554 uint32_t num, uint32_t skmflag)
555 {
556 uint32_t need = num;
557
558 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
559 *list = NULL;
560
561 SKM_SLAB_LOCK(skm);
562 for (;;) {
563 struct skmem_obj_info oi, oim;
564
565 /*
566 * Get a single raw object from the slab layer.
567 */
568 if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
569 break;
570 }
571
572 *list = SKMEM_OBJ_ADDR(&oi);
573 ASSERT((*list)->mo_next == NULL);
574 /* store these inside the object itself */
575 (*list)->mo_info = oi;
576 (*list)->mo_minfo = oim;
577 list = &(*list)->mo_next;
578
579 ASSERT(need != 0);
580 if (--need == 0) {
581 break;
582 }
583 }
584 SKM_SLAB_UNLOCK(skm);
585
586 return num - need;
587 }
588
589 /*
590 * Free a raw object to the (locked) slab layer. Normal region variant.
591 */
592 void
skmem_slab_free_locked(struct skmem_cache * skm,void * buf)593 skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
594 {
595 struct skmem_bufctl *bc, *tbc;
596 struct skmem_bufctl_bkt *bcb;
597 struct skmem_slab *sl = NULL;
598 #if CONFIG_KERNEL_TAGGING
599 struct skmem_region *region;
600 vm_map_address_t tagged_addr;
601 #endif /* CONFIG_KERNEL_TAGGING */
602
603 SKM_SLAB_LOCK_ASSERT_HELD(skm);
604 ASSERT(buf != NULL);
605 /* caller is expected to clear mo_next */
606 ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
607 ((struct skmem_obj *)buf)->mo_next == NULL);
608
609 /*
610 * Search the hash chain to find a matching buffer control for the
611 * given object address. If found, remove the buffer control from
612 * the hash chain and insert it into the freelist. Otherwise, we
613 * panic since the caller has given us a bogus address.
614 */
615 skm->skm_sl_free++;
616 bcb = SKMEM_CACHE_HASH(skm, buf);
617
618 SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
619 if (SKMEM_COMPARE_CANONICAL_ADDR(bc->bc_addr, buf, skm->skm_objsize)) {
620 SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
621 sl = bc->bc_slab;
622 break;
623 }
624 }
625
626 if (bc == NULL) {
627 panic("%s: attempt to free invalid or already-freed obj %p "
628 "on skm %p", __func__, buf, skm);
629 /* NOTREACHED */
630 __builtin_unreachable();
631 }
632 ASSERT(sl != NULL && sl->sl_cache == skm);
633 VERIFY(SKMEM_SLAB_MEMBER(sl, SKMEM_MEMTAG_STRIP_TAG(buf, skm->skm_objsize)));
634
635 /* make sure this object is not currently in use by another object */
636 VERIFY(bc->bc_usecnt == 0);
637
638 /* if auditing is enabled, record this transaction */
639 if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
640 skmem_audit_bufctl(bc);
641 }
642
643 /* if clear on free is requested, zero out the object */
644 if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
645 size_t size = skm->skm_objsize;
646 void *buf_cpy = __unsafe_forge_bidi_indexable(void *, buf, size);
647 bzero(buf_cpy, size);
648 buf_cpy = NULL;
649 size = 0;
650 }
651
652 #if CONFIG_KERNEL_TAGGING
653 region = skm->skm_region;
654 if (region->skr_mode & SKR_MODE_MEMTAG) {
655 tagged_addr = (vm_map_address_t)vm_memtag_generate_and_store_tag(buf, skm->skm_objsize);
656 }
657 #endif /* CONFIG_KERNEL_TAGGING */
658
659 /* insert the buffer control to the slab's freelist */
660 SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
661
662 ASSERT(sl->sl_refcnt >= 1);
663 if (--sl->sl_refcnt == 0) {
664 /*
665 * If this was the last outstanding object for the slab,
666 * remove the slab from the partially-allocated or empty
667 * list, and destroy the slab (segment) back to the region.
668 */
669 if (sl->sl_chunks == 1) {
670 ASSERT(skm->skm_sl_empty > 0);
671 skm->skm_sl_empty--;
672 TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
673 } else {
674 ASSERT(skm->skm_sl_partial > 0);
675 skm->skm_sl_partial--;
676 TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
677 }
678 ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
679 skm->skm_sl_bufinuse -= sl->sl_chunks;
680 skm->skm_sl_destroy++;
681 SKM_SLAB_UNLOCK(skm);
682 skmem_slab_destroy(skm, sl);
683 SKM_SLAB_LOCK(skm);
684 return;
685 }
686
687 ASSERT(bc == SLIST_FIRST(&sl->sl_head));
688 if (SLIST_NEXT(bc, bc_link) == NULL) {
689 /*
690 * If this is the first (potentially amongst many) object
691 * that's returned to the slab, remove the slab from the
692 * empty list and insert to end of the partially-allocated
693 * list. This should help avoid thrashing the partial slab
694 * since we avoid disturbing what's already at the front.
695 */
696 ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
697 ASSERT(sl->sl_chunks > 1);
698 ASSERT(skm->skm_sl_empty > 0);
699 skm->skm_sl_empty--;
700 TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
701 skm->skm_sl_partial++;
702 ASSERT(skm->skm_sl_partial != 0);
703 TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
704 }
705 }
706
707 /*
708 * Free a raw object to the (locked) slab layer. Pseudo region variant.
709 */
710 void
skmem_slab_free_pseudo_locked(struct skmem_cache * skm,void * buf)711 skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
712 {
713 struct skmem_region *skr = skm->skm_region;
714 void *__single obj = buf;
715
716 ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
717
718 SKM_SLAB_LOCK_ASSERT_HELD(skm);
719
720 VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));
721
722 #if KASAN
723 /*
724 * Since we stuffed the original zone element address before
725 * the buffer address in KASAN mode, get it back since we're
726 * about to free it.
727 */
728 void **pbuf = __unsafe_forge_bidi_indexable(void **,
729 ((intptr_t)obj - sizeof(void *)), sizeof(void *));
730
731 VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
732 ((intptr_t)*pbuf + skm->skm_objsize));
733
734 obj = *pbuf;
735 #endif /* KASAN */
736
737 /* free it to zone */
738 zfree(skr->skr_zreg, obj);
739
740 skm->skm_sl_free++;
741 ASSERT(skm->skm_sl_bufinuse > 0);
742 skm->skm_sl_bufinuse--;
743 }
744
745 /*
746 * Free a raw object to the slab layer.
747 */
748 void
skmem_slab_free(struct skmem_cache * skm,void * buf)749 skmem_slab_free(struct skmem_cache *skm, void *buf)
750 {
751 if (skm->skm_mode & SKM_MODE_BATCH) {
752 ((struct skmem_obj *)buf)->mo_next = NULL;
753 }
754
755 SKM_SLAB_LOCK(skm);
756 skm->skm_slab_free(skm, buf);
757 SKM_SLAB_UNLOCK(skm);
758 }
759
760 /*
761 * Free raw object(s) to the slab layer.
762 */
763 void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)764 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
765 {
766 struct skmem_obj *listn;
767
768 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
769
770 SKM_SLAB_LOCK(skm);
771 for (;;) {
772 listn = list->mo_next;
773 list->mo_next = NULL;
774
775 /*
776 * Free a single object to the slab layer.
777 */
778 skm->skm_slab_free(skm, (void *)list);
779
780 /* if no more objects to free, we're done */
781 if ((list = listn) == NULL) {
782 break;
783 }
784 }
785 SKM_SLAB_UNLOCK(skm);
786 }
787
788
789 /*
790 * Given a buffer control, record the current transaction.
791 */
792 __attribute__((noinline, cold, not_tail_called))
793 inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)794 skmem_audit_bufctl(struct skmem_bufctl *bc)
795 {
796 struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
797 struct timeval tv;
798
799 microuptime(&tv);
800 bca->bc_thread = current_thread();
801 bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
802 bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
803 }
804