1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <vm/vm_kern_xnu.h>
87
88 #include <dev/random/randomdev.h>
89
90 #include <kern/kern_types.h>
91 #include <kern/simple_lock.h>
92 #include <kern/queue.h>
93 #include <kern/sched_prim.h>
94 #include <kern/backtrace.h>
95 #include <kern/percpu.h>
96 #include <kern/zalloc.h>
97
98 #include <libkern/OSDebug.h>
99 #include <libkern/libkern.h>
100
101 #include <os/log.h>
102 #include <os/ptrtools.h>
103
104 #include <IOKit/IOMapper.h>
105
106 #include <machine/limits.h>
107 #include <machine/machine_routines.h>
108
109 #include <sys/mcache.h>
110
111 #include <net/droptap.h>
112 #include <netinet/mptcp_var.h>
113 #include <netinet/tcp_var.h>
114
/*
 * Advance the dump cursor after an scnprintf() into the buffer:
 * deduct the characters just written (k) from the remaining space
 * (clen) and bump the cursor (c).  If the buffer is exhausted, bail
 * out to the caller's "done" label.
 *
 * Relies on locals k, clen, c and a "done" label in the caller.
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement in every context (e.g. an unbraced if/else arm).
 */
#define DUMP_BUF_CHK() do { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
} while (0)
121
122 #if INET
123 static int
dump_tcp_reass_qlen(char * str,int str_len)124 dump_tcp_reass_qlen(char *str, int str_len)
125 {
126 char *c = str;
127 int k, clen = str_len;
128
129 if (tcp_reass_total_qlen != 0) {
130 k = scnprintf(c, clen, "\ntcp reass qlen %d\n", tcp_reass_total_qlen);
131 DUMP_BUF_CHK();
132 }
133
134 done:
135 return str_len - clen;
136 }
137 #endif /* INET */
138
139 #if MPTCP
140 static int
dump_mptcp_reass_qlen(char * str,int str_len)141 dump_mptcp_reass_qlen(char *str, int str_len)
142 {
143 char *c = str;
144 int k, clen = str_len;
145
146 if (mptcp_reass_total_qlen != 0) {
147 k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen);
148 DUMP_BUF_CHK();
149 }
150
151 done:
152 return str_len - clen;
153 }
154 #endif /* MPTCP */
155
156 #if NETWORKING
157 extern int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len);
158 #endif /* NETWORKING */
159
160 /*
161 * MBUF IMPLEMENTATION NOTES.
162 *
163 * There is a total of 5 per-CPU caches:
164 *
165 * MC_MBUF:
166 * This is a cache of rudimentary objects of _MSIZE in size; each
167 * object represents an mbuf structure. This cache preserves only
168 * the m_type field of the mbuf during its transactions.
169 *
170 * MC_CL:
171 * This is a cache of rudimentary objects of MCLBYTES in size; each
172 * object represents a mcluster structure. This cache does not
173 * preserve the contents of the objects during its transactions.
174 *
175 * MC_BIGCL:
176 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
177 * object represents a mbigcluster structure. This cache does not
178 * preserve the contents of the objects during its transaction.
179 *
180 * MC_MBUF_CL:
181 * This is a cache of mbufs each having a cluster attached to it.
182 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
183 * fields of the mbuf related to the external cluster are preserved
184 * during transactions.
185 *
186 * MC_MBUF_BIGCL:
187 * This is a cache of mbufs each having a big cluster attached to it.
188 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
189 * fields of the mbuf related to the external cluster are preserved
190 * during transactions.
191 *
192 * OBJECT ALLOCATION:
193 *
194 * Allocation requests are handled first at the per-CPU (mcache) layer
195 * before falling back to the slab layer. Performance is optimal when
196 * the request is satisfied at the CPU layer because global data/lock
197 * never gets accessed. When the slab layer is entered for allocation,
198 * the slab freelist will be checked first for available objects before
199 * the VM backing store is invoked. Slab layer operations are serialized
200 * for all of the caches as the mbuf global lock is held most of the time.
201 * Allocation paths are different depending on the class of objects:
202 *
203 * a. Rudimentary object:
204 *
205 * { m_get_common(), m_clattach(), m_mclget(),
206 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
207 * composite object allocation }
208 * | ^
209 * | |
210 * | +-----------------------+
211 * v |
212 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
213 * | ^
214 * v |
215 * [CPU cache] -------> (found?) -------+
216 * | |
217 * v |
218 * mbuf_slab_alloc() |
219 * | |
220 * v |
221 * +---------> [freelist] -------> (found?) -------+
222 * | |
223 * | v
224 * | m_clalloc()
225 * | |
226 * | v
227 * +---<<---- kmem_mb_alloc()
228 *
229 * b. Composite object:
230 *
231 * { m_getpackets_internal(), m_allocpacket_internal() }
232 * | ^
233 * | |
234 * | +------ (done) ---------+
235 * v |
236 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
237 * | ^
238 * v |
239 * [CPU cache] -------> (found?) -------+
240 * | |
241 * v |
242 * mbuf_cslab_alloc() |
243 * | |
244 * v |
245 * [freelist] -------> (found?) -------+
246 * | |
247 * v |
248 * (rudimentary object) |
249 * mcache_alloc/mcache_alloc_ext() ------>>-----+
250 *
251 * Auditing notes: If auditing is enabled, buffers will be subjected to
252 * integrity checks by the audit routine. This is done by verifying their
253 * contents against DEADBEEF (free) pattern before returning them to caller.
254 * As part of this step, the routine will also record the transaction and
255 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
256 * also restore any constructed data structure fields if necessary.
257 *
258 * OBJECT DEALLOCATION:
259 *
260 * Freeing an object simply involves placing it into the CPU cache; this
261 * pollutes the cache to benefit subsequent allocations. The slab layer
262 * will only be entered if the object is to be purged out of the cache.
263 * During normal operations, this happens only when the CPU layer resizes
264 * its bucket while it's adjusting to the allocation load. Deallocation
265 * paths are different depending on the class of objects:
266 *
267 * a. Rudimentary object:
268 *
269 * { m_free(), m_freem_list(), composite object deallocation }
270 * | ^
271 * | |
272 * | +------ (done) ---------+
273 * v |
274 * mcache_free/mcache_free_ext() |
275 * | |
276 * v |
277 * mbuf_slab_audit() |
278 * | |
279 * v |
280 * [CPU cache] ---> (not purging?) -----+
281 * | |
282 * v |
283 * mbuf_slab_free() |
284 * | |
285 * v |
286 * [freelist] ----------->>------------+
287 * (objects get purged to VM only on demand)
288 *
289 * b. Composite object:
290 *
291 * { m_free(), m_freem_list() }
292 * | ^
293 * | |
294 * | +------ (done) ---------+
295 * v |
296 * mcache_free/mcache_free_ext() |
297 * | |
298 * v |
299 * mbuf_cslab_audit() |
300 * | |
301 * v |
302 * [CPU cache] ---> (not purging?) -----+
303 * | |
304 * v |
305 * mbuf_cslab_free() |
306 * | |
307 * v |
308 * [freelist] ---> (not purging?) -----+
309 * | |
310 * v |
311 * (rudimentary object) |
312 * mcache_free/mcache_free_ext() ------->>------+
313 *
314 * Auditing notes: If auditing is enabled, the audit routine will save
315 * any constructed data structure fields (if necessary) before filling the
316 * contents of the buffers with DEADBEEF (free) pattern and recording the
317 * transaction. Buffers that are freed (whether at CPU or slab layer) are
318 * expected to contain the free pattern.
319 *
320 * DEBUGGING:
321 *
322 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
323 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
324 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
325 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
326 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
327 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
328 *
329 * Each object is associated with exactly one mcache_audit_t structure that
330 * contains the information related to its last buffer transaction. Given
331 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
333 *
334 * +------------+ +=============+
335 * | mbuf addr | | mclaudit[i] |
336 * +------------+ +=============+
337 * | | cl_audit[0] |
338 * i = MTOBG(addr) +-------------+
339 * | +-----> | cl_audit[1] | -----> mcache_audit_t
340 * b = BGTOM(i) | +-------------+
341 * | | | ... |
342 * x = MCLIDX(b, addr) | +-------------+
343 * | | | cl_audit[7] |
344 * +-----------------+ +-------------+
345 * (e.g. x == 1)
346 *
347 * The mclaudit[] array is allocated at initialization time, but its contents
348 * get populated when the corresponding cluster is created. Because a page
349 * can be turned into NMBPG number of mbufs, we preserve enough space for the
350 * mbufs so that there is a 1-to-1 mapping between them. A page that never
351 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
352 * remaining entries unused. For 16KB cluster, only one entry from the first
353 * page is allocated and used for the entire object.
354 */
355
356 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
357 extern vm_map_t mb_map; /* special map */
358
359 static uint32_t mb_kmem_contig_failed;
360 static uint32_t mb_kmem_failed;
361 static uint32_t mb_kmem_one_failed;
362 /* Timestamp of allocation failures. */
363 static uint64_t mb_kmem_contig_failed_ts;
364 static uint64_t mb_kmem_failed_ts;
365 static uint64_t mb_kmem_one_failed_ts;
366 static uint64_t mb_kmem_contig_failed_size;
367 static uint64_t mb_kmem_failed_size;
368 static uint32_t mb_kmem_stats[6];
369
370 /* Back-end (common) layer */
371 static uint64_t mb_expand_cnt;
372 static uint64_t mb_expand_cl_cnt;
373 static uint64_t mb_expand_cl_total;
374 static uint64_t mb_expand_bigcl_cnt;
375 static uint64_t mb_expand_bigcl_total;
376 static uint64_t mb_expand_16kcl_cnt;
377 static uint64_t mb_expand_16kcl_total;
378 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
379 static uint32_t mbuf_worker_run_cnt;
380 static uint64_t mbuf_worker_last_runtime;
381 static uint64_t mbuf_drain_last_runtime;
382 static int mbuf_worker_ready; /* worker thread is runnable */
383 static unsigned int ncpu; /* number of CPUs */
384 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
385 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
386 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
387 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
388 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
389 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
390 static unsigned int mb_normalized; /* number of packets "normalized" */
391
392 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
393 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
394
395 #define MBUF_CLASS_VALID(c) \
396 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
397
398 /*
399 * mbuf specific mcache allocation request flags.
400 */
401 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
402
403 /*
404 * Per-cluster slab structure.
405 *
406 * A slab is a cluster control structure that contains one or more object
407 * chunks; the available chunks are chained in the slab's freelist (sl_head).
408 * Each time a chunk is taken out of the slab, the slab's reference count
409 * gets incremented. When all chunks have been taken out, the empty slab
410 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
411 * returned to a slab causes the slab's reference count to be decremented;
412 * it also causes the slab to be reinserted back to class's slab list, if
413 * it's not already done.
414 *
415 * Compartmentalizing of the object chunks into slabs allows us to easily
416 * merge one or more slabs together when the adjacent slabs are idle, as
417 * well as to convert or move a slab from one class to another; e.g. the
418 * mbuf cluster slab can be converted to a regular cluster slab when all
419 * mbufs in the slab have been freed.
420 *
421 * A slab may also span across multiple clusters for chunks larger than
422 * a cluster's size. In this case, only the slab of the first cluster is
423 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
424 * that they are part of the larger slab.
425 *
426 * Each slab controls a page of memory.
427 */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t        sl_class;       /* controlling mbuf class */
	int8_t          sl_refcnt;      /* outstanding allocations (chunks taken out) */
	int8_t          sl_chunks;      /* chunks (bufs) in this slab */
	u_int16_t       sl_flags;       /* slab flags (see SLF_* below) */
	u_int16_t       sl_len;         /* slab length */
	void            *sl_base;       /* base of allocated memory */
	void            *sl_head;       /* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
} mcl_slab_t;
439
440 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
441 #define SLF_PARTIAL 0x0002 /* part of another slab */
442 #define SLF_DETACHED 0x0004 /* not in slab freelist */
443
444 /*
445 * The array of slabs are broken into groups of arrays per 1MB of kernel
446 * memory to reduce the footprint. Each group is allocated on demand
447 * whenever a new piece of memory mapped in from the VM crosses the 1MB
448 * boundary.
449 */
450 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
451
452 typedef struct mcl_slabg {
453 mcl_slab_t *slg_slab; /* group of slabs */
454 } mcl_slabg_t;
455
456 /*
457 * Number of slabs needed to control a 16KB cluster object.
458 */
459 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
460
461 /*
462 * Per-cluster audit structure.
463 */
464 typedef struct {
465 mcache_audit_t **cl_audit; /* array of audits */
466 } mcl_audit_t;
467
468 typedef struct {
469 struct thread *msa_thread; /* thread doing transaction */
470 struct thread *msa_pthread; /* previous transaction thread */
471 uint32_t msa_tstamp; /* transaction timestamp (ms) */
472 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
473 uint16_t msa_depth; /* pc stack depth */
474 uint16_t msa_pdepth; /* previous transaction pc stack */
475 void *msa_stack[MCACHE_STACK_DEPTH];
476 void *msa_pstack[MCACHE_STACK_DEPTH];
477 } mcl_scratch_audit_t;
478
479 typedef struct {
480 /*
481 * Size of data from the beginning of an mbuf that covers m_hdr,
482 * pkthdr and m_ext structures. If auditing is enabled, we allocate
483 * a shadow mbuf structure of this size inside each audit structure,
484 * and the contents of the real mbuf gets copied into it when the mbuf
485 * is freed. This allows us to pattern-fill the mbuf for integrity
486 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
487 * cluster cache case). Note that we don't save the contents of
488 * clusters when they are freed; we simply pattern-fill them.
489 */
490 u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
491 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
492 } mcl_saved_contents_t;
493
494 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
495
496 #define MCA_SAVED_MBUF_PTR(_mca) \
497 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
498 (_mca)->mca_contents)->sc_mbuf)
499 #define MCA_SAVED_MBUF_SIZE \
500 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
501 #define MCA_SAVED_SCRATCH_PTR(_mca) \
502 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
503
504 /*
505 * mbuf specific mcache audit flags
506 */
507 #define MB_INUSE 0x01 /* object has not been returned to slab */
508 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
509 #define MB_SCVALID 0x04 /* object has valid saved contents */
510
511 /*
512 * Each of the following two arrays hold up to nmbclusters elements.
513 */
514 static mcl_audit_t *mclaudit; /* array of cluster audit information */
515 static unsigned int maxclaudit; /* max # of entries in audit table */
516 static mcl_slabg_t **slabstbl; /* cluster slabs table */
517 static unsigned int maxslabgrp; /* max # of entries in slabs table */
518 static unsigned int slabgrp; /* # of entries in slabs table */
519
520 /* Globals */
521 unsigned char *mbutl; /* first mapped cluster address */
522 static unsigned char *embutl; /* ending virtual address of mclusters */
523
524 static boolean_t mclverify; /* debug: pattern-checking */
525 static boolean_t mcltrace; /* debug: stack tracing */
526 static boolean_t mclfindleak; /* debug: leak detection */
527 static boolean_t mclexpleak; /* debug: expose leak info to user space */
528
529 static struct timeval mb_start; /* beginning of time */
530
531 /* mbuf leak detection variables */
532 static struct mleak_table mleak_table;
533 static mleak_stat_t *mleak_stat;
534
535 #define MLEAK_STAT_SIZE(n) \
536 __builtin_offsetof(mleak_stat_t, ml_trace[n])
537
538 struct mallocation {
539 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
540 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
541 u_int32_t count; /* How many objects were requested */
542 u_int64_t hitcount; /* for determining hash effectiveness */
543 };
544
545 struct mtrace {
546 u_int64_t collisions;
547 u_int64_t hitcount;
548 u_int64_t allocs;
549 u_int64_t depth;
550 uintptr_t addr[MLEAK_STACK_DEPTH];
551 };
552
553 /* Size must be a power of two for the zhash to be able to just mask off bits */
554 #define MLEAK_ALLOCATION_MAP_NUM 512
555 #define MLEAK_TRACE_MAP_NUM 256
556
557 /*
558 * Sample factor for how often to record a trace. This is overwritable
559 * by the boot-arg mleak_sample_factor.
560 */
561 #define MLEAK_SAMPLE_FACTOR 500
562
563 /*
564 * Number of top leakers recorded.
565 */
566 #define MLEAK_NUM_TRACES 5
567
568 #define MB_LEAK_SPACING_64 " "
569 #define MB_LEAK_SPACING_32 " "
570
571
572 #define MB_LEAK_HDR_32 "\n\
573 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
574 ---------- ---------- ---------- ---------- ---------- \n\
575 "
576
577 #define MB_LEAK_HDR_64 "\n\
578 trace [1] trace [2] trace [3] \
579 trace [4] trace [5] \n\
580 ------------------ ------------------ ------------------ \
581 ------------------ ------------------ \n\
582 "
583
584 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
585 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
586
587 /* Hashmaps of allocations and their corresponding traces */
588 static struct mallocation *mleak_allocations;
589 static struct mtrace *mleak_traces;
590 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
591
592 /* Lock to protect mleak tables from concurrent modification */
593 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
594 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
595 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
596
597 /* *Failed* large allocations. */
598 struct mtracelarge {
599 uint64_t size;
600 uint64_t depth;
601 uintptr_t addr[MLEAK_STACK_DEPTH];
602 };
603
604 #define MTRACELARGE_NUM_TRACES 5
605 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
606
607 static void mtracelarge_register(size_t size);
608
609 /* The minimum number of objects that are allocated, to start. */
610 #define MINCL 32
611 #define MINBIGCL (MINCL >> 1)
612
613 /* Low watermarks (only map in pages once free counts go below) */
614 #define MBIGCL_LOWAT MINBIGCL
615
616 #define m_cache(c) mbuf_table[c].mtbl_cache
617 #define m_slablist(c) mbuf_table[c].mtbl_slablist
618 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
619 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
620 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
621 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
622 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
623 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
624 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
625 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
626 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
627 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
628 #define m_region_expand(c) mbuf_table[c].mtbl_expand
629
630 mbuf_table_t mbuf_table[] = {
631 /*
632 * The caches for mbufs, regular clusters and big clusters.
633 * The average total values were based on data gathered by actual
634 * usage patterns on iOS.
635 */
636 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
637 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
638 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
639 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
640 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
641 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
642 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
643 NULL, NULL, 0, 0, 0, 0, 200, 0 },
644 /*
645 * The following are special caches; they serve as intermediate
646 * caches backed by the above rudimentary caches. Each object
647 * in the cache is an mbuf with a cluster attached to it. Unlike
648 * the above caches, these intermediate caches do not directly
649 * deal with the slab structures; instead, the constructed
650 * cached elements are simply stored in the freelists.
651 */
652 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
653 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
654 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
655 };
656
657 #if SKYWALK
658 #define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
659 static unsigned int mc_threshold_scale_down_factor =
660 MC_THRESHOLD_SCALE_DOWN_FACTOR;
661 #endif /* SKYWALK */
662
663 static uint32_t
m_avgtotal(mbuf_class_t c)664 m_avgtotal(mbuf_class_t c)
665 {
666 #if SKYWALK
667 return if_is_fsw_transport_netagent_enabled() ?
668 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
669 mbuf_table[c].mtbl_avgtotal;
670 #else /* !SKYWALK */
671 return mbuf_table[c].mtbl_avgtotal;
672 #endif /* SKYWALK */
673 }
674
675 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
676 static int mb_waiters; /* number of waiters */
677
678 static struct timeval mb_wdtstart; /* watchdog start timestamp */
679 static char *mbuf_dump_buf;
680
681 #define MBUF_DUMP_BUF_SIZE 4096
682
683 /*
684 * mbuf watchdog is enabled by default. It is also toggeable via the
685 * kern.ipc.mb_watchdog sysctl.
686 * Garbage collection is enabled by default on embedded platforms.
687 * mb_drain_maxint controls the amount of time to wait (in seconds) before
688 * consecutive calls to mbuf_drain().
689 */
690 static unsigned int mb_watchdog = 1;
691 #if !XNU_TARGET_OS_OSX
692 static unsigned int mb_drain_maxint = 60;
693 #else /* XNU_TARGET_OS_OSX */
694 static unsigned int mb_drain_maxint = 0;
695 #endif /* XNU_TARGET_OS_OSX */
696
697 /* The following are used to serialize m_clalloc() */
698 static boolean_t mb_clalloc_busy;
699 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
700 static int mb_clalloc_waiters;
701
702 static char *mbuf_dump(void);
703 static void mbuf_worker_thread_init(void);
704 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
705 static void slab_free(mbuf_class_t, mcache_obj_t *);
706 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
707 unsigned int, int);
708 static void mbuf_slab_free(void *, mcache_obj_t *, int);
709 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
710 static void mbuf_slab_notify(void *, u_int32_t);
711 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
712 unsigned int);
713 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
714 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
715 unsigned int, int);
716 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
717 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
718 static int freelist_populate(mbuf_class_t, unsigned int, int);
719 static void freelist_init(mbuf_class_t);
720 static boolean_t mbuf_cached_above(mbuf_class_t, int);
721 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
722 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
723 static int m_howmany(int, size_t);
724 static void mbuf_worker_thread(void);
725 static void mbuf_watchdog(void);
726 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
727
728 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
729 size_t, unsigned int);
730 static void mcl_audit_free(void *, unsigned int);
731 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
732 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
733 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
734 boolean_t);
735 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
736 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
737 static void mcl_audit_scratch(mcache_audit_t *);
738 static void mcl_audit_mcheck_panic(struct mbuf *);
739 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
740
741 static void mleak_activate(void);
742 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
743 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
744 static void mleak_free(mcache_obj_t *);
745 static void mleak_sort_traces(void);
746 static void mleak_update_stats(void);
747
748 static mcl_slab_t *slab_get(void *);
749 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
750 void *, void *, unsigned int, int, int);
751 static void slab_insert(mcl_slab_t *, mbuf_class_t);
752 static void slab_remove(mcl_slab_t *, mbuf_class_t);
753 static boolean_t slab_inrange(mcl_slab_t *, void *);
754 static void slab_nextptr_panic(mcl_slab_t *, void *);
755 static void slab_detach(mcl_slab_t *);
756 static boolean_t slab_is_detached(mcl_slab_t *);
757
758 #if (DEBUG || DEVELOPMENT)
759 #define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
760 static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
761 static char *mbwdog_logging;
762 const unsigned mbwdog_logging_size = 4096;
763 static size_t mbwdog_logging_used;
764 #else
765 #define mbwdog_logger(fmt, ...) do { } while (0)
766 #endif /* DEBUG || DEVELOPMENT */
767 static void mbuf_drain_locked(boolean_t);
768
769 void
mbuf_mcheck(struct mbuf * m)770 mbuf_mcheck(struct mbuf *m)
771 {
772 if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) {
773 if (mclaudit == NULL) {
774 panic("MCHECK: m_type=%d m=%p",
775 (u_int16_t)(m)->m_type, m);
776 } else {
777 mcl_audit_mcheck_panic(m);
778 }
779 }
780 }
781
/* TRUE if addr lies within the mapped mbuf cluster range [mbutl, embutl). */
#define MBUF_IN_MAP(addr) \
	((unsigned char *)(addr) >= mbutl && \
	(unsigned char *)(addr) < embutl)

/* Panic if the address is outside the mbuf cluster map. */
#define MRANGE(addr) { \
	if (!MBUF_IN_MAP(addr)) \
		panic("MRANGE: address out of range 0x%p", addr); \
}

/*
 * Macros to obtain page index given a base cluster address.
 * The argument is fully parenthesized so that expression arguments
 * (e.g. MTOPG(p + n)) are cast/shifted as a whole.
 */
#define MTOPG(x) (((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
#define PGTOM(x) (mbutl + ((x) << PAGE_SHIFT))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define MBPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define CLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base
 */
#define BCLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
819
820 static int
821 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
822 {
823 #pragma unused(oidp, arg1, arg2)
824 int i;
825
826 /* Ensure leak tracing turned on */
827 if (!mclfindleak || !mclexpleak) {
828 return ENXIO;
829 }
830
831 lck_mtx_lock(mleak_lock);
832 mleak_update_stats();
833 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
834 lck_mtx_unlock(mleak_lock);
835
836 return i;
837 }
838
839 static int
840 mleak_table_sysctl SYSCTL_HANDLER_ARGS
841 {
842 #pragma unused(oidp, arg1, arg2)
843 int i = 0;
844
845 /* Ensure leak tracing turned on */
846 if (!mclfindleak || !mclexpleak) {
847 return ENXIO;
848 }
849
850 lck_mtx_lock(mleak_lock);
851 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
852 lck_mtx_unlock(mleak_lock);
853
854 return i;
855 }
856
857 void
mbuf_stat_sync(void)858 mbuf_stat_sync(void)
859 {
860 mb_class_stat_t *sp;
861 mcache_cpu_t *ccp;
862 mcache_t *cp;
863 int k, m, bktsize;
864
865
866 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
867
868 for (k = 0; k < MC_MAX; k++) {
869 cp = m_cache(k);
870 ccp = &cp->mc_cpu[0];
871 bktsize = ccp->cc_bktsize;
872 sp = mbuf_table[k].mtbl_stats;
873
874 if (cp->mc_flags & MCF_NOCPUCACHE) {
875 sp->mbcl_mc_state = MCS_DISABLED;
876 } else if (cp->mc_purge_cnt > 0) {
877 sp->mbcl_mc_state = MCS_PURGING;
878 } else if (bktsize == 0) {
879 sp->mbcl_mc_state = MCS_OFFLINE;
880 } else {
881 sp->mbcl_mc_state = MCS_ONLINE;
882 }
883
884 sp->mbcl_mc_cached = 0;
885 for (m = 0; m < ncpu; m++) {
886 ccp = &cp->mc_cpu[m];
887 if (ccp->cc_objs > 0) {
888 sp->mbcl_mc_cached += ccp->cc_objs;
889 }
890 if (ccp->cc_pobjs > 0) {
891 sp->mbcl_mc_cached += ccp->cc_pobjs;
892 }
893 }
894 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
895 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
896 sp->mbcl_infree;
897
898 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
899 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
900 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
901
902 /* Calculate total count specific to each class */
903 sp->mbcl_ctotal = sp->mbcl_total;
904 switch (m_class(k)) {
905 case MC_MBUF:
906 /* Deduct mbufs used in composite caches */
907 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
908 m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL));
909 break;
910
911 case MC_CL:
912 /* Deduct clusters used in composite cache */
913 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
914 break;
915
916 case MC_BIGCL:
917 /* Deduct clusters used in composite cache */
918 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
919 break;
920
921 case MC_16KCL:
922 /* Deduct clusters used in composite cache */
923 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
924 break;
925
926 default:
927 break;
928 }
929 }
930 }
931
/*
 * Return true if the class that backs mbuf `m' is above the configured
 * memory-pressure threshold (mb_memory_pressure_percentage of the class
 * maximum).  Performs a cheap fast-path check first; only if that trips
 * does it do the more expensive (but still lockless) count of per-CPU
 * cached objects.
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	int mclass = mbuf_get_class(m);

	/* Fast path: active = total - free, ignoring CPU-cached objects */
	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		/*
		 * The above computation does not include the per-CPU cached objects.
		 * As a fast-path check this is good-enough. But now we do
		 * the "slower" count of the cached objects to know exactly the
		 * number of active mbufs in use.
		 *
		 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
		 * might be slightly off but we don't try to be 100% accurate.
		 * At worst, we drop a packet that we shouldn't have dropped or
		 * we might go slightly above our memory-pressure threshold.
		 */
		mcache_t *cp = m_cache(mclass);
		mcache_cpu_t *ccp = &cp->mc_cpu[0];

		/* Unsynchronized single reads; see the comment above */
		int bktsize = os_access_once(ccp->cc_bktsize);
		uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
		uint32_t cached = 0;
		int i;

		for (i = 0; i < ncpu; i++) {
			ccp = &cp->mc_cpu[i];

			int cc_objs = os_access_once(ccp->cc_objs);
			if (cc_objs > 0) {
				cached += cc_objs;
			}

			int cc_pobjs = os_access_once(ccp->cc_pobjs);
			if (cc_pobjs > 0) {
				cached += cc_pobjs;
			}
		}
		/* Full buckets in the bucket layer count as cached too */
		cached += (bl_total * bktsize);
		if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
			os_log(OS_LOG_DEFAULT,
			    "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
			    __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
			return true;
		}
	}

	return false;
}
981
/*
 * One-time boot initialization of the mbuf allocator: validates the
 * public/private constant mappings, sizes and allocates the slab and
 * audit tables, primes the freelists, starts the worker thread, and
 * creates one mcache per mbuf class.  Runs once at system start; the
 * permanent allocations below are never freed.
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	static_assert(MBUF_EXT == M_EXT);
	static_assert(MBUF_PKTHDR == M_PKTHDR);
	static_assert(MBUF_EOR == M_EOR);
	static_assert(MBUF_LOOP == M_LOOP);
	static_assert(MBUF_BCAST == M_BCAST);
	static_assert(MBUF_MCAST == M_MCAST);
	static_assert(MBUF_FRAG == M_FRAG);
	static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	static_assert(MBUF_LASTFRAG == M_LASTFRAG);
	static_assert(MBUF_PROMISC == M_PROMISC);
	static_assert(MBUF_HASFCS == M_HASFCS);

	static_assert(MBUF_TYPE_FREE == MT_FREE);
	static_assert(MBUF_TYPE_DATA == MT_DATA);
	static_assert(MBUF_TYPE_HEADER == MT_HEADER);
	static_assert(MBUF_TYPE_SOCKET == MT_SOCKET);
	static_assert(MBUF_TYPE_PCB == MT_PCB);
	static_assert(MBUF_TYPE_RTABLE == MT_RTABLE);
	static_assert(MBUF_TYPE_HTABLE == MT_HTABLE);
	static_assert(MBUF_TYPE_ATABLE == MT_ATABLE);
	static_assert(MBUF_TYPE_SONAME == MT_SONAME);
	static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	static_assert(MBUF_TYPE_FTABLE == MT_FTABLE);
	static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	static_assert(MBUF_TYPE_IFADDR == MT_IFADDR);
	static_assert(MBUF_TYPE_CONTROL == MT_CONTROL);
	static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	/* Checksum-offload flag mappings must match as well */
	static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	static_assert(MBUF_CSUM_REQ_IP == CSUM_IP);
	static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	static_assert(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	static_assert(MBUF_WAITOK == M_WAIT);
	static_assert(MBUF_DONTWAIT == M_DONTWAIT);
	static_assert(MBUF_COPYALL == M_COPYALL);

	/* Service-class to traffic-class mapping must be stable */
	static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t)));

	/* Make sure we don't save more than we should */
	static_assert(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	static_assert(sizeof(struct mbuf) == _MSIZE);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
	slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
	    ZALIGN(mcl_slabg_t));

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		int l;
		mcl_audit_t *mclad;
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
		mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
		    ZALIGN(mcl_audit_t));
		for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
			mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
			    ZALIGN_PTR);
		}

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	/* Cache the individual debug-feature flags for cheap tests later */
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_activate();

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_wait_max_cpus();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
	mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
	    ZALIGN(ppnum_t));

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);

	/* End of the cluster submap; see MBUF_IN_MAP() */
	embutl = (mbutl + (nmbclusters * MCLBYTES));
	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist; "initmcl" boot-arg overrides the default */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL)) {
			initmcl = m_maxlimit(MC_BIGCL);
		}
	}
	if (initmcl < m_minlimit(MC_BIGCL)) {
		initmcl = m_minlimit(MC_BIGCL);
	}

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));

	for (m = 0; m < MC_MAX; m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	lck_mtx_unlock(mbuf_mlock);

	/* Kick off the background worker that replenishes the pools */
	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < MC_MAX; m++) {
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		/* Composite classes use the cslab callbacks, others the slab ones */
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		if (!mclfindleak) {
			flags |= MCF_NOLEAKLOG;
		}

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have atleast 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* allocate space for mbuf_dump_buf */
	mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);

	if (mbuf_debug & MCF_DEBUG) {
		printf("%s: MLEN %d, MHLEN %d\n", __func__,
		    (int)_MLEN, (int)_MHLEN);
	}
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
1240
1241 /*
1242 * Obtain a slab of object(s) from the class's freelist.
1243 */
1244 static mcache_obj_t *
slab_alloc(mbuf_class_t class,int wait)1245 slab_alloc(mbuf_class_t class, int wait)
1246 {
1247 mcl_slab_t *sp;
1248 mcache_obj_t *buf;
1249
1250 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1251
1252 /* This should always be NULL for us */
1253 VERIFY(m_cobjlist(class) == NULL);
1254
1255 /*
1256 * Treat composite objects as having longer lifespan by using
1257 * a slab from the reverse direction, in hoping that this could
1258 * reduce the probability of fragmentation for slabs that hold
1259 * more than one buffer chunks (e.g. mbuf slabs). For other
1260 * slabs, this probably doesn't make much of a difference.
1261 */
1262 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1263 && (wait & MCR_COMP)) {
1264 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1265 } else {
1266 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1267 }
1268
1269 if (sp == NULL) {
1270 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1271 /* The slab list for this class is empty */
1272 return NULL;
1273 }
1274
1275 VERIFY(m_infree(class) > 0);
1276 VERIFY(!slab_is_detached(sp));
1277 VERIFY(sp->sl_class == class &&
1278 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1279 buf = sp->sl_head;
1280 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1281 sp->sl_head = buf->obj_next;
1282 /* Increment slab reference */
1283 sp->sl_refcnt++;
1284
1285 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1286
1287 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1288 slab_nextptr_panic(sp, sp->sl_head);
1289 /* In case sl_head is in the map but not in the slab */
1290 VERIFY(slab_inrange(sp, sp->sl_head));
1291 /* NOTREACHED */
1292 }
1293
1294 if (mclaudit != NULL) {
1295 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1296 mca->mca_uflags = 0;
1297 /* Save contents on mbuf objects only */
1298 if (class == MC_MBUF) {
1299 mca->mca_uflags |= MB_SCVALID;
1300 }
1301 }
1302
1303 if (class == MC_CL) {
1304 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1305 /*
1306 * A 2K cluster slab can have at most NCLPG references.
1307 */
1308 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1309 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1310 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1311 } else if (class == MC_BIGCL) {
1312 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1313 m_infree(MC_MBUF_BIGCL);
1314 /*
1315 * A 4K cluster slab can have NBCLPG references.
1316 */
1317 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1318 sp->sl_len == PAGE_SIZE &&
1319 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1320 } else if (class == MC_16KCL) {
1321 mcl_slab_t *nsp;
1322 int k;
1323
1324 --m_infree(MC_16KCL);
1325 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1326 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1327 /*
1328 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1329 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1330 * most 1 reference.
1331 */
1332 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1333 nsp = nsp->sl_next;
1334 /* Next slab must already be present */
1335 VERIFY(nsp != NULL);
1336 nsp->sl_refcnt++;
1337 VERIFY(!slab_is_detached(nsp));
1338 VERIFY(nsp->sl_class == MC_16KCL &&
1339 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1340 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1341 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1342 nsp->sl_head == NULL);
1343 }
1344 } else {
1345 VERIFY(class == MC_MBUF);
1346 --m_infree(MC_MBUF);
1347 /*
1348 * If auditing is turned on, this check is
1349 * deferred until later in mbuf_slab_audit().
1350 */
1351 if (mclaudit == NULL) {
1352 mbuf_mcheck((struct mbuf *)buf);
1353 }
1354 /*
1355 * Since we have incremented the reference count above,
1356 * an mbuf slab (formerly a 4KB cluster slab that was cut
1357 * up into mbufs) must have a reference count between 1
1358 * and NMBPG at this point.
1359 */
1360 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1361 sp->sl_chunks == NMBPG &&
1362 sp->sl_len == PAGE_SIZE);
1363 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1364 }
1365
1366 /* If empty, remove this slab from the class's freelist */
1367 if (sp->sl_head == NULL) {
1368 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1369 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1370 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1371 slab_remove(sp, class);
1372 }
1373
1374 return buf;
1375 }
1376
1377 /*
1378 * Place a slab of object(s) back into a class's slab list.
1379 */
1380 static void
slab_free(mbuf_class_t class,mcache_obj_t * buf)1381 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1382 {
1383 mcl_slab_t *sp;
1384 boolean_t reinit_supercl = false;
1385 mbuf_class_t super_class;
1386
1387 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1388
1389 VERIFY(buf->obj_next == NULL);
1390
1391 /*
1392 * Synchronizing with m_clalloc, as it reads m_total, while we here
1393 * are modifying m_total.
1394 */
1395 while (mb_clalloc_busy) {
1396 mb_clalloc_waiters++;
1397 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1398 (PZERO - 1), "m_clalloc", NULL);
1399 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1400 }
1401
1402 /* We are busy now; tell everyone else to go away */
1403 mb_clalloc_busy = TRUE;
1404
1405 sp = slab_get(buf);
1406 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1407 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1408
1409 /* Decrement slab reference */
1410 sp->sl_refcnt--;
1411
1412 if (class == MC_CL) {
1413 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1414 /*
1415 * A slab that has been splitted for 2KB clusters can have
1416 * at most 1 outstanding reference at this point.
1417 */
1418 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1419 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1420 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1421 (slab_is_detached(sp) && sp->sl_head == NULL));
1422 } else if (class == MC_BIGCL) {
1423 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1424
1425 /* A 4KB cluster slab can have NBCLPG references at most */
1426 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1427 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1428 (slab_is_detached(sp) && sp->sl_head == NULL));
1429 } else if (class == MC_16KCL) {
1430 mcl_slab_t *nsp;
1431 int k;
1432 /*
1433 * A 16KB cluster takes NSLABSP16KB slabs, all must
1434 * now have 0 reference.
1435 */
1436 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1437 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1438 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1439 VERIFY(slab_is_detached(sp));
1440 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1441 nsp = nsp->sl_next;
1442 /* Next slab must already be present */
1443 VERIFY(nsp != NULL);
1444 nsp->sl_refcnt--;
1445 VERIFY(slab_is_detached(nsp));
1446 VERIFY(nsp->sl_class == MC_16KCL &&
1447 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1448 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1449 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1450 nsp->sl_head == NULL);
1451 }
1452 } else {
1453 /*
1454 * A slab that has been splitted for mbufs has at most
1455 * NMBPG reference counts. Since we have decremented
1456 * one reference above, it must now be between 0 and
1457 * NMBPG-1.
1458 */
1459 VERIFY(class == MC_MBUF);
1460 VERIFY(sp->sl_refcnt >= 0 &&
1461 sp->sl_refcnt <= (NMBPG - 1) &&
1462 sp->sl_chunks == NMBPG &&
1463 sp->sl_len == PAGE_SIZE);
1464 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
1465 (slab_is_detached(sp) && sp->sl_head == NULL));
1466 }
1467
1468 /*
1469 * When auditing is enabled, ensure that the buffer still
1470 * contains the free pattern. Otherwise it got corrupted
1471 * while at the CPU cache layer.
1472 */
1473 if (mclaudit != NULL) {
1474 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1475 if (mclverify) {
1476 mcache_audit_free_verify(mca, buf, 0,
1477 m_maxsize(class));
1478 }
1479 mca->mca_uflags &= ~MB_SCVALID;
1480 }
1481
1482 if (class == MC_CL) {
1483 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1484 buf->obj_next = sp->sl_head;
1485 } else if (class == MC_BIGCL) {
1486 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1487 m_infree(MC_MBUF_BIGCL);
1488 buf->obj_next = sp->sl_head;
1489 } else if (class == MC_16KCL) {
1490 ++m_infree(MC_16KCL);
1491 } else {
1492 ++m_infree(MC_MBUF);
1493 buf->obj_next = sp->sl_head;
1494 }
1495 sp->sl_head = buf;
1496
1497 /*
1498 * If a slab has been split to either one which holds 2KB clusters,
1499 * or one which holds mbufs, turn it back to one which holds a
1500 * 4 or 16 KB cluster depending on the page size.
1501 */
1502 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
1503 super_class = MC_BIGCL;
1504 } else {
1505 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
1506 super_class = MC_16KCL;
1507 }
1508 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1509 m_total(class) >= (m_minlimit(class) + NMBPG) &&
1510 m_total(super_class) < m_maxlimit(super_class)) {
1511 int i = NMBPG;
1512
1513 m_total(MC_MBUF) -= NMBPG;
1514 mbstat.m_mbufs = m_total(MC_MBUF);
1515 m_infree(MC_MBUF) -= NMBPG;
1516 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
1517
1518 while (i--) {
1519 struct mbuf *m = sp->sl_head;
1520 VERIFY(m != NULL);
1521 sp->sl_head = m->m_next;
1522 m->m_next = NULL;
1523 }
1524 reinit_supercl = true;
1525 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1526 m_total(class) >= (m_minlimit(class) + NCLPG) &&
1527 m_total(super_class) < m_maxlimit(super_class)) {
1528 int i = NCLPG;
1529
1530 m_total(MC_CL) -= NCLPG;
1531 mbstat.m_clusters = m_total(MC_CL);
1532 m_infree(MC_CL) -= NCLPG;
1533
1534 while (i--) {
1535 union mcluster *c = sp->sl_head;
1536 VERIFY(c != NULL);
1537 sp->sl_head = c->mcl_next;
1538 c->mcl_next = NULL;
1539 }
1540 reinit_supercl = true;
1541 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
1542 sp->sl_refcnt == 0 &&
1543 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
1544 m_total(super_class) < m_maxlimit(super_class)) {
1545 int i = NBCLPG;
1546
1547 VERIFY(super_class == MC_16KCL);
1548 m_total(MC_BIGCL) -= NBCLPG;
1549 mbstat.m_bigclusters = m_total(MC_BIGCL);
1550 m_infree(MC_BIGCL) -= NBCLPG;
1551
1552 while (i--) {
1553 union mbigcluster *bc = sp->sl_head;
1554 VERIFY(bc != NULL);
1555 sp->sl_head = bc->mbc_next;
1556 bc->mbc_next = NULL;
1557 }
1558 reinit_supercl = true;
1559 }
1560
1561 if (reinit_supercl) {
1562 VERIFY(sp->sl_head == NULL);
1563 VERIFY(m_total(class) >= m_minlimit(class));
1564 slab_remove(sp, class);
1565
1566 /* Reinitialize it as a cluster for the super class */
1567 m_total(super_class)++;
1568 m_infree(super_class)++;
1569 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
1570 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
1571
1572 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
1573 sp->sl_base, PAGE_SIZE, 0, 1);
1574 if (mclverify) {
1575 mcache_set_pattern(MCACHE_FREE_PATTERN,
1576 (caddr_t)sp->sl_base, sp->sl_len);
1577 }
1578 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
1579
1580 if (super_class == MC_BIGCL) {
1581 mbstat.m_bigclusters = m_total(MC_BIGCL);
1582 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
1583 m_infree(MC_MBUF_BIGCL);
1584 }
1585
1586 VERIFY(slab_is_detached(sp));
1587 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
1588
1589 /* And finally switch class */
1590 class = super_class;
1591 }
1592
1593 /* Reinsert the slab to the class's slab list */
1594 if (slab_is_detached(sp)) {
1595 slab_insert(sp, class);
1596 }
1597
1598 /* We're done; let others enter */
1599 mb_clalloc_busy = FALSE;
1600 if (mb_clalloc_waiters > 0) {
1601 mb_clalloc_waiters = 0;
1602 wakeup(mb_clalloc_waitchan);
1603 }
1604 }
1605
1606 /*
1607 * Common allocator for rudimentary objects called by the CPU cache layer
1608 * during an allocation request whenever there is no available element in the
1609 * bucket layer. It returns one or more elements from the appropriate global
1610 * freelist. If the freelist is empty, it will attempt to populate it and
1611 * retry the allocation.
1612 */
1613 static unsigned int
mbuf_slab_alloc(void * arg,mcache_obj_t *** plist,unsigned int num,int wait)1614 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1615 {
1616 mbuf_class_t class = (mbuf_class_t)arg;
1617 unsigned int need = num;
1618 mcache_obj_t **list = *plist;
1619
1620 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1621 ASSERT(need > 0);
1622
1623 lck_mtx_lock(mbuf_mlock);
1624
1625 for (;;) {
1626 if ((*list = slab_alloc(class, wait)) != NULL) {
1627 (*list)->obj_next = NULL;
1628 list = *plist = &(*list)->obj_next;
1629
1630 if (--need == 0) {
1631 /*
1632 * If the number of elements in freelist has
1633 * dropped below low watermark, asynchronously
1634 * populate the freelist now rather than doing
1635 * it later when we run out of elements.
1636 */
1637 if (!mbuf_cached_above(class, wait) &&
1638 m_infree(class) < (m_total(class) >> 5)) {
1639 (void) freelist_populate(class, 1,
1640 M_DONTWAIT);
1641 }
1642 break;
1643 }
1644 } else {
1645 VERIFY(m_infree(class) == 0 || class == MC_CL);
1646
1647 (void) freelist_populate(class, 1,
1648 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1649
1650 if (m_infree(class) > 0) {
1651 continue;
1652 }
1653
1654 /* Check if there's anything at the cache layer */
1655 if (mbuf_cached_above(class, wait)) {
1656 break;
1657 }
1658
1659 /* watchdog checkpoint */
1660 mbuf_watchdog();
1661
1662 /* We have nothing and cannot block; give up */
1663 if (wait & MCR_NOSLEEP) {
1664 if (!(wait & MCR_TRYHARD)) {
1665 m_fail_cnt(class)++;
1666 mbstat.m_drops++;
1667 break;
1668 }
1669 }
1670
1671 /*
1672 * If the freelist is still empty and the caller is
1673 * willing to be blocked, sleep on the wait channel
1674 * until an element is available. Otherwise, if
1675 * MCR_TRYHARD is set, do our best to satisfy the
1676 * request without having to go to sleep.
1677 */
1678 if (mbuf_worker_ready &&
1679 mbuf_sleep(class, need, wait)) {
1680 break;
1681 }
1682
1683 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1684 }
1685 }
1686
1687 m_alloc_cnt(class) += num - need;
1688 lck_mtx_unlock(mbuf_mlock);
1689
1690 return num - need;
1691 }
1692
1693 /*
1694 * Common de-allocator for rudimentary objects called by the CPU cache
1695 * layer when one or more elements need to be returned to the appropriate
1696 * global freelist.
1697 */
1698 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)1699 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1700 {
1701 mbuf_class_t class = (mbuf_class_t)arg;
1702 mcache_obj_t *nlist;
1703 unsigned int num = 0;
1704 int w;
1705
1706 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1707
1708 lck_mtx_lock(mbuf_mlock);
1709
1710 for (;;) {
1711 nlist = list->obj_next;
1712 list->obj_next = NULL;
1713 slab_free(class, list);
1714 ++num;
1715 if ((list = nlist) == NULL) {
1716 break;
1717 }
1718 }
1719 m_free_cnt(class) += num;
1720
1721 if ((w = mb_waiters) > 0) {
1722 mb_waiters = 0;
1723 }
1724 if (w) {
1725 mbwdog_logger("waking up all threads");
1726 }
1727 lck_mtx_unlock(mbuf_mlock);
1728
1729 if (w != 0) {
1730 wakeup(mb_waitchan);
1731 }
1732 }
1733
1734 /*
1735 * Common auditor for rudimentary objects called by the CPU cache layer
1736 * during an allocation or free request. For the former, this is called
1737 * after the objects are obtained from either the bucket or slab layer
1738 * and before they are returned to the caller. For the latter, this is
1739 * called immediately during free and before placing the objects into
1740 * the bucket or slab layer.
1741 */
1742 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)1743 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1744 {
1745 mbuf_class_t class = (mbuf_class_t)arg;
1746 mcache_audit_t *mca;
1747
1748 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1749
1750 while (list != NULL) {
1751 lck_mtx_lock(mbuf_mlock);
1752 mca = mcl_audit_buf2mca(class, list);
1753
1754 /* Do the sanity checks */
1755 if (class == MC_MBUF) {
1756 mcl_audit_mbuf(mca, list, FALSE, alloc);
1757 ASSERT(mca->mca_uflags & MB_SCVALID);
1758 } else {
1759 mcl_audit_cluster(mca, list, m_maxsize(class),
1760 alloc, TRUE);
1761 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1762 }
1763 /* Record this transaction */
1764 if (mcltrace) {
1765 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
1766 }
1767
1768 if (alloc) {
1769 mca->mca_uflags |= MB_INUSE;
1770 } else {
1771 mca->mca_uflags &= ~MB_INUSE;
1772 }
1773 /* Unpair the object (unconditionally) */
1774 mca->mca_uptr = NULL;
1775 lck_mtx_unlock(mbuf_mlock);
1776
1777 list = list->obj_next;
1778 }
1779 }
1780
1781 /*
1782 * Common notify routine for all caches. It is called by mcache when
1783 * one or more objects get freed. We use this indication to trigger
1784 * the wakeup of any sleeping threads so that they can retry their
1785 * allocation requests.
1786 */
1787 static void
mbuf_slab_notify(void * arg,u_int32_t reason)1788 mbuf_slab_notify(void *arg, u_int32_t reason)
1789 {
1790 mbuf_class_t class = (mbuf_class_t)arg;
1791 int w;
1792
1793 ASSERT(MBUF_CLASS_VALID(class));
1794
1795 if (reason != MCN_RETRYALLOC) {
1796 return;
1797 }
1798
1799 lck_mtx_lock(mbuf_mlock);
1800 if ((w = mb_waiters) > 0) {
1801 m_notified(class)++;
1802 mb_waiters = 0;
1803 }
1804 if (w) {
1805 mbwdog_logger("waking up all threads");
1806 }
1807 lck_mtx_unlock(mbuf_mlock);
1808
1809 if (w != 0) {
1810 wakeup(mb_waitchan);
1811 }
1812 }
1813
1814 /*
1815 * Obtain object(s) from the composite class's freelist.
1816 */
1817 static unsigned int
cslab_alloc(mbuf_class_t class,mcache_obj_t *** plist,unsigned int num)1818 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1819 {
1820 unsigned int need = num;
1821 mcl_slab_t *sp, *clsp, *nsp;
1822 struct mbuf *m;
1823 mcache_obj_t **list = *plist;
1824 void *cl;
1825
1826 VERIFY(need > 0);
1827 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1828
1829 /* Get what we can from the freelist */
1830 while ((*list = m_cobjlist(class)) != NULL) {
1831 MRANGE(*list);
1832
1833 m = (struct mbuf *)*list;
1834 sp = slab_get(m);
1835 cl = m->m_ext.ext_buf;
1836 clsp = slab_get(cl);
1837 VERIFY(m->m_flags == M_EXT && cl != NULL);
1838 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
1839
1840 if (class == MC_MBUF_CL) {
1841 VERIFY(clsp->sl_refcnt >= 1 &&
1842 clsp->sl_refcnt <= NCLPG);
1843 } else {
1844 VERIFY(clsp->sl_refcnt >= 1 &&
1845 clsp->sl_refcnt <= NBCLPG);
1846 }
1847
1848 if (class == MC_MBUF_16KCL) {
1849 int k;
1850 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
1851 nsp = nsp->sl_next;
1852 /* Next slab must already be present */
1853 VERIFY(nsp != NULL);
1854 VERIFY(nsp->sl_refcnt == 1);
1855 }
1856 }
1857
1858 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1859 !MBUF_IN_MAP(m_cobjlist(class))) {
1860 slab_nextptr_panic(sp, m_cobjlist(class));
1861 /* NOTREACHED */
1862 }
1863 (*list)->obj_next = NULL;
1864 list = *plist = &(*list)->obj_next;
1865
1866 if (--need == 0) {
1867 break;
1868 }
1869 }
1870 m_infree(class) -= (num - need);
1871
1872 return num - need;
1873 }
1874
/*
 * Place object(s) back into a composite class's freelist.
 *
 * If "purged" is zero, the constructed mbuf+cluster objects are simply
 * chained back onto the composite class's global freelist.  If non-zero,
 * each object is deconstructed: the mbuf, its cluster and its ext_ref
 * structure are returned to their respective rudimentary slabs/caches.
 * Returns the number of objects processed.  Caller must hold mbuf_mlock.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;	/* ext_ref structures to batch-free */
	mcl_slab_t *clsp, *nsp;
	void *cl;
	mbuf_class_t cl_class;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Map the composite class to its underlying cluster class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			if (mclverify) {
				mcache_audit_free_verify(mca, m, 0,
				    m_maxsize(MC_MBUF));
			}
			/*
			 * The actual mbuf may be pattern-filled; operate on
			 * the shadow copy saved in the audit structure.
			 */
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclverify) {
			size_t size = m_maxsize(cl_class);
			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (cl_class == MC_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}
		if (cl_class == MC_16KCL) {
			int k;
			/* 16KB clusters span multiple slabs; verify the chain */
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL) {
				mcl_audit_restore_mbuf(m, mca, TRUE);
			}

			/* Reset all external-storage reference state */
			MEXT_MINREF(m) = 0;
			MEXT_REF(m) = 0;
			MEXT_PREF(m) = 0;
			MEXT_FLAGS(m) = 0;
			MEXT_PRIV(m) = 0;
			MEXT_PMBUF(m) = NULL;

			/* Detach the ext_ref and collect it for batch free */
			rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
			m_set_ext(m, NULL, NULL, NULL);
			rfa->obj_next = ref_list;
			ref_list = rfa;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL) {
				mcl_audit_mbuf(mca, o, FALSE, FALSE);
			}

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL) {
				slab_free(MC_CL, cl);
			} else if (class == MC_MBUF_BIGCL) {
				slab_free(MC_BIGCL, cl);
			} else {
				slab_free(MC_16KCL, cl);
			}
		}

		++num;
		tail = o;
		o = nexto;
	}

	if (!purged) {
		/* Splice the entire chain back onto the composite freelist */
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		/* Return the collected ext_ref structures in one batch */
		mcache_free_ext(ref_cache, ref_list);
	}

	return num;
}
2011
/*
 * Common allocator for composite objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in
 * the bucket layer. It returns one or more composite elements from the
 * appropriate global freelist. If the freelist is empty, it will attempt
 * to obtain the rudimentary objects from their caches and construct them
 * into composite mbuf + cluster objects.
 *
 * Returns the number of composite objects placed on *plist, which may be
 * fewer than "needed".
 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mbuf_class_t cl_class = 0;
	unsigned int num = 0, cnum = 0, want = needed;
	mcache_obj_t *ref_list = NULL;	/* ext_ref structures */
	mcache_obj_t *mp_list = NULL;	/* raw mbufs */
	mcache_obj_t *clp_list = NULL;	/* raw clusters */
	mcache_obj_t **list;
	struct ext_ref *rfa;
	struct mbuf *m;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	ASSERT(needed > 0);

	/* There should not be any slab for this class */
	VERIFY(m_slab_cnt(class) == 0 &&
	    m_slablist(class).tqh_first == NULL &&
	    m_slablist(class).tqh_last == NULL);

	lck_mtx_lock(mbuf_mlock);

	/* Try using the freelist first */
	num = cslab_alloc(class, plist, needed);
	list = *plist;
	if (num == needed) {
		/* Fully satisfied from the freelist; we're done */
		m_alloc_cnt(class) += num;
		lck_mtx_unlock(mbuf_mlock);
		return needed;
	}

	lck_mtx_unlock(mbuf_mlock);

	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */
	needed -= num;

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
	wait |= MCR_COMP;
	if (!(wait & MCR_NOSLEEP)) {
		wait |= MCR_FAILOK;
	}

	/* allocate mbufs */
	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
	if (needed == 0) {
		ASSERT(mp_list == NULL);
		goto fail;
	}

	/* allocate clusters */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}
	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
	if (needed == 0) {
		ASSERT(clp_list == NULL);
		goto fail;
	}

	/* allocate ext_ref structures */
	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
	if (needed == 0) {
		ASSERT(ref_list == NULL);
		goto fail;
	}

	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
		struct mbuf *ms;

		/* Pop one raw object from each of the three lists */
		m = ms = (struct mbuf *)mp_list;
		mp_list = mp_list->obj_next;

		cl = clp_list;
		clp_list = clp_list->obj_next;
		((mcache_obj_t *)cl)->obj_next = NULL;

		rfa = (struct ext_ref *)ref_list;
		ref_list = ref_list->obj_next;
		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;

		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;

			lck_mtx_lock(mbuf_mlock);
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			ms = MCA_SAVED_MBUF_PTR(mca);
			cl_mca = mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl);

			/*
			 * Pair them up. Note that this is done at the time
			 * the mbuf+cluster objects are constructed. This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster. In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;

			ASSERT(mca->mca_uflags & MB_SCVALID);
			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
			lck_mtx_unlock(mbuf_mlock);

			/* Technically, they are in the freelist */
			if (mclverify) {
				size_t size;

				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
				    m_maxsize(MC_MBUF));

				if (class == MC_MBUF_CL) {
					size = m_maxsize(MC_CL);
				} else if (class == MC_MBUF_BIGCL) {
					size = m_maxsize(MC_BIGCL);
				} else {
					size = m_maxsize(MC_16KCL);
				}

				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
				    size);
			}
		}

		/* Construct the composite: attach cluster+ref to the mbuf */
		mbuf_init(ms, 0, MT_FREE);
		if (class == MC_MBUF_16KCL) {
			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else if (class == MC_MBUF_BIGCL) {
			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else {
			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		}
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));

		/* Append the constructed object to the caller's list */
		*list = (mcache_obj_t *)m;
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;
	}

fail:
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (clp_list != NULL) {
		mcache_free_ext(m_cache(cl_class), clp_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	lck_mtx_lock(mbuf_mlock);
	if (num > 0 || cnum > 0) {
		/* Newly constructed objects count toward the class total */
		m_total(class) += cnum;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_alloc_cnt(class) += num + cnum;
	}
	if ((num + cnum) < want) {
		m_fail_cnt(class) += (want - (num + cnum));
	}
	lck_mtx_unlock(mbuf_mlock);

	return num + cnum;
}
2214
2215 /*
2216 * Common de-allocator for composite objects called by the CPU cache
2217 * layer when one or more elements need to be returned to the appropriate
2218 * global freelist.
2219 */
2220 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)2221 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2222 {
2223 mbuf_class_t class = (mbuf_class_t)arg;
2224 unsigned int num;
2225 int w;
2226
2227 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228
2229 lck_mtx_lock(mbuf_mlock);
2230
2231 num = cslab_free(class, list, purged);
2232 m_free_cnt(class) += num;
2233
2234 if ((w = mb_waiters) > 0) {
2235 mb_waiters = 0;
2236 }
2237 if (w) {
2238 mbwdog_logger("waking up all threads");
2239 }
2240
2241 lck_mtx_unlock(mbuf_mlock);
2242
2243 if (w != 0) {
2244 wakeup(mb_waitchan);
2245 }
2246 }
2247
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request. For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller. For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 *
 * Walks "list", validating both the mbuf and its attached cluster and
 * recording the transaction in their audit structures.
 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
	mcache_audit_t *mca;
	struct mbuf *m, *ms;
	mcl_slab_t *clsp, *nsp;
	size_t cl_size;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	/* Map the composite class to its underlying cluster class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		cl_class = MC_16KCL;
	}
	cl_size = m_maxsize(cl_class);

	while ((m = ms = (struct mbuf *)list) != NULL) {
		lck_mtx_lock(mbuf_mlock);
		/* Do the mbuf sanity checks and record its transaction */
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		mcl_audit_mbuf(mca, m, TRUE, alloc);
		if (mcltrace) {
			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
		}

		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}

		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf has been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
		if (!alloc && mclverify) {
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks and record its transaction */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(ms->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		if (class == MC_MBUF_16KCL) {
			int k;
			/* 16KB clusters span multiple slabs; verify the chain */
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}


		mca = mcl_audit_buf2mca(cl_class, cl);
		mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
		if (mcltrace) {
			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
		}

		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
2340
2341 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)2342 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2343 uint64_t alloc_size, kern_return_t error)
2344 {
2345 *cnt = *cnt + 1;
2346 *ts = net_uptime();
2347 if (size) {
2348 *size = alloc_size;
2349 }
2350 switch (error) {
2351 case KERN_SUCCESS:
2352 break;
2353 case KERN_INVALID_ARGUMENT:
2354 mb_kmem_stats[0]++;
2355 break;
2356 case KERN_INVALID_ADDRESS:
2357 mb_kmem_stats[1]++;
2358 break;
2359 case KERN_RESOURCE_SHORTAGE:
2360 mb_kmem_stats[2]++;
2361 break;
2362 case KERN_NO_SPACE:
2363 mb_kmem_stats[3]++;
2364 break;
2365 case KERN_FAILURE:
2366 mb_kmem_stats[4]++;
2367 break;
2368 default:
2369 mb_kmem_stats[5]++;
2370 break;
2371 }
2372 }
2373
2374 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)2375 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
2376 {
2377 vm_offset_t addr = 0;
2378 kern_return_t kr = KERN_SUCCESS;
2379
2380 if (!physContig) {
2381 kr = kmem_alloc(mbmap, &addr, size,
2382 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2383 } else {
2384 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
2385 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2386 }
2387
2388 if (kr != KERN_SUCCESS) {
2389 addr = 0;
2390 }
2391 if (err) {
2392 *err = kr;
2393 }
2394
2395 return addr;
2396 }
2397
2398 /*
2399 * Allocate some number of mbuf clusters and place on cluster freelist.
2400 */
2401 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)2402 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2403 {
2404 int i, count = 0;
2405 vm_size_t size = 0;
2406 int numpages = 0, large_buffer;
2407 vm_offset_t page = 0;
2408 mcache_audit_t *mca_list = NULL;
2409 mcache_obj_t *con_list = NULL;
2410 mcl_slab_t *sp;
2411 mbuf_class_t class;
2412 kern_return_t error;
2413
2414 /* Set if a buffer allocation needs allocation of multiple pages */
2415 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2416 PAGE_SIZE < M16KCLBYTES);
2417 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2418 bufsize == m_maxsize(MC_16KCL));
2419
2420 VERIFY((bufsize == PAGE_SIZE) ||
2421 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2422
2423 if (bufsize == m_size(MC_BIGCL)) {
2424 class = MC_BIGCL;
2425 } else {
2426 class = MC_16KCL;
2427 }
2428
2429 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2430
2431 /*
2432 * Multiple threads may attempt to populate the cluster map one
2433 * after another. Since we drop the lock below prior to acquiring
2434 * the physical page(s), our view of the cluster map may no longer
2435 * be accurate, and we could end up over-committing the pages beyond
2436 * the maximum allowed for each class. To prevent it, this entire
2437 * operation (including the page mapping) is serialized.
2438 */
2439 while (mb_clalloc_busy) {
2440 mb_clalloc_waiters++;
2441 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2442 (PZERO - 1), "m_clalloc", NULL);
2443 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2444 }
2445
2446 /* We are busy now; tell everyone else to go away */
2447 mb_clalloc_busy = TRUE;
2448
2449 /*
2450 * Honor the caller's wish to block or not block. We have a way
2451 * to grow the pool asynchronously using the mbuf worker thread.
2452 */
2453 i = m_howmany(num, bufsize);
2454 if (i <= 0 || (wait & M_DONTWAIT)) {
2455 goto out;
2456 }
2457
2458 lck_mtx_unlock(mbuf_mlock);
2459
2460 size = round_page(i * bufsize);
2461 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2462
2463 /*
2464 * If we did ask for "n" 16KB physically contiguous chunks
2465 * and didn't get them, then please try again without this
2466 * restriction.
2467 */
2468 net_update_uptime();
2469 if (large_buffer && page == 0) {
2470 m_vm_error_stats(&mb_kmem_contig_failed,
2471 &mb_kmem_contig_failed_ts,
2472 &mb_kmem_contig_failed_size,
2473 size, error);
2474 page = kmem_mb_alloc(mb_map, size, 0, &error);
2475 }
2476
2477 if (page == 0) {
2478 m_vm_error_stats(&mb_kmem_failed,
2479 &mb_kmem_failed_ts,
2480 &mb_kmem_failed_size,
2481 size, error);
2482 #if PAGE_SIZE == 4096
2483 if (bufsize == m_maxsize(MC_BIGCL)) {
2484 #else
2485 if (bufsize >= m_maxsize(MC_BIGCL)) {
2486 #endif
2487 /* Try for 1 page if failed */
2488 size = PAGE_SIZE;
2489 page = kmem_mb_alloc(mb_map, size, 0, &error);
2490 if (page == 0) {
2491 m_vm_error_stats(&mb_kmem_one_failed,
2492 &mb_kmem_one_failed_ts,
2493 NULL, size, error);
2494 }
2495 }
2496
2497 if (page == 0) {
2498 lck_mtx_lock(mbuf_mlock);
2499 goto out;
2500 }
2501 }
2502
2503 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2504 numpages = size / PAGE_SIZE;
2505
2506 /* If auditing is enabled, allocate the audit structures now */
2507 if (mclaudit != NULL) {
2508 int needed;
2509
2510 /*
2511 * Yes, I realize this is a waste of memory for clusters
2512 * that never get transformed into mbufs, as we may end
2513 * up with NMBPG-1 unused audit structures per cluster.
2514 * But doing so tremendously simplifies the allocation
2515 * strategy, since at this point we are not holding the
2516 * mbuf lock and the caller is okay to be blocked.
2517 */
2518 if (bufsize == PAGE_SIZE) {
2519 needed = numpages * NMBPG;
2520
2521 i = mcache_alloc_ext(mcl_audit_con_cache,
2522 &con_list, needed, MCR_SLEEP);
2523
2524 VERIFY(con_list != NULL && i == needed);
2525 } else {
2526 /*
2527 * if multiple 4K pages are being used for a
2528 * 16K cluster
2529 */
2530 needed = numpages / NSLABSP16KB;
2531 }
2532
2533 i = mcache_alloc_ext(mcache_audit_cache,
2534 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2535
2536 VERIFY(mca_list != NULL && i == needed);
2537 }
2538
2539 lck_mtx_lock(mbuf_mlock);
2540
2541 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
2542 ppnum_t offset =
2543 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
2544 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2545
2546 /*
2547 * If there is a mapper the appropriate I/O page is
2548 * returned; zero out the page to discard its past
2549 * contents to prevent exposing leftover kernel memory.
2550 */
2551 VERIFY(offset < mcl_pages);
2552 if (mcl_paddr_base != 0) {
2553 bzero((void *)(uintptr_t) page, PAGE_SIZE);
2554 new_page = IOMapperInsertPage(mcl_paddr_base,
2555 offset, new_page);
2556 }
2557 mcl_paddr[offset] = new_page;
2558
2559 /* Pattern-fill this fresh page */
2560 if (mclverify) {
2561 mcache_set_pattern(MCACHE_FREE_PATTERN,
2562 (caddr_t)page, PAGE_SIZE);
2563 }
2564 if (bufsize == PAGE_SIZE) {
2565 mcache_obj_t *buf;
2566 /* One for the entire page */
2567 sp = slab_get((void *)page);
2568 if (mclaudit != NULL) {
2569 mcl_audit_init((void *)page,
2570 &mca_list, &con_list,
2571 AUDIT_CONTENTS_SIZE, NMBPG);
2572 }
2573 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2574 slab_init(sp, class, SLF_MAPPED, (void *)page,
2575 (void *)page, PAGE_SIZE, 0, 1);
2576 buf = (mcache_obj_t *)page;
2577 buf->obj_next = NULL;
2578
2579 /* Insert this slab */
2580 slab_insert(sp, class);
2581
2582 /* Update stats now since slab_get drops the lock */
2583 ++m_infree(class);
2584 ++m_total(class);
2585 VERIFY(m_total(class) <= m_maxlimit(class));
2586 if (class == MC_BIGCL) {
2587 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2588 m_infree(MC_MBUF_BIGCL);
2589 mbstat.m_bigclusters = m_total(MC_BIGCL);
2590 }
2591 ++count;
2592 } else if ((bufsize > PAGE_SIZE) &&
2593 (i % NSLABSP16KB) == 0) {
2594 union m16kcluster *m16kcl = (union m16kcluster *)page;
2595 mcl_slab_t *nsp;
2596 int k;
2597
2598 /* One for the entire 16KB */
2599 sp = slab_get(m16kcl);
2600 if (mclaudit != NULL) {
2601 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2602 }
2603
2604 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2605 slab_init(sp, MC_16KCL, SLF_MAPPED,
2606 m16kcl, m16kcl, bufsize, 0, 1);
2607 m16kcl->m16kcl_next = NULL;
2608
2609 /*
2610 * 2nd-Nth page's slab is part of the first one,
2611 * where N is NSLABSP16KB.
2612 */
2613 for (k = 1; k < NSLABSP16KB; k++) {
2614 nsp = slab_get(((union mbigcluster *)page) + k);
2615 VERIFY(nsp->sl_refcnt == 0 &&
2616 nsp->sl_flags == 0);
2617 slab_init(nsp, MC_16KCL,
2618 SLF_MAPPED | SLF_PARTIAL,
2619 m16kcl, NULL, 0, 0, 0);
2620 }
2621 /* Insert this slab */
2622 slab_insert(sp, MC_16KCL);
2623
2624 /* Update stats now since slab_get drops the lock */
2625 ++m_infree(MC_16KCL);
2626 ++m_total(MC_16KCL);
2627 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2628 ++count;
2629 }
2630 }
2631 VERIFY(mca_list == NULL && con_list == NULL);
2632
2633 /* We're done; let others enter */
2634 mb_clalloc_busy = FALSE;
2635 if (mb_clalloc_waiters > 0) {
2636 mb_clalloc_waiters = 0;
2637 wakeup(mb_clalloc_waitchan);
2638 }
2639
2640 return count;
2641 out:
2642 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2643
2644 mtracelarge_register(size);
2645
2646 /* We're done; let others enter */
2647 mb_clalloc_busy = FALSE;
2648 if (mb_clalloc_waiters > 0) {
2649 mb_clalloc_waiters = 0;
2650 wakeup(mb_clalloc_waitchan);
2651 }
2652
2653 /*
2654 * When non-blocking we kick a thread if we have to grow the
2655 * pool or if the number of free clusters is less than requested.
2656 */
2657 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
2658 mbwdog_logger("waking up the worker thread to to grow %s by %d",
2659 m_cname(class), i);
2660 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
2661 mbuf_worker_needs_wakeup = FALSE;
2662 }
2663 if (class == MC_BIGCL) {
2664 if (i > 0) {
2665 /*
2666 * Remember total number of 4KB clusters needed
2667 * at this time.
2668 */
2669 i += m_total(MC_BIGCL);
2670 if (i > m_region_expand(MC_BIGCL)) {
2671 m_region_expand(MC_BIGCL) = i;
2672 }
2673 }
2674 if (m_infree(MC_BIGCL) >= num) {
2675 return 1;
2676 }
2677 } else {
2678 if (i > 0) {
2679 /*
2680 * Remember total number of 16KB clusters needed
2681 * at this time.
2682 */
2683 i += m_total(MC_16KCL);
2684 if (i > m_region_expand(MC_16KCL)) {
2685 m_region_expand(MC_16KCL) = i;
2686 }
2687 }
2688 if (m_infree(MC_16KCL) >= num) {
2689 return 1;
2690 }
2691 }
2692 return 0;
2693 }
2694
/*
 * Populate the global freelist of the corresponding buffer class.
 *
 * For page-or-larger classes this simply defers to m_clalloc().  For
 * sub-page classes (mbufs, 2K clusters, and 4K clusters when pages are
 * 16K), whole pages are allocated from the "super" class and sliced
 * into smaller objects.  Returns non-zero on success; mbuf_mlock must
 * be held by the caller.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
	mcache_obj_t *o = NULL;
	int i, numpages = 0, count;
	mbuf_class_t super_class;

	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
	    class == MC_16KCL);

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
	    PAGE_SIZE == m_maxsize(MC_16KCL));

	/* Page-sized or larger objects come straight from m_clalloc() */
	if (m_maxsize(class) >= PAGE_SIZE) {
		return m_clalloc(num, wait, m_maxsize(class)) != 0;
	}

	/*
	 * The rest of the function will allocate pages and will slice
	 * them up into the right size
	 */

	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;

	/* Currently assume that pages are 4K or 16K */
	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
		super_class = MC_BIGCL;
	} else {
		super_class = MC_16KCL;
	}

	i = m_clalloc(numpages, wait, m_maxsize(super_class));

	/* how many objects will we cut the page into? */
	int numobj = PAGE_SIZE / m_maxsize(class);

	for (count = 0; count < numpages; count++) {
		/* respect totals, minlimit, maxlimit */
		if (m_total(super_class) <= m_minlimit(super_class) ||
		    m_total(class) >= m_maxlimit(class)) {
			break;
		}

		/* Take one full page from the super class */
		if ((o = slab_alloc(super_class, wait)) == NULL) {
			break;
		}

		struct mbuf *m = (struct mbuf *)o;
		union mcluster *c = (union mcluster *)o;
		union mbigcluster *mbc = (union mbigcluster *)o;
		mcl_slab_t *sp = slab_get(o);
		mcache_audit_t *mca = NULL;

		/*
		 * since one full page will be converted to MC_MBUF or
		 * MC_CL, verify that the reference count will match that
		 * assumption
		 */
		VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
		VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
		/*
		 * Make sure that the cluster is unmolested
		 * while in freelist
		 */
		if (mclverify) {
			mca = mcl_audit_buf2mca(super_class,
			    (mcache_obj_t *)o);
			mcache_audit_free_verify(mca,
			    (mcache_obj_t *)o, 0, m_maxsize(super_class));
		}

		/* Reinitialize it as an mbuf or 2K or 4K slab */
		slab_init(sp, class, sp->sl_flags,
		    sp->sl_base, NULL, PAGE_SIZE, 0, numobj);

		VERIFY(sp->sl_head == NULL);

		/* The page leaves the super class's accounting */
		VERIFY(m_total(super_class) >= 1);
		m_total(super_class)--;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
		}

		m_total(class) += numobj;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_infree(class) += numobj;

		/* Chain the sliced objects onto the slab's freelist head */
		i = numobj;
		if (class == MC_MBUF) {
			mbstat.m_mbufs = m_total(MC_MBUF);
			mtype_stat_add(MT_FREE, NMBPG);
			while (i--) {
				/*
				 * If auditing is enabled, construct the
				 * shadow mbuf in the audit structure
				 * instead of the actual one.
				 * mbuf_slab_audit() will take care of
				 * restoring the contents after the
				 * integrity check.
				 */
				if (mclaudit != NULL) {
					struct mbuf *ms;
					mca = mcl_audit_buf2mca(MC_MBUF,
					    (mcache_obj_t *)m);
					ms = MCA_SAVED_MBUF_PTR(mca);
					ms->m_type = MT_FREE;
				} else {
					m->m_type = MT_FREE;
				}
				m->m_next = sp->sl_head;
				sp->sl_head = (void *)m++;
			}
		} else if (class == MC_CL) { /* MC_CL */
			mbstat.m_clfree =
			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
			mbstat.m_clusters = m_total(MC_CL);
			while (i--) {
				c->mcl_next = sp->sl_head;
				sp->sl_head = (void *)c++;
			}
		} else {
			VERIFY(class == MC_BIGCL);
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
			while (i--) {
				mbc->mbc_next = sp->sl_head;
				sp->sl_head = (void *)mbc++;
			}
		}

		/* Insert into the mbuf or 2k or 4k slab list */
		slab_insert(sp, class);

		/* New objects may satisfy blocked allocators */
		if ((i = mb_waiters) > 0) {
			mb_waiters = 0;
		}
		if (i != 0) {
			mbwdog_logger("waking up all threads");
			wakeup(mb_waitchan);
		}
	}
	return count != 0;
}
2845
2846 /*
2847 * For each class, initialize the freelist to hold m_minlimit() objects.
2848 */
2849 static void
2850 freelist_init(mbuf_class_t class)
2851 {
2852 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853
2854 VERIFY(class == MC_CL || class == MC_BIGCL);
2855 VERIFY(m_total(class) == 0);
2856 VERIFY(m_minlimit(class) > 0);
2857
2858 while (m_total(class) < m_minlimit(class)) {
2859 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
2860 }
2861
2862 VERIFY(m_total(class) >= m_minlimit(class));
2863 }
2864
2865 /*
2866 * (Inaccurately) check if it might be worth a trip back to the
2867 * mcache layer due the availability of objects there. We'll
2868 * end up back here if there's nothing up there.
2869 */
2870 static boolean_t
2871 mbuf_cached_above(mbuf_class_t class, int wait)
2872 {
2873 switch (class) {
2874 case MC_MBUF:
2875 if (wait & MCR_COMP) {
2876 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2877 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
2878 }
2879 break;
2880
2881 case MC_CL:
2882 if (wait & MCR_COMP) {
2883 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
2884 }
2885 break;
2886
2887 case MC_BIGCL:
2888 if (wait & MCR_COMP) {
2889 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
2890 }
2891 break;
2892
2893 case MC_16KCL:
2894 if (wait & MCR_COMP) {
2895 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
2896 }
2897 break;
2898
2899 case MC_MBUF_CL:
2900 case MC_MBUF_BIGCL:
2901 case MC_MBUF_16KCL:
2902 break;
2903
2904 default:
2905 VERIFY(0);
2906 /* NOTREACHED */
2907 }
2908
2909 return !mcache_bkt_isempty(m_cache(class));
2910 }
2911
2912 /*
2913 * If possible, convert constructed objects to raw ones.
2914 */
2915 static boolean_t
2916 mbuf_steal(mbuf_class_t class, unsigned int num)
2917 {
2918 mcache_obj_t *top = NULL;
2919 mcache_obj_t **list = ⊤
2920 unsigned int tot = 0;
2921
2922 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2923
2924 switch (class) {
2925 case MC_MBUF:
2926 case MC_CL:
2927 case MC_BIGCL:
2928 case MC_16KCL:
2929 return FALSE;
2930
2931 case MC_MBUF_CL:
2932 case MC_MBUF_BIGCL:
2933 case MC_MBUF_16KCL:
2934 /* Get the required number of constructed objects if possible */
2935 if (m_infree(class) > m_minlimit(class)) {
2936 tot = cslab_alloc(class, &list,
2937 MIN(num, m_infree(class)));
2938 }
2939
2940 /* And destroy them to get back the raw objects */
2941 if (top != NULL) {
2942 (void) cslab_free(class, top, 1);
2943 }
2944 break;
2945
2946 default:
2947 VERIFY(0);
2948 /* NOTREACHED */
2949 }
2950
2951 return tot == num;
2952 }
2953
/*
 * Reclaim memory for "class" by stealing from, and if necessary purging,
 * the caches of related classes.  "num" is the number of objects wanted;
 * "comp" indicates the request originated from a composite class (its
 * own composite cache is then spared).  Called with mbuf_mlock held;
 * the lock is dropped and re-acquired around the purge/reap.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class. If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < MC_MAX; m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes. Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < MC_MAX; m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				/* Re-take the lock just for the stat updates */
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	lck_mtx_lock(mbuf_mlock);
}
3053
3054 struct mbuf *
3055 m_get_common(int wait, short type, int hdr)
3056 {
3057 struct mbuf *m;
3058
3059 int mcflags = MSLEEPF(wait);
3060
3061 /* Is this due to a non-blocking retry? If so, then try harder */
3062 if (mcflags & MCR_NOSLEEP) {
3063 mcflags |= MCR_TRYHARD;
3064 }
3065
3066 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3067 if (m != NULL) {
3068 mbuf_init(m, hdr, type);
3069 mtype_stat_inc(type);
3070 mtype_stat_dec(MT_FREE);
3071 }
3072 return m;
3073 }
3074
3075 /*
3076 * Space allocation routines; these are also available as macros
3077 * for critical paths.
3078 */
3079 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3080
/*
 * Free a single mbuf (and its external storage, if any) and return the
 * next mbuf in the chain.  Panics on a double free.  Composite
 * mbuf + cluster pairs are returned intact to their intermediate cache;
 * paired mbufs are handled by m_free_paired().
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/*
			 * Last reference on a non-composite cluster: free
			 * the buffer back to the cache matching its free
			 * routine (NULL means a regular 2KB cluster).
			 */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			return n;
		}
	}

	mtype_stat_dec(m->m_type);
	mtype_stat_inc(MT_FREE);

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return n;
}
3168
/*
 * Attach an external buffer `extbuf' of size `extsize' (freed via
 * `extfree' with `extarg') to mbuf `m', allocating a fresh mbuf when
 * `m' is NULL.  Any external storage already attached to `m' is
 * released first.  When `pair' is nonzero the mbuf and cluster are
 * bound as a paired mbuf (EXTF_PAIRED).  Returns NULL on allocation
 * failure, or if a pairing was requested on an already-paired mbuf.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster.  Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/*
			 * Last reference on a non-composite cluster: free
			 * the old buffer to its matching cache, keeping the
			 * ext_ref structure for reuse below.
			 */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		mext_init(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
3267
3268 /*
3269 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3270 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3271 */
3272 struct mbuf *
3273 m_getcl(int wait, int type, int flags)
3274 {
3275 struct mbuf *m = NULL;
3276 int hdr = (flags & M_PKTHDR);
3277
3278 int mcflags = MSLEEPF(wait);
3279
3280 /* Is this due to a non-blocking retry? If so, then try harder */
3281 if (mcflags & MCR_NOSLEEP) {
3282 mcflags |= MCR_TRYHARD;
3283 }
3284
3285 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3286 if (m != NULL) {
3287 u_int16_t flag;
3288 struct ext_ref *rfa;
3289 void *cl;
3290
3291 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3292 cl = m->m_ext.ext_buf;
3293 rfa = m_get_rfa(m);
3294
3295 ASSERT(cl != NULL && rfa != NULL);
3296 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3297
3298 flag = MEXT_FLAGS(m);
3299
3300 mbuf_init(m, hdr, type);
3301 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3302
3303 mtype_stat_inc(type);
3304 mtype_stat_dec(MT_FREE);
3305 }
3306 return m;
3307 }
3308
3309 /* m_mclget() add an mbuf cluster to a normal mbuf */
3310 struct mbuf *
3311 m_mclget(struct mbuf *m, int wait)
3312 {
3313 struct ext_ref *rfa = NULL;
3314
3315 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3316 return m;
3317 }
3318 m->m_ext.ext_buf = m_mclalloc(wait);
3319 if (m->m_ext.ext_buf != NULL) {
3320 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3321 } else {
3322 mcache_free(ref_cache, rfa);
3323 }
3324
3325 return m;
3326 }
3327
3328 /* Allocate an mbuf cluster */
3329 caddr_t
3330 m_mclalloc(int wait)
3331 {
3332 int mcflags = MSLEEPF(wait);
3333
3334 /* Is this due to a non-blocking retry? If so, then try harder */
3335 if (mcflags & MCR_NOSLEEP) {
3336 mcflags |= MCR_TRYHARD;
3337 }
3338
3339 return mcache_alloc(m_cache(MC_CL), mcflags);
3340 }
3341
/* Free an mbuf cluster back to the 2KB cluster cache */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
3348
3349 __private_extern__ caddr_t
3350 m_bigalloc(int wait)
3351 {
3352 int mcflags = MSLEEPF(wait);
3353
3354 /* Is this due to a non-blocking retry? If so, then try harder */
3355 if (mcflags & MCR_NOSLEEP) {
3356 mcflags |= MCR_TRYHARD;
3357 }
3358
3359 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
3360 }
3361
/* Free a 4KB cluster back to the big-cluster cache */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
3367
3368 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3369 __private_extern__ struct mbuf *
3370 m_mbigget(struct mbuf *m, int wait)
3371 {
3372 struct ext_ref *rfa = NULL;
3373
3374 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3375 return m;
3376 }
3377 m->m_ext.ext_buf = m_bigalloc(wait);
3378 if (m->m_ext.ext_buf != NULL) {
3379 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3380 } else {
3381 mcache_free(ref_cache, rfa);
3382 }
3383 return m;
3384 }
3385
3386 __private_extern__ caddr_t
3387 m_16kalloc(int wait)
3388 {
3389 int mcflags = MSLEEPF(wait);
3390
3391 /* Is this due to a non-blocking retry? If so, then try harder */
3392 if (mcflags & MCR_NOSLEEP) {
3393 mcflags |= MCR_TRYHARD;
3394 }
3395
3396 return mcache_alloc(m_cache(MC_16KCL), mcflags);
3397 }
3398
/* Free a 16KB cluster back to the jumbo-cluster cache */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
3404
3405 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3406 __private_extern__ struct mbuf *
3407 m_m16kget(struct mbuf *m, int wait)
3408 {
3409 struct ext_ref *rfa = NULL;
3410
3411 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3412 return m;
3413 }
3414 m->m_ext.ext_buf = m_16kalloc(wait);
3415 if (m->m_ext.ext_buf != NULL) {
3416 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3417 } else {
3418 mcache_free(ref_cache, rfa);
3419 }
3420
3421 return m;
3422 }
3423
3424 /*
3425 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3426 * if wantall is not set, return whatever number were available. Set up the
3427 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3428 * are chained on the m_nextpkt field. Any packets requested beyond this
3429 * are chained onto the last packet header's m_next field. The size of
3430 * the cluster is controlled by the parameter bufsize.
3431 */
3432 __private_extern__ struct mbuf *
3433 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3434 int wait, int wantall, size_t bufsize)
3435 {
3436 struct mbuf *m = NULL;
3437 struct mbuf **np, *top;
3438 unsigned int pnum, needed = *num_needed;
3439 mcache_obj_t *mp_list = NULL;
3440 int mcflags = MSLEEPF(wait);
3441 mcache_t *cp;
3442 u_int16_t flag;
3443 struct ext_ref *rfa;
3444 void *cl;
3445
3446 ASSERT(bufsize == m_maxsize(MC_CL) ||
3447 bufsize == m_maxsize(MC_BIGCL) ||
3448 bufsize == m_maxsize(MC_16KCL));
3449
3450 top = NULL;
3451 np = ⊤
3452 pnum = 0;
3453
3454 /*
3455 * The caller doesn't want all the requested buffers; only some.
3456 * Try hard to get what we can, but don't block. This effectively
3457 * overrides MCR_SLEEP, since this thread will not go to sleep
3458 * if we can't get all the buffers.
3459 */
3460 if (!wantall || (mcflags & MCR_NOSLEEP)) {
3461 mcflags |= MCR_TRYHARD;
3462 }
3463
3464 /* Allocate the composite mbuf + cluster elements from the cache */
3465 if (bufsize == m_maxsize(MC_CL)) {
3466 cp = m_cache(MC_MBUF_CL);
3467 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3468 cp = m_cache(MC_MBUF_BIGCL);
3469 } else {
3470 cp = m_cache(MC_MBUF_16KCL);
3471 }
3472 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3473
3474 for (pnum = 0; pnum < needed; pnum++) {
3475 m = (struct mbuf *)mp_list;
3476 mp_list = mp_list->obj_next;
3477
3478 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3479 cl = m->m_ext.ext_buf;
3480 rfa = m_get_rfa(m);
3481
3482 ASSERT(cl != NULL && rfa != NULL);
3483 VERIFY(MBUF_IS_COMPOSITE(m));
3484
3485 flag = MEXT_FLAGS(m);
3486
3487 mbuf_init(m, num_with_pkthdrs, MT_DATA);
3488 if (bufsize == m_maxsize(MC_16KCL)) {
3489 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3490 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3491 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3492 } else {
3493 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3494 }
3495
3496 if (num_with_pkthdrs > 0) {
3497 --num_with_pkthdrs;
3498 }
3499
3500 *np = m;
3501 if (num_with_pkthdrs > 0) {
3502 np = &m->m_nextpkt;
3503 } else {
3504 np = &m->m_next;
3505 }
3506 }
3507 ASSERT(pnum != *num_needed || mp_list == NULL);
3508 if (mp_list != NULL) {
3509 mcache_free_ext(cp, mp_list);
3510 }
3511 if (pnum > 0) {
3512 mtype_stat_add(MT_DATA, pnum);
3513 mtype_stat_sub(MT_FREE, pnum);
3514 }
3515
3516 if (wantall && (pnum != *num_needed)) {
3517 if (top != NULL) {
3518 m_freem_list(top);
3519 }
3520 return NULL;
3521 }
3522
3523 if (pnum > *num_needed) {
3524 printf("%s: File a radar related to <rdar://10146739>. \
3525 needed = %u, pnum = %u, num_needed = %u \n",
3526 __func__, needed, pnum, *num_needed);
3527 }
3528 *num_needed = pnum;
3529
3530 return top;
3531 }
3532
3533 /*
3534 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
3535 * wantall is not set, return whatever number were available. The size of
3536 * each mbuf in the list is controlled by the parameter packetlen. Each
3537 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3538 * in the chain is called a segment. If maxsegments is not null and the
3539 * value pointed to is not null, this specify the maximum number of segments
3540 * for a chain of mbufs. If maxsegments is zero or the value pointed to
3541 * is zero the caller does not have any restriction on the number of segments.
3542 * The actual number of segments of a mbuf chain is return in the value
3543 * pointed to by maxsegments.
3544 */
3545 __private_extern__ struct mbuf *
3546 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3547 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3548 {
3549 struct mbuf **np, *top, *first = NULL;
3550 size_t bufsize, r_bufsize;
3551 unsigned int num = 0;
3552 unsigned int nsegs = 0;
3553 unsigned int needed = 0, resid;
3554 int mcflags = MSLEEPF(wait);
3555 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3556 mcache_t *cp = NULL, *rcp = NULL;
3557
3558 if (*numlist == 0) {
3559 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
3560 return NULL;
3561 }
3562
3563 top = NULL;
3564 np = ⊤
3565
3566 if (wantsize == 0) {
3567 if (packetlen <= MINCLSIZE) {
3568 bufsize = packetlen;
3569 } else if (packetlen > m_maxsize(MC_CL)) {
3570 /* Use 4KB if jumbo cluster pool isn't available */
3571 if (packetlen <= m_maxsize(MC_BIGCL)) {
3572 bufsize = m_maxsize(MC_BIGCL);
3573 } else {
3574 bufsize = m_maxsize(MC_16KCL);
3575 }
3576 } else {
3577 bufsize = m_maxsize(MC_CL);
3578 }
3579 } else if (wantsize == m_maxsize(MC_CL) ||
3580 wantsize == m_maxsize(MC_BIGCL) ||
3581 wantsize == m_maxsize(MC_16KCL)) {
3582 bufsize = wantsize;
3583 } else {
3584 *numlist = 0;
3585 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
3586 return NULL;
3587 }
3588
3589 if (bufsize <= MHLEN) {
3590 nsegs = 1;
3591 } else if (bufsize <= MINCLSIZE) {
3592 if (maxsegments != NULL && *maxsegments == 1) {
3593 bufsize = m_maxsize(MC_CL);
3594 nsegs = 1;
3595 } else {
3596 nsegs = 2;
3597 }
3598 } else if (bufsize == m_maxsize(MC_16KCL)) {
3599 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
3600 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3601 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
3602 } else {
3603 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3604 }
3605 if (maxsegments != NULL) {
3606 if (*maxsegments && nsegs > *maxsegments) {
3607 *maxsegments = nsegs;
3608 *numlist = 0;
3609 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
3610 return NULL;
3611 }
3612 *maxsegments = nsegs;
3613 }
3614
3615 /*
3616 * The caller doesn't want all the requested buffers; only some.
3617 * Try hard to get what we can, but don't block. This effectively
3618 * overrides MCR_SLEEP, since this thread will not go to sleep
3619 * if we can't get all the buffers.
3620 */
3621 if (!wantall || (mcflags & MCR_NOSLEEP)) {
3622 mcflags |= MCR_TRYHARD;
3623 }
3624
3625 /*
3626 * Simple case where all elements in the lists/chains are mbufs.
3627 * Unless bufsize is greater than MHLEN, each segment chain is made
3628 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3629 * of 2 mbufs; the second one is used for the residual data, i.e.
3630 * the remaining data that cannot fit into the first mbuf.
3631 */
3632 if (bufsize <= MINCLSIZE) {
3633 /* Allocate the elements in one shot from the mbuf cache */
3634 ASSERT(bufsize <= MHLEN || nsegs == 2);
3635 cp = m_cache(MC_MBUF);
3636 needed = mcache_alloc_ext(cp, &mp_list,
3637 (*numlist) * nsegs, mcflags);
3638
3639 /*
3640 * The number of elements must be even if we are to use an
3641 * mbuf (instead of a cluster) to store the residual data.
3642 * If we couldn't allocate the requested number of mbufs,
3643 * trim the number down (if it's odd) in order to avoid
3644 * creating a partial segment chain.
3645 */
3646 if (bufsize > MHLEN && (needed & 0x1)) {
3647 needed--;
3648 }
3649
3650 while (num < needed) {
3651 struct mbuf *m = NULL;
3652
3653 m = (struct mbuf *)mp_list;
3654 mp_list = mp_list->obj_next;
3655 ASSERT(m != NULL);
3656
3657 mbuf_init(m, 1, MT_DATA);
3658 num++;
3659 if (bufsize > MHLEN) {
3660 /* A second mbuf for this segment chain */
3661 m->m_next = (struct mbuf *)mp_list;
3662 mp_list = mp_list->obj_next;
3663
3664 ASSERT(m->m_next != NULL);
3665
3666 mbuf_init(m->m_next, 0, MT_DATA);
3667 num++;
3668 }
3669 *np = m;
3670 np = &m->m_nextpkt;
3671 }
3672 ASSERT(num != *numlist || mp_list == NULL);
3673
3674 if (num > 0) {
3675 mtype_stat_add(MT_DATA, num);
3676 mtype_stat_sub(MT_FREE, num);
3677 }
3678 num /= nsegs;
3679
3680 /* We've got them all; return to caller */
3681 if (num == *numlist) {
3682 return top;
3683 }
3684
3685 goto fail;
3686 }
3687
3688 /*
3689 * Complex cases where elements are made up of one or more composite
3690 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3691 * be illustrated as follows:
3692 *
3693 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3694 *
3695 * Every composite mbuf + cluster element comes from the intermediate
3696 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3697 * the last composite element will come from the MC_MBUF_CL cache,
3698 * unless the residual data is larger than 2KB where we use the
3699 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3700 * data is defined as extra data beyond the first element that cannot
3701 * fit into the previous element, i.e. there is no residual data if
3702 * the chain only has 1 segment.
3703 */
3704 r_bufsize = bufsize;
3705 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3706 if (resid > 0) {
3707 /* There is residual data; figure out the cluster size */
3708 if (wantsize == 0 && packetlen > MINCLSIZE) {
3709 /*
3710 * Caller didn't request that all of the segments
3711 * in the chain use the same cluster size; use the
3712 * smaller of the cluster sizes.
3713 */
3714 if (resid > m_maxsize(MC_BIGCL)) {
3715 r_bufsize = m_maxsize(MC_16KCL);
3716 } else if (resid > m_maxsize(MC_CL)) {
3717 r_bufsize = m_maxsize(MC_BIGCL);
3718 } else {
3719 r_bufsize = m_maxsize(MC_CL);
3720 }
3721 } else {
3722 /* Use the same cluster size as the other segments */
3723 resid = 0;
3724 }
3725 }
3726
3727 needed = *numlist;
3728 if (resid > 0) {
3729 /*
3730 * Attempt to allocate composite mbuf + cluster elements for
3731 * the residual data in each chain; record the number of such
3732 * elements that can be allocated so that we know how many
3733 * segment chains we can afford to create.
3734 */
3735 if (r_bufsize <= m_maxsize(MC_CL)) {
3736 rcp = m_cache(MC_MBUF_CL);
3737 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
3738 rcp = m_cache(MC_MBUF_BIGCL);
3739 } else {
3740 rcp = m_cache(MC_MBUF_16KCL);
3741 }
3742 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3743 if (needed == 0) {
3744 goto fail;
3745 }
3746
3747 /* This is temporarily reduced for calculation */
3748 ASSERT(nsegs > 1);
3749 nsegs--;
3750 }
3751
3752 /*
3753 * Attempt to allocate the rest of the composite mbuf + cluster
3754 * elements for the number of segment chains that we need.
3755 */
3756 if (bufsize <= m_maxsize(MC_CL)) {
3757 cp = m_cache(MC_MBUF_CL);
3758 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
3759 cp = m_cache(MC_MBUF_BIGCL);
3760 } else {
3761 cp = m_cache(MC_MBUF_16KCL);
3762 }
3763 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3764
3765 /* Round it down to avoid creating a partial segment chain */
3766 needed = (needed / nsegs) * nsegs;
3767 if (needed == 0) {
3768 goto fail;
3769 }
3770
3771 if (resid > 0) {
3772 /*
3773 * We're about to construct the chain(s); take into account
3774 * the number of segments we have created above to hold the
3775 * residual data for each chain, as well as restore the
3776 * original count of segments per chain.
3777 */
3778 ASSERT(nsegs > 0);
3779 needed += needed / nsegs;
3780 nsegs++;
3781 }
3782
3783 for (;;) {
3784 struct mbuf *m = NULL;
3785 u_int16_t flag;
3786 struct ext_ref *rfa;
3787 void *cl;
3788 int pkthdr;
3789 m_ext_free_func_t m_free_func;
3790
3791 ++num;
3792
3793 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3794 m = (struct mbuf *)mp_list;
3795 mp_list = mp_list->obj_next;
3796 } else {
3797 m = (struct mbuf *)rmp_list;
3798 rmp_list = rmp_list->obj_next;
3799 }
3800 m_free_func = m_get_ext_free(m);
3801 ASSERT(m != NULL);
3802 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3803 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
3804 m_free_func == m_16kfree);
3805
3806 cl = m->m_ext.ext_buf;
3807 rfa = m_get_rfa(m);
3808
3809 ASSERT(cl != NULL && rfa != NULL);
3810 VERIFY(MBUF_IS_COMPOSITE(m));
3811
3812 flag = MEXT_FLAGS(m);
3813
3814 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3815 if (pkthdr) {
3816 first = m;
3817 }
3818 mbuf_init(m, pkthdr, MT_DATA);
3819 if (m_free_func == m_16kfree) {
3820 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3821 } else if (m_free_func == m_bigfree) {
3822 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3823 } else {
3824 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3825 }
3826
3827 *np = m;
3828 if ((num % nsegs) == 0) {
3829 np = &first->m_nextpkt;
3830 } else {
3831 np = &m->m_next;
3832 }
3833
3834 if (num == needed) {
3835 break;
3836 }
3837 }
3838
3839 if (num > 0) {
3840 mtype_stat_add(MT_DATA, num);
3841 mtype_stat_sub(MT_FREE, num);
3842 }
3843
3844 num /= nsegs;
3845
3846 /* We've got them all; return to caller */
3847 if (num == *numlist) {
3848 ASSERT(mp_list == NULL && rmp_list == NULL);
3849 return top;
3850 }
3851
3852 fail:
3853 /* Free up what's left of the above */
3854 if (mp_list != NULL) {
3855 mcache_free_ext(cp, mp_list);
3856 }
3857 if (rmp_list != NULL) {
3858 mcache_free_ext(rcp, rmp_list);
3859 }
3860 if (wantall && top != NULL) {
3861 m_freem_list(top);
3862 *numlist = 0;
3863 return NULL;
3864 }
3865 *numlist = num;
3866 return top;
3867 }
3868
3869 /*
3870 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3871 * for mbufs packets freed. Used by the drivers.
3872 */
3873 int
3874 m_freem_list(struct mbuf *m)
3875 {
3876 struct mbuf *nextpkt;
3877 mcache_obj_t *mp_list = NULL;
3878 mcache_obj_t *mcl_list = NULL;
3879 mcache_obj_t *mbc_list = NULL;
3880 mcache_obj_t *m16k_list = NULL;
3881 mcache_obj_t *m_mcl_list = NULL;
3882 mcache_obj_t *m_mbc_list = NULL;
3883 mcache_obj_t *m_m16k_list = NULL;
3884 mcache_obj_t *ref_list = NULL;
3885 int pktcount = 0;
3886 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3887
3888 while (m != NULL) {
3889 pktcount++;
3890
3891 nextpkt = m->m_nextpkt;
3892 m->m_nextpkt = NULL;
3893
3894 while (m != NULL) {
3895 struct mbuf *next = m->m_next;
3896 mcache_obj_t *o, *rfa;
3897 if (m->m_type == MT_FREE) {
3898 panic("m_free: freeing an already freed mbuf");
3899 }
3900
3901 if (m->m_flags & M_PKTHDR) {
3902 /* Free the aux data and tags if there is any */
3903 m_tag_delete_chain(m);
3904 m_do_tx_compl_callback(m, NULL);
3905 }
3906
3907 if (!(m->m_flags & M_EXT)) {
3908 mt_free++;
3909 goto simple_free;
3910 }
3911
3912 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
3913 m = next;
3914 continue;
3915 }
3916
3917 mt_free++;
3918
3919 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
3920 /*
3921 * Make sure that we don't touch any ext_ref
3922 * member after we decrement the reference count
3923 * since that may lead to use-after-free
3924 * when we do not hold the last reference.
3925 */
3926 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
3927 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
3928 const uint16_t minref = MEXT_MINREF(m);
3929 const uint16_t refcnt = m_decref(m);
3930 if (refcnt == minref && !composite) {
3931 if (m_free_func == NULL) {
3932 o->obj_next = mcl_list;
3933 mcl_list = o;
3934 } else if (m_free_func == m_bigfree) {
3935 o->obj_next = mbc_list;
3936 mbc_list = o;
3937 } else if (m_free_func == m_16kfree) {
3938 o->obj_next = m16k_list;
3939 m16k_list = o;
3940 } else {
3941 (*(m_free_func))((caddr_t)o,
3942 m->m_ext.ext_size,
3943 m_get_ext_arg(m));
3944 }
3945 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
3946 rfa->obj_next = ref_list;
3947 ref_list = rfa;
3948 m_set_ext(m, NULL, NULL, NULL);
3949 } else if (refcnt == minref && composite) {
3950 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3951 /*
3952 * Amortize the costs of atomic operations
3953 * by doing them at the end, if possible.
3954 */
3955 if (m->m_type == MT_DATA) {
3956 mt_data++;
3957 } else if (m->m_type == MT_HEADER) {
3958 mt_header++;
3959 } else if (m->m_type == MT_SONAME) {
3960 mt_soname++;
3961 } else if (m->m_type == MT_TAG) {
3962 mt_tag++;
3963 } else {
3964 mtype_stat_dec(m->m_type);
3965 }
3966
3967 m->m_type = MT_FREE;
3968 m->m_flags = M_EXT;
3969 m->m_len = 0;
3970 m->m_next = m->m_nextpkt = NULL;
3971
3972 /*
3973 * MEXT_FLAGS is safe to access here
3974 * since we are now sure that we held
3975 * the last reference to ext_ref.
3976 */
3977 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3978
3979 /* "Free" into the intermediate cache */
3980 o = (mcache_obj_t *)m;
3981 if (m_free_func == NULL) {
3982 o->obj_next = m_mcl_list;
3983 m_mcl_list = o;
3984 } else if (m_free_func == m_bigfree) {
3985 o->obj_next = m_mbc_list;
3986 m_mbc_list = o;
3987 } else {
3988 VERIFY(m_free_func == m_16kfree);
3989 o->obj_next = m_m16k_list;
3990 m_m16k_list = o;
3991 }
3992 m = next;
3993 continue;
3994 }
3995 simple_free:
3996 /*
3997 * Amortize the costs of atomic operations
3998 * by doing them at the end, if possible.
3999 */
4000 if (m->m_type == MT_DATA) {
4001 mt_data++;
4002 } else if (m->m_type == MT_HEADER) {
4003 mt_header++;
4004 } else if (m->m_type == MT_SONAME) {
4005 mt_soname++;
4006 } else if (m->m_type == MT_TAG) {
4007 mt_tag++;
4008 } else if (m->m_type != MT_FREE) {
4009 mtype_stat_dec(m->m_type);
4010 }
4011
4012 m->m_type = MT_FREE;
4013 m->m_flags = m->m_len = 0;
4014 m->m_next = m->m_nextpkt = NULL;
4015
4016 ((mcache_obj_t *)m)->obj_next = mp_list;
4017 mp_list = (mcache_obj_t *)m;
4018
4019 m = next;
4020 }
4021
4022 m = nextpkt;
4023 }
4024
4025 if (mt_free > 0) {
4026 mtype_stat_add(MT_FREE, mt_free);
4027 }
4028 if (mt_data > 0) {
4029 mtype_stat_sub(MT_DATA, mt_data);
4030 }
4031 if (mt_header > 0) {
4032 mtype_stat_sub(MT_HEADER, mt_header);
4033 }
4034 if (mt_soname > 0) {
4035 mtype_stat_sub(MT_SONAME, mt_soname);
4036 }
4037 if (mt_tag > 0) {
4038 mtype_stat_sub(MT_TAG, mt_tag);
4039 }
4040 if (mp_list != NULL) {
4041 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4042 }
4043 if (mcl_list != NULL) {
4044 mcache_free_ext(m_cache(MC_CL), mcl_list);
4045 }
4046 if (mbc_list != NULL) {
4047 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4048 }
4049 if (m16k_list != NULL) {
4050 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4051 }
4052 if (m_mcl_list != NULL) {
4053 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4054 }
4055 if (m_mbc_list != NULL) {
4056 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4057 }
4058 if (m_m16k_list != NULL) {
4059 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4060 }
4061 if (ref_list != NULL) {
4062 mcache_free_ext(ref_cache, ref_list);
4063 }
4064
4065 return pktcount;
4066 }
4067
4068 /*
4069 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4070 * within this routine also.
4071 *
4072 * The last mbuf and offset accessed are passed in and adjusted on return to
4073 * avoid having to iterate over the entire mbuf chain each time.
4074 */
4075 struct mbuf *
4076 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
4077 struct mbuf **m_lastm, int *m_off, uint32_t mode)
4078 {
4079 struct mbuf *m = m0, *n, **np = NULL;
4080 int off = off0, len = len0;
4081 struct mbuf *top = NULL;
4082 int mcflags = MSLEEPF(wait);
4083 mcache_obj_t *list = NULL;
4084 int copyhdr = 0;
4085 int type = 0;
4086 int needed = 0;
4087
4088 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4089 copyhdr = 1;
4090 }
4091
4092 if (m_lastm != NULL && *m_lastm != NULL) {
4093 if (off0 >= *m_off) {
4094 m = *m_lastm;
4095 off = off0 - *m_off;
4096 }
4097 }
4098
4099 while (off >= m->m_len) {
4100 off -= m->m_len;
4101 m = m->m_next;
4102 }
4103
4104 n = m;
4105 while (len > 0) {
4106 needed++;
4107 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4108 n = n->m_next;
4109 }
4110 needed++;
4111 len = len0;
4112
4113 /*
4114 * If the caller doesn't want to be put to sleep, mark it with
4115 * MCR_TRYHARD so that we may reclaim buffers from other places
4116 * before giving up.
4117 */
4118 if (mcflags & MCR_NOSLEEP) {
4119 mcflags |= MCR_TRYHARD;
4120 }
4121
4122 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4123 mcflags) != needed) {
4124 goto nospace;
4125 }
4126
4127 needed = 0;
4128 while (len > 0) {
4129 n = (struct mbuf *)list;
4130 list = list->obj_next;
4131 ASSERT(n != NULL && m != NULL);
4132
4133 type = (top == NULL) ? MT_HEADER : m->m_type;
4134 mbuf_init(n, (top == NULL), type);
4135
4136 if (top == NULL) {
4137 top = n;
4138 np = &top->m_next;
4139 continue;
4140 } else {
4141 needed++;
4142 *np = n;
4143 }
4144
4145 if (copyhdr) {
4146 if ((mode == M_COPYM_MOVE_HDR) ||
4147 (mode == M_COPYM_MUST_MOVE_HDR)) {
4148 M_COPY_PKTHDR(n, m);
4149 } else if ((mode == M_COPYM_COPY_HDR) ||
4150 (mode == M_COPYM_MUST_COPY_HDR)) {
4151 if (m_dup_pkthdr(n, m, wait) == 0) {
4152 goto nospace;
4153 }
4154 }
4155 n->m_pkthdr.len = len;
4156 copyhdr = 0;
4157 }
4158 n->m_len = MIN(len, (m->m_len - off));
4159
4160 if (m->m_flags & M_EXT) {
4161 n->m_ext = m->m_ext;
4162 m_incref(m);
4163 n->m_data = m->m_data + off;
4164 n->m_flags |= M_EXT;
4165 } else {
4166 if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
4167 panic("%s n %p copy overflow",
4168 __func__, n);
4169 }
4170
4171 bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
4172 (unsigned)n->m_len);
4173 }
4174 len -= n->m_len;
4175
4176 if (len == 0) {
4177 if (m_lastm != NULL) {
4178 *m_lastm = m;
4179 *m_off = off0 + len0 - (off + n->m_len);
4180 }
4181 break;
4182 }
4183 off = 0;
4184 m = m->m_next;
4185 np = &n->m_next;
4186 }
4187
4188 mtype_stat_inc(MT_HEADER);
4189 mtype_stat_add(type, needed);
4190 mtype_stat_sub(MT_FREE, needed + 1);
4191
4192 ASSERT(list == NULL);
4193
4194 return top;
4195
4196 nospace:
4197 if (list != NULL) {
4198 mcache_free_ext(m_cache(MC_MBUF), list);
4199 }
4200 if (top != NULL) {
4201 m_freem(top);
4202 }
4203 return NULL;
4204 }
4205
4206 #ifndef MBUF_GROWTH_NORMAL_THRESH
4207 #define MBUF_GROWTH_NORMAL_THRESH 25
4208 #endif
4209
4210 /*
4211 * Cluster freelist allocation check.
4212 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	/* Only 4KB (big) and 16KB cluster requests are serviced here */
	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	/* Free counts, also normalized to 2K cluster units */
	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/* % of the pool currently in use, and % of the map populated */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		/* i is in 4KB units: i << 1 converts to 2K-cluster units */
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	/* Number of clusters (of size 'bufsize') the caller should add */
	return i;
}
4333
4334 uint64_t
4335 mcl_to_paddr(char *addr)
4336 {
4337 vm_offset_t base_phys;
4338
4339 if (!MBUF_IN_MAP(addr)) {
4340 return 0;
4341 }
4342 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
4343
4344 if (base_phys == 0) {
4345 return 0;
4346 }
4347 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
4348 }
4349
4350 /*
4351 * Inform the corresponding mcache(s) that there's a waiter below.
4352 */
4353 static void
4354 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4355 {
4356 mcache_waiter_inc(m_cache(class));
4357 if (comp) {
4358 if (class == MC_CL) {
4359 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4360 } else if (class == MC_BIGCL) {
4361 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4362 } else if (class == MC_16KCL) {
4363 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4364 } else {
4365 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4366 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4367 }
4368 }
4369 }
4370
4371 /*
4372 * Inform the corresponding mcache(s) that there's no more waiter below.
4373 */
4374 static void
4375 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4376 {
4377 mcache_waiter_dec(m_cache(class));
4378 if (comp) {
4379 if (class == MC_CL) {
4380 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4381 } else if (class == MC_BIGCL) {
4382 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4383 } else if (class == MC_16KCL) {
4384 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4385 } else {
4386 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4387 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4388 }
4389 }
4390 }
4391
/*
 * True while the socket-defuncting thread call is in flight; read and
 * written under mbuf_mlock (see mbuf_watchdog()) and cleared by
 * mbuf_watchdog_defunct() when it completes.
 */
static bool mbuf_watchdog_defunct_active = false;

/* Results/parameters of the per-process scan done by the defunct watchdog. */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;		/* presumably the heaviest socket-space user — see iterator */
	uint32_t top_app_space_used;	/* its usage; units defined by the iterator (not visible here) */
	bool non_blocking;
};

extern const char *proc_name_address(void *p);
4401
/*
 * Thread-call body: find the process that uses the most socket space
 * (via mbuf_watchdog_defunct_iterate) and defunct all of its sockets,
 * in an attempt to free mbufs before the watchdog panics the system.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		/* Walk the victim's file table looking for sockets */
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			/* Skip sockets we'd deadlock on; best effort only */
			so = (struct socket *)fp_get_data(fp);
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference taken by the iterator on top_app */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
	}
	/* Allow the watchdog to panic (or rearm) again */
	mbuf_watchdog_defunct_active = false;
}
4451
4452 /*
4453 * Called during slab (blocking and non-blocking) allocation. If there
4454 * is at least one waiter, and the time since the first waiter is blocked
4455 * is greater than the watchdog timeout, panic the system.
4456 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do without waiters, or if the watchdog is disabled */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Seconds since the first waiter started blocking */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			/* Allocated once and reused for subsequent episodes */
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
4512
4513 /*
4514 * Called during blocking allocation. Returns TRUE if one or more objects
4515 * are available at the per-CPU caches layer and that allocation should be
4516 * retried at that level.
4517 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing? Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* Caller would rather retry at the cache layer than sleep */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Sleeping below is only legal for blocking allocations */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer. Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Ask the worker thread to grow this class's backing store */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	/* msleep() drops and reacquires mbuf_mlock around the sleep */
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	/* TRUE => retry the allocation at the per-CPU cache layer */
	return mcache_retry;
}
4591
/*
 * Dedicated worker thread: woken from mbuf_sleep() to grow the backing
 * freelists (m_region_expand requests) and, when the map is exhausted,
 * to reclaim memory from over-provisioned classes. Never returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if ((m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Service any pending 2K-cluster growth request */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Service any pending 4K-cluster growth request */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Service any pending 16K-cluster growth request */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		/* Grow MC_MBUF one page at a time until it catches up */
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated. Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		/* Sleep until mbuf_sleep() wakes us for more work */
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
4724
/*
 * Worker thread entry point: mark the worker as ready (so mbuf_sleep()
 * will issue wakeups) and enter the main loop, which never returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
4732
/*
 * Return the slab descriptor for the given buffer, allocating and wiring
 * up a new slab group on first touch of its 1MB (MBSHIFT) region.
 * Called with mbuf_mlock held; may temporarily drop it for allocation.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the 1MB slab group containing this buffer */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		/* zalloc_permanent memory is zeroed, so the tail is NULL */
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Page index within the group selects the slab */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
4796
/*
 * Initialize a slab descriptor's bookkeeping fields and leave it in the
 * detached state (not on any class freelist); sl_next/sl_link are not
 * touched here other than via slab_detach().
 */
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}
4810
/*
 * Attach a (currently detached) slab to the tail of its class's slab
 * freelist; for 16K clusters, also clear the detached flag on the
 * follow-on slabs that back the same buffer.
 */
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;

	/*
	 * If a buffer spans multiple contiguous pages then mark them as
	 * detached too
	 */
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL && slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
4833
/*
 * Detach a slab from its class's slab freelist; for 16K clusters, also
 * detach the follow-on slabs that back the same buffer.
 */
static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	int k;
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
4853
4854 static boolean_t
4855 slab_inrange(mcl_slab_t *sp, void *buf)
4856 {
4857 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
4858 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
4859 }
4860
4861 #undef panic
4862
/*
 * Diagnostic helper: scan every chunk in the slab's freelist linkage for
 * one whose obj_next equals 'addr', and validate that pointer — either
 * panicking directly (mclverify off) or via the audit machinery.
 */
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	/* Each slab is carved into sl_chunks equally-sized chunks */
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		/* Only interested in the chunk that links to 'addr' */
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			/* Let the audit layer validate (and panic if bad) */
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
4891
/*
 * Mark a slab as off-list: poison its TAILQ linkage with -1 sentinels
 * (checked by slab_is_detached()) and set SLF_DETACHED.
 */
static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}
4899
/*
 * A slab is detached iff both TAILQ poison sentinels are in place and
 * SLF_DETACHED is set (see slab_detach()).
 */
static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return (intptr_t)sp->sl_link.tqe_next == -1 &&
	       (intptr_t)sp->sl_link.tqe_prev == -1 &&
	       (sp->sl_flags & SLF_DETACHED);
}
4907
/*
 * Set up audit structures for 'num' objects within the page containing
 * 'buf': consume 'num' entries from *mca_list (advancing the caller's
 * list head) and, when con_list is given, attach one contents buffer of
 * con_size bytes to each audit entry.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	/* Page index within the mbuf map selects the audit bucket */
	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Reset the entry but preserve its list linkage */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Return the unconsumed remainder; terminate what we took */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
4966
/*
 * Release the audit structures (and any attached contents buffers) for
 * 'num' objects in the page containing 'buf', returning the audit
 * entries to their cache as a chain headed by cl_audit[0].
 */
static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	if (mclaudit[ix].cl_audit[0] != NULL) {
		/* Entries are chained via mca_next from cl_audit[0] */
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents) {
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
			}
		}
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}
4990
4991 /*
4992 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
4993 * the corresponding audit structure for that buffer.
4994 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOPG(mobj), m_idx = 0;
	unsigned char *page_addr;

	VERIFY(ix < maxclaudit);
	/* Buffer must be aligned to its class size (capped at a page) */
	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));

	page_addr = PGTOM(ix);

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page.  Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		m_idx = MBPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NMBPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		m_idx = CLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_BIGCL:
		/* Same thing, for 4KB (big) clusters in a page */
		m_idx = BCLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NBCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return mca;
}
5049
/*
 * Audit transition of an mbuf between the cache and the freelist.
 * On free (!alloc): save the constructed mbuf fields and fill the body
 * with the free pattern.  On alloc: verify the pattern survived and
 * restore the saved fields.
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Pattern fill clobbered the freelist link; restore it */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
5082
/*
 * Restore an mbuf's constructed fields from the audit save area
 * (counterpart of mcl_audit_save_mbuf()).  Composite mbufs get the whole
 * saved image back; plain mbufs only need the type field.
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		/* Preserve the live chain link across the restore */
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	mbuf_mcheck(m);
}
5111
/*
 * Sanity-check an mbuf being freed and snapshot its constructed fields
 * into the audit save area for later restore by mcl_audit_restore_mbuf().
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	mbuf_mcheck(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
5119
/*
 * Audit transition of a cluster buffer of 'size' bytes.  On free:
 * pattern-fill the buffer (preserving the freelist link when save_next).
 * On alloc: verify the pattern survived the stay on the freelist.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Re-store the link clobbered by the pattern fill */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
5140
/*
 * Record the current thread, backtrace and timestamp into the audit
 * scratch area, demoting the previous record to the "p" (prior) slots
 * so the last two events are always available for debugging.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Shift current record into the "previous" slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Skip frame 0 (this function) when recording the backtrace */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
5167
/*
 * Panic path for an mbuf freed with a type other than MT_FREE; dumps the
 * mbuf's audit record for post-mortem analysis.
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
5182
/*
 * Panic path for a corrupted freelist 'next' pointer detected by
 * mcl_audit_verify_nextptr(); dumps the buffer's audit record.
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
5193
5194 static void
5195 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5196 {
5197 if (next != NULL && !MBUF_IN_MAP(next) &&
5198 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
5199 mcl_audit_verify_nextptr_panic(next, mca);
5200 }
5201 }
5202
/*
 * Scramble a pointer-sized value with an avalanche-style add/xor/shift
 * sequence (one variant per word size) so nearby inputs map to widely
 * separated hash values.
 */
static uintptr_t
hash_mix(uintptr_t x)
{
#ifdef __LP64__
	x += ~(x << 32);
	x ^= (x >> 22);
	x += ~(x << 13);
	x ^= (x >> 8);
	x += (x << 3);
	x ^= (x >> 15);
	x += ~(x << 27);
	x ^= (x >> 31);
#else
	x += ~(x << 15);
	x ^= (x >> 10);
	x += (x << 3);
	x ^= (x >> 6);
	x += ~(x << 11);
	x ^= (x >> 16);
#endif
	return x;
}
5225
/*
 * Hash a backtrace into a bucket index in [0, max_size).  The frames are
 * summed (order-insensitive) and mixed; max_size must be a power of two
 * for the mask to be a valid modulus.
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t sum = 0;
	uint32_t i;

	for (i = 0; i < depth; i++) {
		sum += bt[i];
	}

	uintptr_t bucket = hash_mix(sum) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
5242
/*
 * Hash a single address into a bucket index in [0, max_size); max_size
 * must be a power of two for the mask to be a valid modulus.
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t bucket = hash_mix(pt) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
5255
/* This function turns on mbuf leak detection */
static void
mleak_activate(void)
{
	/* Sample factor is boot-arg tunable; 0 disables leak detection */
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	/* Permanent (never freed) tables for allocations, traces and stats */
	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
5287
5288 static void
5289 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
5290 {
5291 int temp;
5292
5293 if (mclfindleak == 0) {
5294 return;
5295 }
5296
5297 if (!alloc) {
5298 return mleak_free(addr);
5299 }
5300
5301 temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);
5302
5303 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
5304 uintptr_t bt[MLEAK_STACK_DEPTH];
5305 unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
5306 mleak_log(bt, addr, logged, num);
5307 }
5308 }
5309
5310 /*
5311 * This function records the allocation in the mleak_allocations table
5312 * and the backtrace in the mleak_traces table; if allocation slot is in use,
5313 * replace old allocation with new one if the trace slot is in use, return
5314 * (or increment refcount if same trace).
5315 */
5316 static boolean_t
5317 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
5318 {
5319 struct mallocation *allocation;
5320 struct mtrace *trace;
5321 uint32_t trace_index;
5322
5323 /* Quit if someone else modifying the tables */
5324 if (!lck_mtx_try_lock_spin(mleak_lock)) {
5325 mleak_table.total_conflicts++;
5326 return FALSE;
5327 }
5328
5329 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
5330 mleak_alloc_buckets)];
5331 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
5332 trace = &mleak_traces[trace_index];
5333
5334 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
5335 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
5336
5337 allocation->hitcount++;
5338 trace->hitcount++;
5339
5340 /*
5341 * If the allocation bucket we want is occupied
5342 * and the occupier has the same trace, just bail.
5343 */
5344 if (allocation->element != NULL &&
5345 trace_index == allocation->trace_index) {
5346 mleak_table.alloc_collisions++;
5347 lck_mtx_unlock(mleak_lock);
5348 return TRUE;
5349 }
5350
5351 /*
5352 * Store the backtrace in the traces array;
5353 * Size of zero = trace bucket is free.
5354 */
5355 if (trace->allocs > 0 &&
5356 bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
5357 /* Different, unique trace, but the same hash! Bail out. */
5358 trace->collisions++;
5359 mleak_table.trace_collisions++;
5360 lck_mtx_unlock(mleak_lock);
5361 return TRUE;
5362 } else if (trace->allocs > 0) {
5363 /* Same trace, already added, so increment refcount */
5364 trace->allocs++;
5365 } else {
5366 /* Found an unused trace bucket, so record the trace here */
5367 if (trace->depth != 0) {
5368 /* this slot previously used but not currently in use */
5369 mleak_table.trace_overwrites++;
5370 }
5371 mleak_table.trace_recorded++;
5372 trace->allocs = 1;
5373 memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
5374 trace->depth = depth;
5375 trace->collisions = 0;
5376 }
5377
5378 /* Step 2: Store the allocation record in the allocations array */
5379 if (allocation->element != NULL) {
5380 /*
5381 * Replace an existing allocation. No need to preserve
5382 * because only a subset of the allocations are being
5383 * recorded anyway.
5384 */
5385 mleak_table.alloc_collisions++;
5386 } else if (allocation->trace_index != 0) {
5387 mleak_table.alloc_overwrites++;
5388 }
5389 allocation->element = addr;
5390 allocation->trace_index = trace_index;
5391 allocation->count = num;
5392 mleak_table.alloc_recorded++;
5393 mleak_table.outstanding_allocs++;
5394
5395 lck_mtx_unlock(mleak_lock);
5396 return TRUE;
5397 }
5398
/*
 * Forget a chain of freed objects: for each element in the chain, if it
 * is the one recorded in its hashed allocation bucket, release the
 * bucket and drop the refcount of the associated trace bucket.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		/*
		 * Unlocked fast-path check; re-validated under mleak_lock
		 * before the bucket is actually modified.
		 */
		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
5429
5430 static void
5431 mleak_sort_traces()
5432 {
5433 int i, j, k;
5434 struct mtrace *swap;
5435
5436 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
5437 mleak_top_trace[i] = NULL;
5438 }
5439
5440 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
5441 if (mleak_traces[i].allocs <= 0) {
5442 continue;
5443 }
5444
5445 mleak_top_trace[j] = &mleak_traces[i];
5446 for (k = j; k > 0; k--) {
5447 if (mleak_top_trace[k]->allocs <=
5448 mleak_top_trace[k - 1]->allocs) {
5449 break;
5450 }
5451
5452 swap = mleak_top_trace[k - 1];
5453 mleak_top_trace[k - 1] = mleak_top_trace[k];
5454 mleak_top_trace[k] = swap;
5455 }
5456 j++;
5457 }
5458
5459 j--;
5460 for (; i < mleak_trace_buckets; i++) {
5461 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
5462 continue;
5463 }
5464
5465 mleak_top_trace[j] = &mleak_traces[i];
5466
5467 for (k = j; k > 0; k--) {
5468 if (mleak_top_trace[k]->allocs <=
5469 mleak_top_trace[k - 1]->allocs) {
5470 break;
5471 }
5472
5473 swap = mleak_top_trace[k - 1];
5474 mleak_top_trace[k - 1] = mleak_top_trace[k];
5475 mleak_top_trace[k] = swap;
5476 }
5477 }
5478 }
5479
5480 static void
5481 mleak_update_stats()
5482 {
5483 mleak_trace_stat_t *mltr;
5484 int i;
5485
5486 VERIFY(mleak_stat != NULL);
5487 #ifdef __LP64__
5488 VERIFY(mleak_stat->ml_isaddr64);
5489 #else
5490 VERIFY(!mleak_stat->ml_isaddr64);
5491 #endif /* !__LP64__ */
5492 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
5493
5494 mleak_sort_traces();
5495
5496 mltr = &mleak_stat->ml_trace[0];
5497 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
5498 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
5499 int j;
5500
5501 if (mleak_top_trace[i] == NULL ||
5502 mleak_top_trace[i]->allocs == 0) {
5503 continue;
5504 }
5505
5506 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
5507 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
5508 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
5509 mltr->mltr_depth = mleak_top_trace[i]->depth;
5510
5511 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
5512 for (j = 0; j < mltr->mltr_depth; j++) {
5513 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
5514 }
5515
5516 mltr++;
5517 }
5518 }
5519
/*
 * Human-readable names for the MT_* mbuf types, used by mbuf_dump()
 * when reporting per-type usage.  Terminated by a NULL mt_name entry.
 */
static struct mbtypes {
	int mt_type;            /* MT_* type code */
	const char *mt_name;    /* description printed in the report */
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }
};
5541
/*
 * Advance the mbuf_dump() output cursor by the length of the last
 * scnprintf() (held in `k'), bailing out to the `done' label once less
 * than one byte of space remains.  Only usable inside mbuf_dump().
 *
 * Fixed: wrapped in do { } while (0) so the macro expands to a single
 * statement (CERT PRE10-C); all call sites already end with `;'.
 */
#define MBUF_DUMP_BUF_CHK() do {                                \
	clen -= k;                                              \
	if (clen < 1)                                           \
	        goto done;                                      \
	c += k;                                                 \
} while (0)
5548
/*
 * Render a human-readable snapshot of mbuf subsystem state into the
 * static mbuf_dump_buf and return it.  The report covers per-type mbuf
 * usage, cluster pool occupancy, worker/drain thread activity, the top
 * per-process socket-buffer consumer, reassembly/interface queue
 * lengths, and the mbuf leak-detection tables.
 *
 * NOTE(review): mbuf_dump_buf is a shared static buffer; callers appear
 * to serialize via mbuf_mlock (see mbuf_wd_dump_sysctl()) -- confirm.
 */
static char *
mbuf_dump(void)
{
	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
	    totreturned = 0;
	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
	int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
	uint8_t seen[256];
	struct mbtypes *mp;
	mb_class_stat_t *sp;
	mleak_trace_stat_t *mltr;
	char *c = mbuf_dump_buf;
	int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
	struct mbuf_watchdog_defunct_args args = {};

	mbuf_dump_buf[0] = '\0';

	/* synchronize all statistics in the mbuf table */
	mbuf_stat_sync();
	mbuf_mtypes_sync();

	/* Gather per-class active/free counts and memory totals. */
	sp = &mb_stat->mbs_class[0];
	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
		u_int32_t mem;

		if (m_class(i) == MC_MBUF) {
			m_mbufs = sp->mbcl_active;
		} else if (m_class(i) == MC_CL) {
			m_clfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_BIGCL) {
			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_16KCL) {
			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
			m_16kclusters = sp->mbcl_total;
		} else if (m_class(i) == MC_MBUF_CL) {
			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_MBUF_BIGCL) {
			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_MBUF_16KCL) {
			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
		}

		mem = sp->mbcl_ctotal * sp->mbcl_size;
		totmem += mem;
		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
		    sp->mbcl_size;
		totreturned += sp->mbcl_release_cnt;
	}

	/* adjust free counts to include composite caches */
	m_clfree += m_mbufclfree;
	m_bigclfree += m_mbufbigclfree;
	m_16kclfree += m_mbuf16kclfree;

	/* Sum per-type usage, clamped to the number of active mbufs. */
	totmbufs = 0;
	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
		totmbufs += mbstat.m_mtypes[mp->mt_type];
	}
	if (totmbufs > m_mbufs) {
		totmbufs = m_mbufs;
	}
	k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
	MBUF_DUMP_BUF_CHK();

	/* Report known types by name, then any unnamed types by number. */
	bzero(&seen, sizeof(seen));
	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
		if (mbstat.m_mtypes[mp->mt_type] != 0) {
			seen[mp->mt_type] = 1;
			k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
			MBUF_DUMP_BUF_CHK();
		}
	}
	seen[MT_FREE] = 1;
	for (i = 0; i < nmbtypes; i++) {
		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
			k = scnprintf(c, clen, "\t%u mbufs allocated to "
			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
			MBUF_DUMP_BUF_CHK();
		}
	}
	if ((m_mbufs - totmbufs) > 0) {
		k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
		    m_mbufs - totmbufs);
		MBUF_DUMP_BUF_CHK();
	}
	k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
	    "%u/%u mbuf 4KB clusters in use\n",
	    (unsigned int)(mbstat.m_clusters - m_clfree),
	    (unsigned int)mbstat.m_clusters,
	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
	    (unsigned int)mbstat.m_bigclusters);
	MBUF_DUMP_BUF_CHK();

	k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
	    m_16kclusters - m_16kclfree, m_16kclusters,
	    njclbytes / 1024);
	MBUF_DUMP_BUF_CHK();
	/* Compute percentage in use, avoiding overflow for large totals. */
	totused = totmem - totfree;
	if (totmem == 0) {
		totpct = 0;
	} else if (totused < (ULONG_MAX / 100)) {
		totpct = (totused * 100) / totmem;
	} else {
		u_long totmem1 = totmem / 100;
		u_long totused1 = totused / 100;
		totpct = (totused1 * 100) / totmem1;
	}
	k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
	    "in use)\n", totmem / 1024, totpct);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "%lu KB returned to the system\n",
	    totreturned / 1024);
	MBUF_DUMP_BUF_CHK();

	net_update_uptime();

	/* Worker and drain thread activity. */
	k = scnprintf(c, clen,
	    "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
	    "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
	    mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
	    mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
	    mb_expand_16kcl_total);
	MBUF_DUMP_BUF_CHK();
	if (mbuf_worker_last_runtime != 0) {
		k = scnprintf(c, clen, "worker thread last run time: "
		    "%llu (%llu seconds ago)\n",
		    mbuf_worker_last_runtime,
		    net_uptime() - mbuf_worker_last_runtime);
		MBUF_DUMP_BUF_CHK();
	}
	if (mbuf_drain_last_runtime != 0) {
		k = scnprintf(c, clen, "drain routine last run time: "
		    "%llu (%llu seconds ago)\n",
		    mbuf_drain_last_runtime,
		    net_uptime() - mbuf_drain_last_runtime);
		MBUF_DUMP_BUF_CHK();
	}

	/*
	 * Log where the most mbufs have accumulated:
	 * - Process socket buffers
	 * - TCP reassembly queue
	 * - Interface AQM queue (output) and DLIL input queue
	 */
	args.non_blocking = true;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
	if (args.top_app != NULL) {
		k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
		    args.top_app_space_used,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_rele(args.top_app);
	}
	MBUF_DUMP_BUF_CHK();

#if INET
	k = dump_tcp_reass_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* INET */

#if MPTCP
	k = dump_mptcp_reass_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* MPTCP */

#if NETWORKING
	k = dlil_dump_top_if_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* NETWORKING */

	/* mbuf leak detection statistics */
	mleak_update_stats();

	k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
	    mleak_table.mleak_sample_factor);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
	    mleak_table.outstanding_allocs);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
	    mleak_table.total_conflicts);
	MBUF_DUMP_BUF_CHK();

	k = scnprintf(c, clen, "top %d outstanding traces:\n",
	    mleak_stat->ml_cnt);
	MBUF_DUMP_BUF_CHK();
	for (i = 0; i < mleak_stat->ml_cnt; i++) {
		mltr = &mleak_stat->ml_trace[i];
		k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
		    "%llu hit(s), %llu collision(s)\n", (i + 1),
		    mltr->mltr_allocs, mltr->mltr_hitcount,
		    mltr->mltr_collisions);
		MBUF_DUMP_BUF_CHK();
	}

	/* Backtrace table: one column per trace, one row per frame. */
	if (mleak_stat->ml_isaddr64) {
		k = scnprintf(c, clen, MB_LEAK_HDR_64);
	} else {
		k = scnprintf(c, clen, MB_LEAK_HDR_32);
	}
	MBUF_DUMP_BUF_CHK();

	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
		k = scnprintf(c, clen, "%2d: ", (i + 1));
		MBUF_DUMP_BUF_CHK();
		for (j = 0; j < mleak_stat->ml_cnt; j++) {
			mltr = &mleak_stat->ml_trace[j];
			if (i < mltr->mltr_depth) {
				if (mleak_stat->ml_isaddr64) {
					k = scnprintf(c, clen, "0x%0llx  ",
					    (uint64_t)VM_KERNEL_UNSLIDE(
						    mltr->mltr_addr[i]));
				} else {
					k = scnprintf(c, clen,
					    "0x%08x  ",
					    (uint32_t)VM_KERNEL_UNSLIDE(
						    mltr->mltr_addr[i]));
				}
			} else {
				if (mleak_stat->ml_isaddr64) {
					k = scnprintf(c, clen,
					    MB_LEAK_SPACING_64);
				} else {
					k = scnprintf(c, clen,
					    MB_LEAK_SPACING_32);
				}
			}
			MBUF_DUMP_BUF_CHK();
		}
		k = scnprintf(c, clen, "\n");
		MBUF_DUMP_BUF_CHK();
	}

done:
	return mbuf_dump_buf;
}
5801
5802 #undef MBUF_DUMP_BUF_CHK
5803
5804 /*
5805 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
5806 * xnu that intend on utilizing the module-private area should directly
5807 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
5808 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
5809 * to handing it off to another module, respectively.
5810 */
u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	/* When cluster tracing is on, record this scratch access. */
	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID) {
			mcl_audit_scratch(mca);
		}
		lck_mtx_unlock(mbuf_mlock);
	}

	/* Hand back the module-private area and report its size. */
	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
5839
5840 /*
5841 * Simple routine to avoid taking the lock when we can't run the
5842 * mbuf drain.
5843 */
5844 static int
5845 mbuf_drain_checks(boolean_t ignore_waiters)
5846 {
5847 if (mb_drain_maxint == 0) {
5848 return 0;
5849 }
5850 if (!ignore_waiters && mb_waiters != 0) {
5851 return 0;
5852 }
5853
5854 return 1;
5855 }
5856
5857 /*
5858 * Called by the VM when there's memory pressure or when we exhausted
5859 * the 4k/16k reserved space.
5860 */
static void
mbuf_drain_locked(boolean_t ignore_waiters)
{
	mbuf_class_t mc;
	mcl_slab_t *sp, *sp_tmp, *nsp;
	unsigned int num, k, interval, released = 0;
	unsigned long total_mem = 0, use_mem = 0;
	boolean_t ret, purge_caches = FALSE;
	ppnum_t offset;
	mcache_obj_t *obj;
	unsigned long per;
	/* Scratch page re-targeted by the IOMapper for freed pages. */
	static unsigned char scratch[32];
	static ppnum_t scratch_pa = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (!mbuf_drain_checks(ignore_waiters)) {
		return;
	}
	/* First run: resolve the physical address of the scratch page. */
	if (scratch_pa == 0) {
		bzero(scratch, sizeof(scratch));
		scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
		VERIFY(scratch_pa);
	} else if (mclverify) {
		/*
		 * Panic if a driver wrote to our scratch memory.
		 */
		for (k = 0; k < sizeof(scratch); k++) {
			if (scratch[k]) {
				panic("suspect DMA to freed address");
			}
		}
	}
	/*
	 * Don't free memory too often as that could cause excessive
	 * waiting times for mbufs.  Purge caches if we were asked to drain
	 * in the last 5 minutes.
	 */
	if (mbuf_drain_last_runtime != 0) {
		interval = net_uptime() - mbuf_drain_last_runtime;
		if (interval <= mb_drain_maxint) {
			return;
		}
		if (interval <= mb_drain_maxint * 5) {
			purge_caches = TRUE;
		}
	}
	mbuf_drain_last_runtime = net_uptime();
	/*
	 * Don't free any memory if we're using 60% or more.
	 * NOTE(review): assumes total_mem ends up nonzero -- the class
	 * totals are expected to be populated by this point; confirm.
	 */
	for (mc = 0; mc < MC_MAX; mc++) {
		total_mem += m_total(mc) * m_maxsize(mc);
		use_mem += m_active(mc) * m_maxsize(mc);
	}
	per = (use_mem * 100) / total_mem;
	if (per >= 60) {
		return;
	}
	/*
	 * Purge all the caches.  This effectively disables
	 * caching for a few seconds, but the mbuf worker thread will
	 * re-enable them again.
	 */
	if (purge_caches == TRUE) {
		for (mc = 0; mc < MC_MAX; mc++) {
			if (m_total(mc) < m_avgtotal(mc)) {
				continue;
			}
			/* mcache_purge_cache() may block; drop the lock. */
			lck_mtx_unlock(mbuf_mlock);
			ret = mcache_purge_cache(m_cache(mc), FALSE);
			lck_mtx_lock(mbuf_mlock);
			if (ret == TRUE) {
				m_purge_cnt(mc)++;
			}
		}
	}
	/*
	 * Move the objects from the composite class freelist to
	 * the rudimentary slabs list, but keep at least 10% of the average
	 * total in the freelist.
	 */
	for (mc = 0; mc < MC_MAX; mc++) {
		while (m_cobjlist(mc) &&
		    m_total(mc) < m_avgtotal(mc) &&
		    m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
			obj = m_cobjlist(mc);
			m_cobjlist(mc) = obj->obj_next;
			obj->obj_next = NULL;
			num = cslab_free(mc, obj, 1);
			VERIFY(num == 1);
			m_free_cnt(mc)++;
			m_infree(mc)--;
			/* cslab_free() handles m_total */
		}
	}
	/*
	 * Free the buffers present in the slab list up to 10% of the total
	 * average per class.
	 *
	 * We walk the list backwards in an attempt to reduce fragmentation.
	 */
	for (mc = MC_MAX - 1; (int)mc >= 0; mc--) {
		TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
			/*
			 * Process only unused slabs occupying memory.
			 */
			if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
			    sp->sl_base == NULL) {
				continue;
			}
			if (m_total(mc) < m_avgtotal(mc) ||
			    m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
				break;
			}
			slab_remove(sp, mc);
			/* Per-class bookkeeping for the page being released. */
			switch (mc) {
			case MC_MBUF:
				m_infree(mc) -= NMBPG;
				m_total(mc) -= NMBPG;
				if (mclaudit != NULL) {
					mcl_audit_free(sp->sl_base, NMBPG);
				}
				break;
			case MC_CL:
				m_infree(mc) -= NCLPG;
				m_total(mc) -= NCLPG;
				if (mclaudit != NULL) {
					mcl_audit_free(sp->sl_base, NMBPG);
				}
				break;
			case MC_BIGCL:
			{
				m_infree(mc) -= NBCLPG;
				m_total(mc) -= NBCLPG;
				if (mclaudit != NULL) {
					mcl_audit_free(sp->sl_base, NMBPG);
				}
				break;
			}
			case MC_16KCL:
				m_infree(mc)--;
				m_total(mc)--;
				/* Reset the follow-on slabs of the 16K span. */
				for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
					nsp = nsp->sl_next;
					VERIFY(nsp->sl_refcnt == 0 &&
					    nsp->sl_base != NULL &&
					    nsp->sl_len == 0);
					slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
					    0);
					nsp->sl_flags = 0;
				}
				if (mclaudit != NULL) {
					if (sp->sl_len == PAGE_SIZE) {
						mcl_audit_free(sp->sl_base,
						    NMBPG);
					} else {
						mcl_audit_free(sp->sl_base, 1);
					}
				}
				break;
			default:
				/*
				 * The composite classes have their own
				 * freelist (m_cobjlist), so we only
				 * process rudimentary classes here.
				 */
				VERIFY(0);
			}
			m_release_cnt(mc) += m_size(mc);
			released += m_size(mc);
			VERIFY(sp->sl_base != NULL &&
			    sp->sl_len >= PAGE_SIZE);
			offset = MTOPG(sp->sl_base);
			/*
			 * Make sure the IOMapper points to a valid, but
			 * bogus, address.  This should prevent further DMA
			 * accesses to freed memory.
			 */
			IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
			mcl_paddr[offset] = 0;
			kmem_free(mb_map, (vm_offset_t)sp->sl_base,
			    sp->sl_len);
			slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
			sp->sl_flags = 0;
		}
	}
	/* Refresh the exported statistics. */
	mbstat.m_drain++;
	mbstat.m_bigclusters = m_total(MC_BIGCL);
	mbstat.m_clusters = m_total(MC_CL);
	mbstat.m_mbufs = m_total(MC_MBUF);
	mbuf_stat_sync();
	mbuf_mtypes_sync();
}
6054
/*
 * Lock-taking entry point for the mbuf drain ("garbage collection").
 * ignore_waiters forces the drain even when threads are blocked waiting
 * for mbufs.  Must be called without mbuf_mlock held.
 */
__private_extern__ void
mbuf_drain(boolean_t ignore_waiters)
{
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
	/* Cheap pre-check so we avoid taking the lock for nothing. */
	if (!mbuf_drain_checks(ignore_waiters)) {
		return;
	}
	lck_mtx_lock(mbuf_mlock);
	mbuf_drain_locked(ignore_waiters);
	lck_mtx_unlock(mbuf_mlock);
}
6066
6067
6068 static int
6069 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
6070 {
6071 #pragma unused(arg1, arg2)
6072 int val = 0, err;
6073
6074 err = sysctl_handle_int(oidp, &val, 0, req);
6075 if (err != 0 || req->newptr == USER_ADDR_NULL) {
6076 return err;
6077 }
6078 if (val) {
6079 mbuf_drain(TRUE);
6080 }
6081
6082 return err;
6083 }
6084
6085 #if DEBUG || DEVELOPMENT
/*
 * Append a timestamped, printf-formatted record to the mbuf watchdog
 * log buffer (DEBUG/DEVELOPMENT only).  Requires mbuf_mlock held.
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	/* On overflow, drop the older half of the log before appending. */
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	/*
	 * NOTE(review): if strlcat() truncates, mbwdog_logging_used may
	 * overcount the bytes actually stored -- confirm acceptable.
	 */
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}
6126
6127 #endif // DEBUG || DEVELOPMENT
6128
6129 static void
6130 mtracelarge_register(size_t size)
6131 {
6132 int i;
6133 struct mtracelarge *trace;
6134 uintptr_t bt[MLEAK_STACK_DEPTH];
6135 unsigned int depth;
6136
6137 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
6138 /* Check if this entry is already on the list. */
6139 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
6140 trace = &mtracelarge_table[i];
6141 if (trace->size == size && trace->depth == depth &&
6142 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
6143 return;
6144 }
6145 }
6146 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
6147 trace = &mtracelarge_table[i];
6148 if (size > trace->size) {
6149 trace->depth = depth;
6150 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
6151 trace->size = size;
6152 break;
6153 }
6154 }
6155 }
6156
6157 #if DEBUG || DEVELOPMENT
6158
/*
 * sysctl handler for kern.ipc.mb_wd_dump (DEBUG/DEVELOPMENT only):
 * return the mbuf_dump() report as a string.  Holds the ifnet head lock
 * and mbuf_mlock across the dump.
 */
static int
mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
{
	char *str;

	ifnet_head_lock_shared();
	lck_mtx_lock(mbuf_mlock);

	str = mbuf_dump();

	lck_mtx_unlock(mbuf_mlock);
	ifnet_head_done();

	return sysctl_io_string(req, str, 0, 0, NULL);
}
6174
6175 #endif /* DEBUG || DEVELOPMENT */
6176
SYSCTL_DECL(_kern_ipc);
#if DEBUG || DEVELOPMENT
#if SKYWALK
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif /* DEBUG || DEVELOPMENT */
/* mbuf leak-detection: top traces, table counters, and sampling rate */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
/* mbuf drain ("garbage collection") controls */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
6208