1 /*
2 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <dev/random/randomdev.h>
87
88 #include <kern/kern_types.h>
89 #include <kern/simple_lock.h>
90 #include <kern/queue.h>
91 #include <kern/sched_prim.h>
92 #include <kern/backtrace.h>
93 #include <kern/percpu.h>
94 #include <kern/zalloc.h>
95
96 #include <libkern/OSAtomic.h>
97 #include <libkern/OSDebug.h>
98 #include <libkern/libkern.h>
99
100 #include <os/log.h>
101 #include <os/ptrtools.h>
102
103 #include <IOKit/IOMapper.h>
104
105 #include <machine/limits.h>
106 #include <machine/machine_routines.h>
107
108 #include <sys/mcache.h>
109 #include <net/ntstat.h>
110
111 #if INET
112 extern int dump_tcp_reass_qlen(char *, int);
113 extern int tcp_reass_qlen_space(struct socket *);
114 #endif /* INET */
115
116 #if MPTCP
117 extern int dump_mptcp_reass_qlen(char *, int);
118 #endif /* MPTCP */
119
120
121 #if NETWORKING
122 extern int dlil_dump_top_if_qlen(char *, int);
123 #endif /* NETWORKING */
124
125 /*
126 * MBUF IMPLEMENTATION NOTES.
127 *
 * There are a total of 5 per-CPU caches:
129 *
130 * MC_MBUF:
131 * This is a cache of rudimentary objects of MSIZE in size; each
132 * object represents an mbuf structure. This cache preserves only
133 * the m_type field of the mbuf during its transactions.
134 *
135 * MC_CL:
136 * This is a cache of rudimentary objects of MCLBYTES in size; each
137 * object represents a mcluster structure. This cache does not
138 * preserve the contents of the objects during its transactions.
139 *
140 * MC_BIGCL:
141 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
142 * object represents a mbigcluster structure. This cache does not
143 * preserve the contents of the objects during its transaction.
144 *
145 * MC_MBUF_CL:
146 * This is a cache of mbufs each having a cluster attached to it.
147 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
148 * fields of the mbuf related to the external cluster are preserved
149 * during transactions.
150 *
151 * MC_MBUF_BIGCL:
152 * This is a cache of mbufs each having a big cluster attached to it.
153 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
154 * fields of the mbuf related to the external cluster are preserved
155 * during transactions.
156 *
157 * OBJECT ALLOCATION:
158 *
159 * Allocation requests are handled first at the per-CPU (mcache) layer
160 * before falling back to the slab layer. Performance is optimal when
161 * the request is satisfied at the CPU layer because global data/lock
162 * never gets accessed. When the slab layer is entered for allocation,
163 * the slab freelist will be checked first for available objects before
164 * the VM backing store is invoked. Slab layer operations are serialized
165 * for all of the caches as the mbuf global lock is held most of the time.
166 * Allocation paths are different depending on the class of objects:
167 *
168 * a. Rudimentary object:
169 *
170 * { m_get_common(), m_clattach(), m_mclget(),
171 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
172 * composite object allocation }
173 * | ^
174 * | |
175 * | +-----------------------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_slab_alloc() |
184 * | |
185 * v |
186 * +---------> [freelist] -------> (found?) -------+
187 * | |
188 * | v
189 * | m_clalloc()
190 * | |
191 * | v
192 * +---<<---- kmem_mb_alloc()
193 *
194 * b. Composite object:
195 *
196 * { m_getpackets_internal(), m_allocpacket_internal() }
197 * | ^
198 * | |
199 * | +------ (done) ---------+
200 * v |
201 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
202 * | ^
203 * v |
204 * [CPU cache] -------> (found?) -------+
205 * | |
206 * v |
207 * mbuf_cslab_alloc() |
208 * | |
209 * v |
210 * [freelist] -------> (found?) -------+
211 * | |
212 * v |
213 * (rudimentary object) |
214 * mcache_alloc/mcache_alloc_ext() ------>>-----+
215 *
216 * Auditing notes: If auditing is enabled, buffers will be subjected to
217 * integrity checks by the audit routine. This is done by verifying their
218 * contents against DEADBEEF (free) pattern before returning them to caller.
219 * As part of this step, the routine will also record the transaction and
220 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
221 * also restore any constructed data structure fields if necessary.
222 *
223 * OBJECT DEALLOCATION:
224 *
225 * Freeing an object simply involves placing it into the CPU cache; this
226 * pollutes the cache to benefit subsequent allocations. The slab layer
227 * will only be entered if the object is to be purged out of the cache.
228 * During normal operations, this happens only when the CPU layer resizes
229 * its bucket while it's adjusting to the allocation load. Deallocation
230 * paths are different depending on the class of objects:
231 *
232 * a. Rudimentary object:
233 *
234 * { m_free(), m_freem_list(), composite object deallocation }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_slab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_slab_free() |
249 * | |
250 * v |
251 * [freelist] ----------->>------------+
252 * (objects get purged to VM only on demand)
253 *
254 * b. Composite object:
255 *
256 * { m_free(), m_freem_list() }
257 * | ^
258 * | |
259 * | +------ (done) ---------+
260 * v |
261 * mcache_free/mcache_free_ext() |
262 * | |
263 * v |
264 * mbuf_cslab_audit() |
265 * | |
266 * v |
267 * [CPU cache] ---> (not purging?) -----+
268 * | |
269 * v |
270 * mbuf_cslab_free() |
271 * | |
272 * v |
273 * [freelist] ---> (not purging?) -----+
274 * | |
275 * v |
276 * (rudimentary object) |
277 * mcache_free/mcache_free_ext() ------->>------+
278 *
279 * Auditing notes: If auditing is enabled, the audit routine will save
280 * any constructed data structure fields (if necessary) before filling the
281 * contents of the buffers with DEADBEEF (free) pattern and recording the
282 * transaction. Buffers that are freed (whether at CPU or slab layer) are
283 * expected to contain the free pattern.
284 *
285 * DEBUGGING:
286 *
287 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
288 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
289 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
290 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
291 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
292 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
293 *
294 * Each object is associated with exactly one mcache_audit_t structure that
295 * contains the information related to its last buffer transaction. Given
296 * an address of an object, the audit structure can be retrieved by finding
297 * the position of the object relevant to the base address of the cluster:
298 *
299 * +------------+ +=============+
300 * | mbuf addr | | mclaudit[i] |
301 * +------------+ +=============+
302 * | | cl_audit[0] |
303 * i = MTOBG(addr) +-------------+
304 * | +-----> | cl_audit[1] | -----> mcache_audit_t
305 * b = BGTOM(i) | +-------------+
306 * | | | ... |
307 * x = MCLIDX(b, addr) | +-------------+
308 * | | | cl_audit[7] |
309 * +-----------------+ +-------------+
310 * (e.g. x == 1)
311 *
312 * The mclaudit[] array is allocated at initialization time, but its contents
313 * get populated when the corresponding cluster is created. Because a page
314 * can be turned into NMBPG number of mbufs, we preserve enough space for the
315 * mbufs so that there is a 1-to-1 mapping between them. A page that never
316 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
317 * remaining entries unused. For 16KB cluster, only one entry from the first
318 * page is allocated and used for the entire object.
319 */
320
321 /* TODO: should be in header file */
/* kernel address translator */
323 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
324 extern vm_map_t mb_map; /* special map */
325
326 static uint32_t mb_kmem_contig_failed;
327 static uint32_t mb_kmem_failed;
328 static uint32_t mb_kmem_one_failed;
329 /* Timestamp of allocation failures. */
330 static uint64_t mb_kmem_contig_failed_ts;
331 static uint64_t mb_kmem_failed_ts;
332 static uint64_t mb_kmem_one_failed_ts;
333 static uint64_t mb_kmem_contig_failed_size;
334 static uint64_t mb_kmem_failed_size;
335 static uint32_t mb_kmem_stats[6];
336
337 /* Global lock */
338 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
339 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
340 static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
341
342 /* Back-end (common) layer */
343 static uint64_t mb_expand_cnt;
344 static uint64_t mb_expand_cl_cnt;
345 static uint64_t mb_expand_cl_total;
346 static uint64_t mb_expand_bigcl_cnt;
347 static uint64_t mb_expand_bigcl_total;
348 static uint64_t mb_expand_16kcl_cnt;
349 static uint64_t mb_expand_16kcl_total;
350 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
351 static uint32_t mbuf_worker_run_cnt;
352 static uint64_t mbuf_worker_last_runtime;
353 static uint64_t mbuf_drain_last_runtime;
354 static int mbuf_worker_ready; /* worker thread is runnable */
355 static unsigned int ncpu; /* number of CPUs */
356 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
357 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
358 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
359 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
360 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
361 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
362 static unsigned int mb_normalized; /* number of packets "normalized" */
363
364 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
365 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
366
/*
 * Buffer classes.  The first four are rudimentary classes backed by
 * slabs; the MC_MBUF_* entries are composite classes (an mbuf with a
 * cluster attached), distinguished by MBUF_CLASS_COMPOSITE() below.
 */
typedef enum {
	MC_MBUF = 0,    /* Regular mbuf */
	MC_CL,          /* Cluster */
	MC_BIGCL,       /* Large (4KB) cluster */
	MC_16KCL,       /* Jumbo (16KB) cluster */
	MC_MBUF_CL,     /* mbuf + cluster */
	MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;
376
377 #define MBUF_CLASS_MIN MC_MBUF
378 #define MBUF_CLASS_MAX MC_MBUF_16KCL
379 #define MBUF_CLASS_LAST MC_16KCL
380 #define MBUF_CLASS_VALID(c) \
381 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
382 #define MBUF_CLASS_COMPOSITE(c) \
383 ((int)(c) > MBUF_CLASS_LAST)
384
385
386 /*
387 * mbuf specific mcache allocation request flags.
388 */
389 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
390
391 /*
392 * Per-cluster slab structure.
393 *
394 * A slab is a cluster control structure that contains one or more object
395 * chunks; the available chunks are chained in the slab's freelist (sl_head).
396 * Each time a chunk is taken out of the slab, the slab's reference count
397 * gets incremented. When all chunks have been taken out, the empty slab
398 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
399 * returned to a slab causes the slab's reference count to be decremented;
400 * it also causes the slab to be reinserted back to class's slab list, if
401 * it's not already done.
402 *
403 * Compartmentalizing of the object chunks into slabs allows us to easily
404 * merge one or more slabs together when the adjacent slabs are idle, as
405 * well as to convert or move a slab from one class to another; e.g. the
406 * mbuf cluster slab can be converted to a regular cluster slab when all
407 * mbufs in the slab have been freed.
408 *
409 * A slab may also span across multiple clusters for chunks larger than
410 * a cluster's size. In this case, only the slab of the first cluster is
411 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
412 * that they are part of the larger slab.
413 *
414 * Each slab controls a page of memory.
415 */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t        sl_class;       /* controlling mbuf class */
	int8_t          sl_refcnt;      /* outstanding allocations (chunks taken out) */
	int8_t          sl_chunks;      /* chunks (bufs) in this slab */
	u_int16_t       sl_flags;       /* slab flags (see SLF_* below) */
	u_int16_t       sl_len;         /* slab length */
	void            *sl_base;       /* base of allocated memory */
	void            *sl_head;       /* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
} mcl_slab_t;
427
428 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
429 #define SLF_PARTIAL 0x0002 /* part of another slab */
430 #define SLF_DETACHED 0x0004 /* not in slab freelist */
431
432 /*
433 * The array of slabs are broken into groups of arrays per 1MB of kernel
434 * memory to reduce the footprint. Each group is allocated on demand
435 * whenever a new piece of memory mapped in from the VM crosses the 1MB
436 * boundary.
437 */
438 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
439
/* One group of NSLABSPMB slabs, allocated on demand (see comment above). */
typedef struct mcl_slabg {
	mcl_slab_t      *slg_slab;      /* group of slabs */
} mcl_slabg_t;
443
444 /*
445 * Number of slabs needed to control a 16KB cluster object.
446 */
447 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
448
449 /*
450 * Per-cluster audit structure.
451 */
typedef struct {
	mcache_audit_t **cl_audit;      /* array of audits, one per buffer in the page */
} mcl_audit_t;
455
/*
 * Scratch area recording the current and previous buffer transaction:
 * the thread involved, a millisecond timestamp, and a saved backtrace.
 */
typedef struct {
	struct thread   *msa_thread;    /* thread doing transaction */
	struct thread   *msa_pthread;   /* previous transaction thread */
	uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
	uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
	uint16_t        msa_depth;      /* pc stack depth */
	uint16_t        msa_pdepth;     /* previous transaction pc stack */
	void            *msa_stack[MCACHE_STACK_DEPTH]; /* current backtrace PCs */
	void            *msa_pstack[MCACHE_STACK_DEPTH]; /* previous backtrace PCs */
} mcl_scratch_audit_t;
466
typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers m_hdr,
	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf gets copied into it when the mbuf
	 * is freed.  This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
	 * cluster cache case).  Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof(_m_ext_t)];
	mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8))); /* transaction records */
} mcl_saved_contents_t;
481
482 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
483
484 #define MCA_SAVED_MBUF_PTR(_mca) \
485 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
486 (_mca)->mca_contents)->sc_mbuf)
487 #define MCA_SAVED_MBUF_SIZE \
488 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
489 #define MCA_SAVED_SCRATCH_PTR(_mca) \
490 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
491
492 /*
493 * mbuf specific mcache audit flags
494 */
495 #define MB_INUSE 0x01 /* object has not been returned to slab */
496 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
497 #define MB_SCVALID 0x04 /* object has valid saved contents */
498
499 /*
500 * Each of the following two arrays hold up to nmbclusters elements.
501 */
502 static mcl_audit_t *mclaudit; /* array of cluster audit information */
503 static unsigned int maxclaudit; /* max # of entries in audit table */
504 static mcl_slabg_t **slabstbl; /* cluster slabs table */
505 static unsigned int maxslabgrp; /* max # of entries in slabs table */
506 static unsigned int slabgrp; /* # of entries in slabs table */
507
508 /* Globals */
509 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
510 int njcl; /* # of clusters for jumbo sizes */
511 int njclbytes; /* size of a jumbo cluster */
512 unsigned char *mbutl; /* first mapped cluster address */
513 unsigned char *embutl; /* ending virtual address of mclusters */
514 int max_linkhdr; /* largest link-level header */
515 int max_protohdr; /* largest protocol header */
516 int max_hdr; /* largest link+protocol header */
517 int max_datalen; /* MHLEN - max_hdr */
518
519 static boolean_t mclverify; /* debug: pattern-checking */
520 static boolean_t mcltrace; /* debug: stack tracing */
521 static boolean_t mclfindleak; /* debug: leak detection */
522 static boolean_t mclexpleak; /* debug: expose leak info to user space */
523
524 static struct timeval mb_start; /* beginning of time */
525
526 /* mbuf leak detection variables */
527 static struct mleak_table mleak_table;
528 static mleak_stat_t *mleak_stat;
529
530 #define MLEAK_STAT_SIZE(n) \
531 __builtin_offsetof(mleak_stat_t, ml_trace[n])
532
/*
 * One entry in the mleak allocation hashmap: associates an allocated
 * element with the index of its recorded backtrace in mleak_traces.
 */
struct mallocation {
	mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
	u_int32_t count;        /* How many objects were requested */
	u_int64_t hitcount;     /* for determining hash effectiveness */
};
539
/*
 * One entry in the mleak trace hashmap: a recorded allocation backtrace
 * with counters used to rank the top leakers.
 */
struct mtrace {
	u_int64_t       collisions;     /* hash collisions on this bucket */
	u_int64_t       hitcount;       /* bucket hits; presumably for hash effectiveness — confirm */
	u_int64_t       allocs;         /* allocations attributed to this trace */
	u_int64_t       depth;          /* number of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace PCs */
};
547
548 /* Size must be a power of two for the zhash to be able to just mask off bits */
549 #define MLEAK_ALLOCATION_MAP_NUM 512
550 #define MLEAK_TRACE_MAP_NUM 256
551
552 /*
553 * Sample factor for how often to record a trace. This is overwritable
554 * by the boot-arg mleak_sample_factor.
555 */
556 #define MLEAK_SAMPLE_FACTOR 500
557
558 /*
559 * Number of top leakers recorded.
560 */
561 #define MLEAK_NUM_TRACES 5
562
563 #define MB_LEAK_SPACING_64 " "
564 #define MB_LEAK_SPACING_32 " "
565
566
567 #define MB_LEAK_HDR_32 "\n\
568 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
569 ---------- ---------- ---------- ---------- ---------- \n\
570 "
571
572 #define MB_LEAK_HDR_64 "\n\
573 trace [1] trace [2] trace [3] \
574 trace [4] trace [5] \n\
575 ------------------ ------------------ ------------------ \
576 ------------------ ------------------ \n\
577 "
578
579 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
580 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
581
582 /* Hashmaps of allocations and their corresponding traces */
583 static struct mallocation *mleak_allocations;
584 static struct mtrace *mleak_traces;
585 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
586
587 /* Lock to protect mleak tables from concurrent modification */
588 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
589 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
590 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
591
592 /* *Failed* large allocations. */
struct mtracelarge {
	uint64_t        size;           /* requested allocation size (bytes) */
	uint64_t        depth;          /* number of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace PCs */
};
598
599 #define MTRACELARGE_NUM_TRACES 5
600 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
601
602 static void mtracelarge_register(size_t size);
603
604 /* Lock to protect the completion callback table */
605 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
606 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
607
608 extern u_int32_t high_sb_max;
609
610 /* The minimum number of objects that are allocated, to start. */
611 #define MINCL 32
612 #define MINBIGCL (MINCL >> 1)
613 #define MIN16KCL (MINCL >> 2)
614
615 /* Low watermarks (only map in pages once free counts go below) */
616 #define MBIGCL_LOWAT MINBIGCL
617 #define M16KCL_LOWAT MIN16KCL
618
/*
 * Per-class bookkeeping: one entry per mbuf_class_t in mbuf_table[],
 * accessed via the m_*() accessor macros below.
 */
typedef struct {
	mbuf_class_t    mtbl_class;     /* class type */
	mcache_t        *mtbl_cache;    /* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
	mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
	u_int32_t       mtbl_maxsize;   /* maximum buffer size */
	int             mtbl_minlimit;  /* minimum allowed */
	int             mtbl_maxlimit;  /* maximum allowed */
	u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
	uint32_t        mtbl_avgtotal;  /* average total on iOS */
	u_int32_t       mtbl_expand;    /* worker should expand the class */
} mbuf_table_t;
632
633 #define m_class(c) mbuf_table[c].mtbl_class
634 #define m_cache(c) mbuf_table[c].mtbl_cache
635 #define m_slablist(c) mbuf_table[c].mtbl_slablist
636 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
637 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
638 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
639 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
640 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
641 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
642 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
643 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
644 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
645 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
646 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
647 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
648 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
649 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
650 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
651 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
652 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
653 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
654 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
655 #define m_region_expand(c) mbuf_table[c].mtbl_expand
656
/*
 * Initializer positions follow mbuf_table_t: class, cache, slab list,
 * composite freelist, stats, maxsize, minlimit, maxlimit, wantpurge,
 * avgtotal, expand.  Only class, slab list and avgtotal are non-zero
 * here; the rest are filled in at init time.
 */
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	  NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	  NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	  NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	  NULL, NULL, 0, 0, 0, 0, 200, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
};
683
684 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
685
686 #if SKYWALK
687 #define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
688 static unsigned int mc_threshold_scale_down_factor =
689 MC_THRESHOLD_SCALE_DOWN_FACTOR;
690 #endif /* SKYWALK */
691
692 static uint32_t
m_avgtotal(mbuf_class_t c)693 m_avgtotal(mbuf_class_t c)
694 {
695 #if SKYWALK
696 return if_is_fsw_transport_netagent_enabled() ?
697 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
698 mbuf_table[c].mtbl_avgtotal;
699 #else /* !SKYWALK */
700 return mbuf_table[c].mtbl_avgtotal;
701 #endif /* SKYWALK */
702 }
703
704 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
705 static int mb_waiters; /* number of waiters */
706
707 boolean_t mb_peak_newreport = FALSE;
708 boolean_t mb_peak_firstreport = FALSE;
709
710 /* generate a report by default after 1 week of uptime */
711 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
712
713 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
714 static struct timeval mb_wdtstart; /* watchdog start timestamp */
715 static char *mbuf_dump_buf;
716
717 #define MBUF_DUMP_BUF_SIZE 4096
718
719 /*
 * mbuf watchdog is enabled by default. It is also toggleable via the
721 * kern.ipc.mb_watchdog sysctl.
722 * Garbage collection is enabled by default on embedded platforms.
723 * mb_drain_maxint controls the amount of time to wait (in seconds) before
724 * consecutive calls to mbuf_drain().
725 */
726 static unsigned int mb_watchdog = 1;
727 #if !XNU_TARGET_OS_OSX
728 static unsigned int mb_drain_maxint = 60;
729 #else /* XNU_TARGET_OS_OSX */
730 static unsigned int mb_drain_maxint = 0;
731 #endif /* XNU_TARGET_OS_OSX */
732 static unsigned int mb_memory_pressure_percentage = 80;
733
734 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
735 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
736
737 /* Red zone */
738 static u_int32_t mb_redzone_cookie;
739 static void m_redzone_init(struct mbuf *);
740 static void m_redzone_verify(struct mbuf *m);
741
742 /* The following are used to serialize m_clalloc() */
743 static boolean_t mb_clalloc_busy;
744 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
745 static int mb_clalloc_waiters;
746
747 static void mbuf_mtypes_sync(boolean_t);
748 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
749 static void mbuf_stat_sync(void);
750 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
751 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
752 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
753 static char *mbuf_dump(void);
754 static void mbuf_table_init(void);
755 static inline void m_incref(struct mbuf *);
756 static inline u_int16_t m_decref(struct mbuf *);
757 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
758 static void mbuf_worker_thread_init(void);
759 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
/*
 * Forward declarations: slab (simple-object) and composite-slab allocator
 * backends plugged into the mcache layer, plus freelist, reclaim and
 * worker-thread helpers defined later in this file.
 */
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

/* mbuf/cluster audit helpers (active when mclaudit is configured). */
static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

/* mbuf leak-tracing (mleak) helpers. */
static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

/* Cluster slab bookkeeping. */
static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);
#if DEBUG || DEVELOPMENT
/* Watchdog event logger (DEBUG/DEVELOPMENT kernels only); records caller. */
#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
static char *mbwdog_logging;
const unsigned mbwdog_logging_size = 4096;
static size_t mbwdog_logging_used;
#else
/* No-op on release kernels. */
#define mbwdog_logger(fmt, ...) do { } while (0)
#endif
static void mbuf_drain_locked(boolean_t);
827
/* flags for m_copyback0 */
#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged. At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define EXTF_COMPOSITE 0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbufs. Once set, this flag is never
 * cleared.
 */
#define EXTF_READONLY 0x2
/*
 * This flag indicates that the external cluster is paired with the mbuf.
 * Pairing implies an external free routine defined which will be invoked
 * when the reference count drops to the minimum at m_free time. This
 * flag is never cleared.
 */
#define EXTF_PAIRED 0x4

/* All EXTF_* bits defined above. */
#define EXTF_MASK \
	(EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)

/*
 * Accessors for the ext_ref structure attached to an M_EXT mbuf
 * (retrieved via m_get_rfa()).
 */
#define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
#define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
#define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
#define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
/*
 * A composite object is one at its minimum refcount whose only EXTF
 * flag is EXTF_COMPOSITE.
 */
#define MBUF_IS_COMPOSITE(m) \
	(MEXT_REF(m) == MEXT_MINREF(m) && \
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
/*
 * This macro can be used to test if the mbuf is paired to an external
 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 * and thus survives calls to m_free_paired.
 */
#define MBUF_IS_PAIRED(m) \
	(((m)->m_flags & M_EXT) && \
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
	MEXT_PMBUF(m) == (m))

/*
 * Macros used to verify the integrity of the mbuf.
 */
/* Panics unless the mbuf is either allocated (non-MT_FREE) or paired. */
#define _MCHECK(m) { \
	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
		if (mclaudit == NULL) \
			panic("MCHECK: m_type=%d m=%p", \
			    (u_int16_t)(m)->m_type, m); \
		else \
			mcl_audit_mcheck_panic(m); \
	} \
}

/* TRUE when addr falls within the mbuf cluster map [mbutl, embutl). */
#define MBUF_IN_MAP(addr) \
	((unsigned char *)(addr) >= mbutl && \
	(unsigned char *)(addr) < embutl)

/* Panic if addr is outside the mbuf cluster map. */
#define MRANGE(addr) { \
	if (!MBUF_IN_MAP(addr)) \
		panic("MRANGE: address out of range 0x%p", addr); \
}

/*
 * Macro version of mtod.
 */
#define MTOD(m, t) ((t)((m)->m_data))

/*
 * Macros to obtain page index given a base cluster address
 */
#define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
#define PGTOM(x) (mbutl + (x << PAGE_SHIFT))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define MBPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define CLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base
 */
#define BCLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
936 /*
937 * Macros used during mbuf and cluster initialization.
938 */
939 #define MBUF_INIT_PKTHDR(m) { \
940 (m)->m_pkthdr.rcvif = NULL; \
941 (m)->m_pkthdr.pkt_hdr = NULL; \
942 (m)->m_pkthdr.len = 0; \
943 (m)->m_pkthdr.csum_flags = 0; \
944 (m)->m_pkthdr.csum_data = 0; \
945 (m)->m_pkthdr.vlan_tag = 0; \
946 (m)->m_pkthdr.comp_gencnt = 0; \
947 (m)->m_pkthdr.pkt_crumbs = 0; \
948 m_classifier_init(m, 0); \
949 m_tag_init(m, 1); \
950 m_scratch_init(m); \
951 m_redzone_init(m); \
952 }
953
954 #define MBUF_INIT(m, pkthdr, type) { \
955 _MCHECK(m); \
956 (m)->m_next = (m)->m_nextpkt = NULL; \
957 (m)->m_len = 0; \
958 (m)->m_type = type; \
959 if ((pkthdr) == 0) { \
960 (m)->m_data = (m)->m_dat; \
961 (m)->m_flags = 0; \
962 } else { \
963 (m)->m_data = (m)->m_pktdat; \
964 (m)->m_flags = M_PKTHDR; \
965 MBUF_INIT_PKTHDR(m); \
966 } \
967 }
968
969 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
970 priv, pm) { \
971 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
972 (m)->m_flags |= M_EXT; \
973 m_set_ext((m), (rfa), (free), (arg)); \
974 (m)->m_ext.ext_size = (u_int)(size); \
975 MEXT_MINREF(m) = (min); \
976 MEXT_REF(m) = (ref); \
977 MEXT_PREF(m) = (pref); \
978 MEXT_FLAGS(m) = (flag); \
979 MEXT_PRIV(m) = (priv); \
980 MEXT_PMBUF(m) = (pm); \
981 }
982
983 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
984 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
985 ref, 0, flag, 0, NULL)
986
987 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
988 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
989 ref, 0, flag, 0, NULL)
990
991 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
992 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
993 ref, 0, flag, 0, NULL)
994
995 /*
996 * Macro to convert BSD malloc sleep flag to mcache's
997 */
998 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
999
1000 /*
1001 * The structure that holds all mbuf class statistics exportable via sysctl.
1002 * Similar to mbstat structure, the mb_stat structure is protected by the
1003 * global mbuf lock. It contains additional information about the classes
1004 * that allows for a more accurate view of the state of the allocator.
1005 */
1006 struct mb_stat *mb_stat;
1007 struct omb_stat *omb_stat; /* For backwards compatibility */
1008
1009 #define MB_STAT_SIZE(n) \
1010 __builtin_offsetof(mb_stat_t, mbs_class[n])
1011 #define OMB_STAT_SIZE(n) \
1012 __builtin_offsetof(struct omb_stat, mbs_class[n])
1013
1014 /*
1015 * The legacy structure holding all of the mbuf allocation statistics.
1016 * The actual statistics used by the kernel are stored in the mbuf_table
1017 * instead, and are updated atomically while the global mbuf lock is held.
1018 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1019 * Unlike before, the kernel no longer relies on the contents of mbstat for
1020 * its operations (e.g. cluster expansion) because the structure is exposed
1021 * to outside and could possibly be modified, therefore making it unsafe.
1022 * With the exception of the mbstat.m_mtypes array (see below), all of the
1023 * statistics are updated as they change.
1024 */
1025 struct mbstat mbstat;
1026
1027 #define MBSTAT_MTYPES_MAX \
1028 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1029
1030 /*
1031 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1032 * atomically and stored in a per-CPU structure which is lock-free; this is
1033 * done in order to avoid writing to the global mbstat data structure which
1034 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1035 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1036 * array and returned to the application. Any updates for types greater or
1037 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1038 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1039 * anything beyond that (up to type 255) is considered a corner case.
1040 */
1041 typedef struct {
1042 unsigned int cpu_mtypes[MT_MAX];
1043 } mbuf_mtypes_t;
1044
1045 static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1046
1047 #define mtype_stat_add(type, n) { \
1048 if ((unsigned)(type) < MT_MAX) { \
1049 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1050 atomic_add_32(&mbs->cpu_mtypes[type], n); \
1051 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1052 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
1053 } \
1054 }
1055
1056 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1057 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1058 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
1059
/*
 * Converge the lock-free per-CPU mbuf-type counters into the legacy
 * mbstat.m_mtypes[] array.  'locked' indicates whether the caller
 * already holds mbuf_mlock; if not, the lock is taken only around
 * the final store into mbstat.
 */
static void
mbuf_mtypes_sync(boolean_t locked)
{
	mbuf_mtypes_t mtc;

	if (locked) {
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* Start from the master CPU's counters, then fold in each secondary. */
	mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
	percpu_foreach_secondary(mtype, mbuf_mtypes) {
		for (int n = 0; n < MT_MAX; n++) {
			mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
		}
	}

	/* Publish the converged totals under mbuf_mlock. */
	if (!locked) {
		lck_mtx_lock(mbuf_mlock);
	}
	for (int n = 0; n < MT_MAX; n++) {
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	}
	if (!locked) {
		lck_mtx_unlock(mbuf_mlock);
	}
}
1086
1087 static int
1088 mbstat_sysctl SYSCTL_HANDLER_ARGS
1089 {
1090 #pragma unused(oidp, arg1, arg2)
1091 mbuf_mtypes_sync(FALSE);
1092
1093 return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
1094 }
1095
/*
 * Refresh the exportable per-class statistics (mtbl_stats) from the
 * live mcache state.  Must be called with mbuf_mlock held, typically
 * just before the stats are copied out via sysctl.
 */
static void
mbuf_stat_sync(void)
{
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		/* Derive the reported cache state from mcache flags/counters. */
		if (cp->mc_flags & MCF_NOCPUCACHE) {
			sp->mbcl_mc_state = MCS_DISABLED;
		} else if (cp->mc_purge_cnt > 0) {
			sp->mbcl_mc_state = MCS_PURGING;
		} else if (bktsize == 0) {
			sp->mbcl_mc_state = MCS_OFFLINE;
		} else {
			sp->mbcl_mc_state = MCS_ONLINE;
		}

		/* Count objects held in each per-CPU layer ... */
		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0) {
				sp->mbcl_mc_cached += ccp->cc_objs;
			}
			if (ccp->cc_pobjs > 0) {
				sp->mbcl_mc_cached += ccp->cc_pobjs;
			}
		}
		/* ... plus full buckets parked in the bucket layer. */
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		/* Active = total minus cached minus freelist. */
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}
1169
1170 static int
1171 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1172 {
1173 #pragma unused(oidp, arg1, arg2)
1174 void *statp;
1175 int k, statsz, proc64 = proc_is64bit(req->p);
1176
1177 lck_mtx_lock(mbuf_mlock);
1178 mbuf_stat_sync();
1179
1180 if (!proc64) {
1181 struct omb_class_stat *oc;
1182 struct mb_class_stat *c;
1183
1184 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1185 oc = &omb_stat->mbs_class[0];
1186 c = &mb_stat->mbs_class[0];
1187 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1188 (void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
1189 "%s", c->mbcl_cname);
1190 oc->mbcl_size = c->mbcl_size;
1191 oc->mbcl_total = c->mbcl_total;
1192 oc->mbcl_active = c->mbcl_active;
1193 oc->mbcl_infree = c->mbcl_infree;
1194 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1195 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1196 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1197 oc->mbcl_notified = c->mbcl_notified;
1198 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1199 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1200 oc->mbcl_ctotal = c->mbcl_ctotal;
1201 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1202 oc->mbcl_mc_state = c->mbcl_mc_state;
1203 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1204 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1205 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1206 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1207 oc->mbcl_peak_reported = c->mbcl_peak_reported;
1208 }
1209 statp = omb_stat;
1210 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1211 } else {
1212 statp = mb_stat;
1213 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1214 }
1215
1216 lck_mtx_unlock(mbuf_mlock);
1217
1218 return SYSCTL_OUT(req, statp, statsz);
1219 }
1220
1221 static int
1222 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1223 {
1224 #pragma unused(oidp, arg1, arg2)
1225 int i;
1226
1227 /* Ensure leak tracing turned on */
1228 if (!mclfindleak || !mclexpleak) {
1229 return ENXIO;
1230 }
1231
1232 lck_mtx_lock(mleak_lock);
1233 mleak_update_stats();
1234 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1235 lck_mtx_unlock(mleak_lock);
1236
1237 return i;
1238 }
1239
1240 static int
1241 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1242 {
1243 #pragma unused(oidp, arg1, arg2)
1244 int i = 0;
1245
1246 /* Ensure leak tracing turned on */
1247 if (!mclfindleak || !mclexpleak) {
1248 return ENXIO;
1249 }
1250
1251 lck_mtx_lock(mleak_lock);
1252 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
1253 lck_mtx_unlock(mleak_lock);
1254
1255 return i;
1256 }
1257
/*
 * Atomically increment the external cluster reference count of an
 * M_EXT mbuf using a compare-and-swap loop; 16-bit wraparound is
 * caught by VERIFY.
 */
static inline void
m_incref(struct mbuf *m)
{
	UInt16 old, new;
	volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
		VERIFY(new != 0);	/* refcount overflow */
	} while (!OSCompareAndSwap16(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		(void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
	}
}
1279
1280 static inline u_int16_t
m_decref(struct mbuf * m)1281 m_decref(struct mbuf *m)
1282 {
1283 UInt16 old, new;
1284 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1285
1286 do {
1287 old = *addr;
1288 new = old - 1;
1289 VERIFY(old != 0);
1290 } while (!OSCompareAndSwap16(old, new, addr));
1291
1292 return new;
1293 }
1294
/*
 * Carve the global mbuf cluster pool (nmbclusters, in 2KB units) into
 * the per-class regions and initialize the mbuf_table limits, sizes
 * and names, the exportable mb_stat/omb_stat structures, and the
 * legacy mbstat structure.  Called once from mbinit().
 */
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m, config_mbuf_jumbo = 0;

	/* Permanent (never freed) allocations for the sysctl-exported stats. */
	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

#if CONFIG_MBUF_JUMBO
	config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

	if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
		/*
		 * Set aside 1/3 of the mbuf cluster map for jumbo
		 * clusters; we do this only on platforms where jumbo
		 * cluster pool is enabled.
		 */
		njcl = nmbclusters / 3;
		njclbytes = M16KCLBYTES;
	}

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size. On these platforms, the remaining is used for 2KB
	 * and 4KB clusters. On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
	 * cluster can either be splitted into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1428
1429 int
mbuf_get_class(struct mbuf * m)1430 mbuf_get_class(struct mbuf *m)
1431 {
1432 if (m->m_flags & M_EXT) {
1433 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
1434 m_ext_free_func_t m_free_func = m_get_ext_free(m);
1435
1436 if (m_free_func == NULL) {
1437 if (composite) {
1438 return MC_MBUF_CL;
1439 } else {
1440 return MC_CL;
1441 }
1442 } else if (m_free_func == m_bigfree) {
1443 if (composite) {
1444 return MC_MBUF_BIGCL;
1445 } else {
1446 return MC_BIGCL;
1447 }
1448 } else if (m_free_func == m_16kfree) {
1449 if (composite) {
1450 return MC_MBUF_16KCL;
1451 } else {
1452 return MC_16KCL;
1453 }
1454 }
1455 }
1456
1457 return MC_MBUF;
1458 }
1459
/*
 * Return true when the allocation class of the given mbuf has crossed
 * the configured memory-pressure threshold (a percentage of the class
 * maximum).  Performs a cheap check first, then a more precise one
 * that also accounts for per-CPU cached objects.
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	int mclass = mbuf_get_class(m);

	/* Fast path: total minus freelist vs. percentage of the class max. */
	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		/*
		 * The above computation does not include the per-CPU cached objects.
		 * As a fast-path check this is good-enough. But now we do
		 * the "slower" count of the cached objects to know exactly the
		 * number of active mbufs in use.
		 *
		 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
		 * might be slightly off but we don't try to be 100% accurate.
		 * At worst, we drop a packet that we shouldn't have dropped or
		 * we might go slightly above our memory-pressure threshold.
		 */
		mcache_t *cp = m_cache(mclass);
		mcache_cpu_t *ccp = &cp->mc_cpu[0];

		/* Unlocked snapshot reads; values may be concurrently updated. */
		int bktsize = os_access_once(ccp->cc_bktsize);
		uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
		uint32_t cached = 0;
		int i;

		for (i = 0; i < ncpu; i++) {
			ccp = &cp->mc_cpu[i];

			int cc_objs = os_access_once(ccp->cc_objs);
			if (cc_objs > 0) {
				cached += cc_objs;
			}

			int cc_pobjs = os_access_once(ccp->cc_pobjs);
			if (cc_pobjs > 0) {
				cached += cc_pobjs;
			}
		}
		/* Add objects parked in full buckets in the bucket layer. */
		cached += (bl_total * bktsize);

		/* Precise check: also deduct the cached objects counted above. */
		if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
			os_log(OS_LOG_DEFAULT,
			    "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
			    __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
			return true;
		}
	}

	return false;
}
1510
#if defined(__LP64__)
/*
 * Table mapping installed memory size to the default mbuf pool size
 * (64-bit kernels only).  Scanned in order by mbuf_default_ncl();
 * terminated by an all-zero sentinel entry.
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
1526
1527 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)1528 mbuf_default_ncl(uint64_t mem)
1529 {
1530 #if !defined(__LP64__)
1531 unsigned int n;
1532 /*
1533 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1534 */
1535 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
1536 n = 32768;
1537 }
1538 #else
1539 unsigned int n, i;
1540 /*
1541 * 64-bit kernel (mbuf pool size based on table).
1542 */
1543 n = ncl_table[0].nt_mbpool;
1544 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
1545 if (mem < ncl_table[i].nt_maxmem) {
1546 break;
1547 }
1548 n = ncl_table[i].nt_mbpool;
1549 }
1550 n >>= MCLSHIFT;
1551 #endif /* !__LP64__ */
1552 return n;
1553 }
1554
1555 __private_extern__ void
mbinit(void)1556 mbinit(void)
1557 {
1558 unsigned int m;
1559 unsigned int initmcl = 0;
1560 thread_t thread = THREAD_NULL;
1561
1562 microuptime(&mb_start);
1563
1564 /*
1565 * These MBUF_ values must be equal to their private counterparts.
1566 */
1567 _CASSERT(MBUF_EXT == M_EXT);
1568 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1569 _CASSERT(MBUF_EOR == M_EOR);
1570 _CASSERT(MBUF_LOOP == M_LOOP);
1571 _CASSERT(MBUF_BCAST == M_BCAST);
1572 _CASSERT(MBUF_MCAST == M_MCAST);
1573 _CASSERT(MBUF_FRAG == M_FRAG);
1574 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1575 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1576 _CASSERT(MBUF_PROMISC == M_PROMISC);
1577 _CASSERT(MBUF_HASFCS == M_HASFCS);
1578
1579 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1580 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1581 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1582 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1583 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1584 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1585 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1586 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1587 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1588 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1589 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1590 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1591 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1592 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1593 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1594
1595 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1596 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1597 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1598 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1599 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1600 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1601 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1602 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1603 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1604 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1605 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1606 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1607 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1608 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1609
1610 _CASSERT(MBUF_WAITOK == M_WAIT);
1611 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1612 _CASSERT(MBUF_COPYALL == M_COPYALL);
1613
1614 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1615 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1616 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1617 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1618 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1619 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1620 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1621 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1622 _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
1623 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1624 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1625
1626 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1627 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1628 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1629 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1630
1631 /* Module specific scratch space (32-bit alignment requirement) */
1632 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1633 sizeof(uint32_t)));
1634
1635 /* pktdata needs to start at 128-bit offset! */
1636 _CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
1637
1638 /* Initialize random red zone cookie value */
1639 _CASSERT(sizeof(mb_redzone_cookie) ==
1640 sizeof(((struct pkthdr *)0)->redzone));
1641 read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
1642 read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
1643 read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
1644 mb_obscure_extref |= 0x3;
1645 mb_obscure_extfree |= 0x3;
1646
1647 /* Make sure we don't save more than we should */
1648 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
1649
1650 if (nmbclusters == 0) {
1651 nmbclusters = NMBCLUSTERS;
1652 }
1653
1654 /* This should be a sane (at least even) value by now */
1655 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1656
1657 /* Setup the mbuf table */
1658 mbuf_table_init();
1659
1660 /*
1661 * Allocate cluster slabs table:
1662 *
1663 * maxslabgrp = (N * 2048) / (1024 * 1024)
1664 *
1665 * Where N is nmbclusters rounded up to the nearest 512. This yields
1666 * mcl_slab_g_t units, each one representing a MB of memory.
1667 */
1668 maxslabgrp =
1669 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1670 slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
1671 ZALIGN(mcl_slabg_t));
1672
1673 /*
1674 * Allocate audit structures, if needed:
1675 *
1676 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1677 *
1678 * This yields mcl_audit_t units, each one representing a page.
1679 */
1680 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
1681 mbuf_debug |= mcache_getflags();
1682 if (mbuf_debug & MCF_DEBUG) {
1683 int l;
1684 mcl_audit_t *mclad;
1685 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1686 mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
1687 ZALIGN(mcl_audit_t));
1688 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1689 mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
1690 ZALIGN_PTR);
1691 }
1692
1693 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1694 AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
1695 VERIFY(mcl_audit_con_cache != NULL);
1696 }
1697 mclverify = (mbuf_debug & MCF_VERIFY);
1698 mcltrace = (mbuf_debug & MCF_TRACE);
1699 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1700 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1701
1702 /* Enable mbuf leak logging, with a lock to protect the tables */
1703
1704 mleak_activate();
1705
1706 /*
1707 * Allocate structure for per-CPU statistics that's aligned
1708 * on the CPU cache boundary; this code assumes that we never
1709 * uninitialize this framework, since the original address
1710 * before alignment is not saved.
1711 */
1712 ncpu = ml_wait_max_cpus();
1713
1714 /* Calculate the number of pages assigned to the cluster pool */
1715 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1716 mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
1717 ZALIGN(ppnum_t));
1718
1719 /* Register with the I/O Bus mapper */
1720 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1721
1722 embutl = (mbutl + (nmbclusters * MCLBYTES));
1723 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1724
1725 /* Prime up the freelist */
1726 PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
1727 if (initmcl != 0) {
1728 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1729 if (initmcl > m_maxlimit(MC_BIGCL)) {
1730 initmcl = m_maxlimit(MC_BIGCL);
1731 }
1732 }
1733 if (initmcl < m_minlimit(MC_BIGCL)) {
1734 initmcl = m_minlimit(MC_BIGCL);
1735 }
1736
1737 lck_mtx_lock(mbuf_mlock);
1738
1739 /*
1740 * For classes with non-zero minimum limits, populate their freelists
1741 * so that m_total(class) is at least m_minlimit(class).
1742 */
1743 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1744 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1745 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1746 freelist_init(m_class(MC_CL));
1747
1748 for (m = 0; m < NELEM(mbuf_table); m++) {
1749 /* Make sure we didn't miss any */
1750 VERIFY(m_minlimit(m_class(m)) == 0 ||
1751 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1752
1753 /* populate the initial sizes and report from there on */
1754 m_peak(m_class(m)) = m_total(m_class(m));
1755 }
1756 mb_peak_newreport = FALSE;
1757
1758 lck_mtx_unlock(mbuf_mlock);
1759
1760 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1761 NULL, &thread);
1762 thread_deallocate(thread);
1763
1764 ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
1765 0, 0, MCR_SLEEP);
1766
1767 /* Create the cache for each class */
1768 for (m = 0; m < NELEM(mbuf_table); m++) {
1769 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1770 u_int32_t flags;
1771
1772 flags = mbuf_debug;
1773 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1774 m_class(m) == MC_MBUF_16KCL) {
1775 allocfunc = mbuf_cslab_alloc;
1776 freefunc = mbuf_cslab_free;
1777 auditfunc = mbuf_cslab_audit;
1778 logfunc = mleak_logger;
1779 } else {
1780 allocfunc = mbuf_slab_alloc;
1781 freefunc = mbuf_slab_free;
1782 auditfunc = mbuf_slab_audit;
1783 logfunc = mleak_logger;
1784 }
1785
1786 /*
1787 * Disable per-CPU caches for jumbo classes if there
1788 * is no jumbo cluster pool available in the system.
1789 * The cache itself is still created (but will never
1790 * be populated) since it simplifies the code.
1791 */
1792 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1793 njcl == 0) {
1794 flags |= MCF_NOCPUCACHE;
1795 }
1796
1797 if (!mclfindleak) {
1798 flags |= MCF_NOLEAKLOG;
1799 }
1800
1801 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1802 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1803 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1804 }
1805
1806 /*
1807 * Set the max limit on sb_max to be 1/16 th of the size of
1808 * memory allocated for mbuf clusters.
1809 */
1810 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1811 if (high_sb_max < sb_max) {
1812 /* sb_max is too large for this configuration, scale it down */
1813 if (high_sb_max > (1 << MBSHIFT)) {
1814 /* We have atleast 16 M of mbuf pool */
1815 sb_max = high_sb_max;
1816 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1817 /*
1818 * If we have more than 1M of mbufpool, cap the size of
1819 * max sock buf at 1M
1820 */
1821 sb_max = high_sb_max = (1 << MBSHIFT);
1822 } else {
1823 sb_max = high_sb_max;
1824 }
1825 }
1826
1827 /* allocate space for mbuf_dump_buf */
1828 mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
1829
1830 if (mbuf_debug & MCF_DEBUG) {
1831 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1832 (int)_MLEN, (int)_MHLEN);
1833 }
1834
1835 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1836 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1837 (nclusters << MCLSHIFT) >> MBSHIFT,
1838 (njcl << MCLSHIFT) >> MBSHIFT);
1839 }
1840
/*
 * Obtain a slab of object(s) from the class's freelist.
 *
 * Called with mbuf_mlock held.  Returns one raw object carved out of a
 * slab belonging to the given class, or NULL if the class's slab list
 * is empty.  The slab's reference count is incremented, per-class
 * free counters are updated, and a slab whose internal free list has
 * drained is removed from the class's slab list.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
	    && (wait & MCR_COMP)) {
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	} else {
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	}

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return NULL;
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	/* Pop the first free chunk off the slab's internal free list */
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	sp->sl_head = buf->obj_next;
	/* Increment slab reference */
	sp->sl_refcnt++;

	/* An exhausted free list implies every chunk is now referenced */
	VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF) {
			mca->mca_uflags |= MB_SCVALID;
		}
	}

	/* Per-class accounting and invariant checks */
	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have NBCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
		    sp->sl_len == PAGE_SIZE &&
		    (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL) {
			_MCHECK((struct mbuf *)buf);
		}
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
		VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
		slab_remove(sp, class);
	}

	return buf;
}
1976
/*
 * Place a slab of object(s) back into a class's slab list.
 *
 * Called with mbuf_mlock held.  Decrements the owning slab's reference
 * count and chains the buffer back onto the slab's free list.  If this
 * leaves an mbuf/2KB/4KB slab completely unreferenced and the class is
 * above its minimum limit, the whole page is coalesced back into one
 * cluster of the "super" class (4KB or 16KB, whichever matches the
 * page size) so it can be re-split later on demand.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;
	boolean_t reinit_supercl = false;
	mbuf_class_t super_class;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);

	/*
	 * Synchronizing with m_clalloc, as it reads m_total, while we here
	 * are modifying m_total.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		/* msleep dropped and re-took the lock; re-assert ownership */
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;

	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	/* Per-class sanity checks on the slab receiving the buffer */
	if (class == MC_CL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A slab that has been splitted for 2KB clusters can have
		 * at most 1 outstanding reference at this point.
		 */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));

		/* A 4KB cluster slab can have NBCLPG references at most */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
		VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * A slab that has been splitted for mbufs has at most
		 * NMBPG reference counts.  Since we have decremented
		 * one reference above, it must now be between 0 and
		 * NMBPG-1.
		 */
		VERIFY(class == MC_MBUF);
		VERIFY(sp->sl_refcnt >= 0 &&
		    sp->sl_refcnt <= (NMBPG - 1) &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		if (mclverify) {
			mcache_audit_free_verify(mca, buf, 0,
			    m_maxsize(class));
		}
		mca->mca_uflags &= ~MB_SCVALID;
	}

	/* Update free counters and chain the buffer back onto the slab */
	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/*
	 * If a slab has been split to either one which holds 2KB clusters,
	 * or one which holds mbufs, turn it back to one which holds a
	 * 4 or 16 KB cluster depending on the page size.
	 */
	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
		super_class = MC_BIGCL;
	} else {
		VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
		super_class = MC_16KCL;
	}
	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NMBPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NMBPG;

		m_total(MC_MBUF) -= NMBPG;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPG;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPG));

		/* Unlink all NMBPG mbufs from the slab's free chain */
		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NCLPG;

		m_total(MC_CL) -= NCLPG;
		mbstat.m_clusters = m_total(MC_CL);
		m_infree(MC_CL) -= NCLPG;

		/* Unlink all NCLPG 2KB clusters from the slab's free chain */
		while (i--) {
			union mcluster *c = sp->sl_head;
			VERIFY(c != NULL);
			sp->sl_head = c->mcl_next;
			c->mcl_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
	    sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NBCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NBCLPG;

		VERIFY(super_class == MC_16KCL);
		m_total(MC_BIGCL) -= NBCLPG;
		mbstat.m_bigclusters = m_total(MC_BIGCL);
		m_infree(MC_BIGCL) -= NBCLPG;

		/* Unlink all NBCLPG 4KB clusters from the slab's free chain */
		while (i--) {
			union mbigcluster *bc = sp->sl_head;
			VERIFY(bc != NULL);
			sp->sl_head = bc->mbc_next;
			bc->mbc_next = NULL;
		}
		reinit_supercl = true;
	}

	if (reinit_supercl) {
		VERIFY(sp->sl_head == NULL);
		VERIFY(m_total(class) >= m_minlimit(class));
		slab_remove(sp, class);

		/* Reinitialize it as a cluster for the super class */
		m_total(super_class)++;
		m_infree(super_class)++;
		VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
		    sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);

		slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
		    sp->sl_base, PAGE_SIZE, 0, 1);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_base, sp->sl_len);
		}
		((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
		}

		VERIFY(slab_is_detached(sp));
		VERIFY(m_total(super_class) <= m_maxlimit(super_class));

		/* And finally switch class */
		class = super_class;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp)) {
		slab_insert(sp, class);
	}

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
}
2206
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 *
 * arg encodes the mbuf class; *plist receives the chain of allocated
 * objects; num is the requested count; wait carries the MCR_* flags.
 * Returns the number of objects actually obtained, which may be fewer
 * than requested (including zero) when memory is tight and the caller
 * cannot or will not sleep.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	ASSERT(need > 0);

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < (m_total(class) >> 5)) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			/* Freelist empty; try to grow it by one unit */
			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0) {
				continue;
			}

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait)) {
				break;
			}

			/* watchdog checkpoint */
			mbuf_watchdog();

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					mbstat.m_drops++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait)) {
				break;
			}

			LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return num - need;
}
2293
2294 /*
2295 * Common de-allocator for rudimentary objects called by the CPU cache
2296 * layer when one or more elements need to be returned to the appropriate
2297 * global freelist.
2298 */
2299 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)2300 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2301 {
2302 mbuf_class_t class = (mbuf_class_t)arg;
2303 mcache_obj_t *nlist;
2304 unsigned int num = 0;
2305 int w;
2306
2307 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2308
2309 lck_mtx_lock(mbuf_mlock);
2310
2311 for (;;) {
2312 nlist = list->obj_next;
2313 list->obj_next = NULL;
2314 slab_free(class, list);
2315 ++num;
2316 if ((list = nlist) == NULL) {
2317 break;
2318 }
2319 }
2320 m_free_cnt(class) += num;
2321
2322 if ((w = mb_waiters) > 0) {
2323 mb_waiters = 0;
2324 }
2325 if (w) {
2326 mbwdog_logger("waking up all threads");
2327 }
2328 lck_mtx_unlock(mbuf_mlock);
2329
2330 if (w != 0) {
2331 wakeup(mb_waitchan);
2332 }
2333 }
2334
2335 /*
2336 * Common auditor for rudimentary objects called by the CPU cache layer
2337 * during an allocation or free request. For the former, this is called
2338 * after the objects are obtained from either the bucket or slab layer
2339 * and before they are returned to the caller. For the latter, this is
2340 * called immediately during free and before placing the objects into
2341 * the bucket or slab layer.
2342 */
2343 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)2344 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2345 {
2346 mbuf_class_t class = (mbuf_class_t)arg;
2347 mcache_audit_t *mca;
2348
2349 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2350
2351 while (list != NULL) {
2352 lck_mtx_lock(mbuf_mlock);
2353 mca = mcl_audit_buf2mca(class, list);
2354
2355 /* Do the sanity checks */
2356 if (class == MC_MBUF) {
2357 mcl_audit_mbuf(mca, list, FALSE, alloc);
2358 ASSERT(mca->mca_uflags & MB_SCVALID);
2359 } else {
2360 mcl_audit_cluster(mca, list, m_maxsize(class),
2361 alloc, TRUE);
2362 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2363 }
2364 /* Record this transaction */
2365 if (mcltrace) {
2366 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2367 }
2368
2369 if (alloc) {
2370 mca->mca_uflags |= MB_INUSE;
2371 } else {
2372 mca->mca_uflags &= ~MB_INUSE;
2373 }
2374 /* Unpair the object (unconditionally) */
2375 mca->mca_uptr = NULL;
2376 lck_mtx_unlock(mbuf_mlock);
2377
2378 list = list->obj_next;
2379 }
2380 }
2381
2382 /*
2383 * Common notify routine for all caches. It is called by mcache when
2384 * one or more objects get freed. We use this indication to trigger
2385 * the wakeup of any sleeping threads so that they can retry their
2386 * allocation requests.
2387 */
2388 static void
mbuf_slab_notify(void * arg,u_int32_t reason)2389 mbuf_slab_notify(void *arg, u_int32_t reason)
2390 {
2391 mbuf_class_t class = (mbuf_class_t)arg;
2392 int w;
2393
2394 ASSERT(MBUF_CLASS_VALID(class));
2395
2396 if (reason != MCN_RETRYALLOC) {
2397 return;
2398 }
2399
2400 lck_mtx_lock(mbuf_mlock);
2401 if ((w = mb_waiters) > 0) {
2402 m_notified(class)++;
2403 mb_waiters = 0;
2404 }
2405 if (w) {
2406 mbwdog_logger("waking up all threads");
2407 }
2408 lck_mtx_unlock(mbuf_mlock);
2409
2410 if (w != 0) {
2411 wakeup(mb_waitchan);
2412 }
2413 }
2414
/*
 * Obtain object(s) from the composite class's freelist.
 *
 * Called with mbuf_mlock held.  Transfers up to num already-constructed
 * composite objects (mbuf + cluster pairs) from the class's freelist
 * onto *plist, sanity-checking each along the way, and returns the
 * number actually transferred.  The caller's m_infree accounting is
 * updated here.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(need > 0);
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		MRANGE(*list);

		m = (struct mbuf *)*list;
		sp = slab_get(m);
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		if (class == MC_MBUF_16KCL) {
			int k;
			/* A 16KB cluster spans NSLABSP16KB slabs; check each */
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/* Advance the freelist head, panicking on a corrupt link */
		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0) {
			break;
		}
	}
	m_infree(class) -= (num - need);

	return num - need;
}
2476
/*
 * Place object(s) back into a composite class's freelist.
 *
 * Called with mbuf_mlock held.  When purged is false the entire chain
 * is prepended to the composite class's freelist intact.  When purged
 * is true each composite object is torn apart: the mbuf and its
 * attached cluster are returned to their respective rudimentary
 * freelists, and the ext_ref structures are handed back to ref_cache.
 * Returns the number of objects processed.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;
	mbuf_class_t cl_class;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Map the composite class to its cluster constituent's class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			if (mclverify) {
				mcache_audit_free_verify(mca, m, 0,
				    m_maxsize(MC_MBUF));
			}
			/* Checks below apply to the shadow mbuf instead */
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclverify) {
			size_t size = m_maxsize(cl_class);
			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (cl_class == MC_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}
		if (cl_class == MC_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL) {
				mcl_audit_restore_mbuf(m, mca, TRUE);
			}

			MEXT_MINREF(m) = 0;
			MEXT_REF(m) = 0;
			MEXT_PREF(m) = 0;
			MEXT_FLAGS(m) = 0;
			MEXT_PRIV(m) = 0;
			MEXT_PMBUF(m) = NULL;
			MEXT_TOKEN(m) = 0;

			/* Detach the ext_ref and collect it for ref_cache */
			rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
			m_set_ext(m, NULL, NULL, NULL);
			rfa->obj_next = ref_list;
			ref_list = rfa;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL) {
				mcl_audit_mbuf(mca, o, FALSE, FALSE);
			}

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL) {
				slab_free(MC_CL, cl);
			} else if (class == MC_MBUF_BIGCL) {
				slab_free(MC_BIGCL, cl);
			} else {
				slab_free(MC_16KCL, cl);
			}
		}

		++num;
		tail = o;
		o = nexto;
	}

	if (!purged) {
		/* Not purging: prepend the whole chain to the freelist */
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return num;
}
2615
/*
 * Common allocator for composite objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in
 * the bucket layer.  It returns one or more composite elements from the
 * appropriate global freelist.  If the freelist is empty, it will attempt
 * to obtain the rudimentary objects from their caches and construct them
 * into composite mbuf + cluster objects.
 *
 * arg encodes the composite class; *plist receives the chain of
 * objects; needed is the requested count; wait carries the MCR_*
 * flags.  Returns the number of composite objects delivered, which
 * may be fewer than requested when any of the constituent caches
 * (mbuf, cluster, ext_ref) runs dry.
 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mbuf_class_t cl_class = 0;
	unsigned int num = 0, cnum = 0, want = needed;
	mcache_obj_t *ref_list = NULL;
	mcache_obj_t *mp_list = NULL;
	mcache_obj_t *clp_list = NULL;
	mcache_obj_t **list;
	struct ext_ref *rfa;
	struct mbuf *m;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	ASSERT(needed > 0);

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);

	/* There should not be any slab for this class */
	VERIFY(m_slab_cnt(class) == 0 &&
	    m_slablist(class).tqh_first == NULL &&
	    m_slablist(class).tqh_last == NULL);

	lck_mtx_lock(mbuf_mlock);

	/* Try using the freelist first */
	num = cslab_alloc(class, plist, needed);
	list = *plist;
	if (num == needed) {
		m_alloc_cnt(class) += num;
		lck_mtx_unlock(mbuf_mlock);
		return needed;
	}

	lck_mtx_unlock(mbuf_mlock);

	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */
	needed -= num;

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
	wait |= MCR_COMP;
	if (!(wait & MCR_NOSLEEP)) {
		wait |= MCR_FAILOK;
	}

	/* allocate mbufs */
	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
	if (needed == 0) {
		ASSERT(mp_list == NULL);
		goto fail;
	}

	/* allocate clusters */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}
	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
	if (needed == 0) {
		ASSERT(clp_list == NULL);
		goto fail;
	}

	/* allocate ext_ref structures to pair with each mbuf + cluster */
	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
	if (needed == 0) {
		ASSERT(ref_list == NULL);
		goto fail;
	}

	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
		struct mbuf *ms;

		m = ms = (struct mbuf *)mp_list;
		mp_list = mp_list->obj_next;

		cl = clp_list;
		clp_list = clp_list->obj_next;
		((mcache_obj_t *)cl)->obj_next = NULL;

		rfa = (struct ext_ref *)ref_list;
		ref_list = ref_list->obj_next;
		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;

		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;

			lck_mtx_lock(mbuf_mlock);
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			ms = MCA_SAVED_MBUF_PTR(mca);
			cl_mca = mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl);

			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;

			ASSERT(mca->mca_uflags & MB_SCVALID);
			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
			lck_mtx_unlock(mbuf_mlock);

			/* Technically, they are in the freelist */
			if (mclverify) {
				size_t size;

				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
				    m_maxsize(MC_MBUF));

				if (class == MC_MBUF_CL) {
					size = m_maxsize(MC_CL);
				} else if (class == MC_MBUF_BIGCL) {
					size = m_maxsize(MC_BIGCL);
				} else {
					size = m_maxsize(MC_16KCL);
				}

				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
				    size);
			}
		}

		/* Construct the composite object and chain it onto *plist */
		MBUF_INIT(ms, 0, MT_FREE);
		if (class == MC_MBUF_16KCL) {
			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else if (class == MC_MBUF_BIGCL) {
			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else {
			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		}
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));

		*list = (mcache_obj_t *)m;
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;
	}

fail:
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (clp_list != NULL) {
		mcache_free_ext(m_cache(cl_class), clp_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	lck_mtx_lock(mbuf_mlock);
	if (num > 0 || cnum > 0) {
		m_total(class) += cnum;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_alloc_cnt(class) += num + cnum;
	}
	if ((num + cnum) < want) {
		m_fail_cnt(class) += (want - (num + cnum));
	}
	lck_mtx_unlock(mbuf_mlock);

	return num + cnum;
}
2820
2821 /*
2822 * Common de-allocator for composite objects called by the CPU cache
2823 * layer when one or more elements need to be returned to the appropriate
2824 * global freelist.
2825 */
2826 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)2827 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2828 {
2829 mbuf_class_t class = (mbuf_class_t)arg;
2830 unsigned int num;
2831 int w;
2832
2833 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2834
2835 lck_mtx_lock(mbuf_mlock);
2836
2837 num = cslab_free(class, list, purged);
2838 m_free_cnt(class) += num;
2839
2840 if ((w = mb_waiters) > 0) {
2841 mb_waiters = 0;
2842 }
2843 if (w) {
2844 mbwdog_logger("waking up all threads");
2845 }
2846
2847 lck_mtx_unlock(mbuf_mlock);
2848
2849 if (w != 0) {
2850 wakeup(mb_waitchan);
2851 }
2852 }
2853
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 *
 * Walks the chain of composite mbufs, sanity-checking both the mbuf
 * and its attached cluster and recording the transaction in each
 * object's audit structure.  The mbuf lock is taken per element.
 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
	mcache_audit_t *mca;
	struct mbuf *m, *ms;
	mcl_slab_t *clsp, *nsp;
	size_t cl_size;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	/* Map the composite class to its raw cluster constituent */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		cl_class = MC_16KCL;
	}
	cl_size = m_maxsize(cl_class);

	while ((m = ms = (struct mbuf *)list) != NULL) {
		lck_mtx_lock(mbuf_mlock);
		/* Do the mbuf sanity checks and record its transaction */
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		mcl_audit_mbuf(mca, m, TRUE, alloc);
		if (mcltrace) {
			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
		}

		/* Track whether the composite pair is currently in use */
		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}

		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf has been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
		if (!alloc && mclverify) {
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks and record its transaction */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(ms->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		if (class == MC_MBUF_16KCL) {
			int k;
			/*
			 * A 16KB cluster spans NSLABSP16KB slabs; the
			 * 2nd..Nth slabs must already be chained in.
			 */
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}


		mca = mcl_audit_buf2mca(cl_class, cl);
		mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
		if (mcltrace) {
			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
		}

		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
2946
2947 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)2948 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2949 uint64_t alloc_size, kern_return_t error)
2950 {
2951 *cnt = *cnt + 1;
2952 *ts = net_uptime();
2953 if (size) {
2954 *size = alloc_size;
2955 }
2956 switch (error) {
2957 case KERN_SUCCESS:
2958 break;
2959 case KERN_INVALID_ARGUMENT:
2960 mb_kmem_stats[0]++;
2961 break;
2962 case KERN_INVALID_ADDRESS:
2963 mb_kmem_stats[1]++;
2964 break;
2965 case KERN_RESOURCE_SHORTAGE:
2966 mb_kmem_stats[2]++;
2967 break;
2968 case KERN_NO_SPACE:
2969 mb_kmem_stats[3]++;
2970 break;
2971 case KERN_FAILURE:
2972 mb_kmem_stats[4]++;
2973 break;
2974 default:
2975 mb_kmem_stats[5]++;
2976 break;
2977 }
2978 }
2979
2980 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)2981 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
2982 {
2983 vm_offset_t addr = 0;
2984 kern_return_t kr = KERN_SUCCESS;
2985
2986 if (!physContig) {
2987 kr = kmem_alloc(mbmap, &addr, size,
2988 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2989 } else {
2990 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
2991 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2992 }
2993
2994 if (kr != KERN_SUCCESS) {
2995 addr = 0;
2996 }
2997 if (err) {
2998 *err = kr;
2999 }
3000
3001 return addr;
3002 }
3003
3004 /*
3005 * Allocate some number of mbuf clusters and place on cluster freelist.
3006 */
3007 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)3008 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3009 {
3010 int i, count = 0;
3011 vm_size_t size = 0;
3012 int numpages = 0, large_buffer;
3013 vm_offset_t page = 0;
3014 mcache_audit_t *mca_list = NULL;
3015 mcache_obj_t *con_list = NULL;
3016 mcl_slab_t *sp;
3017 mbuf_class_t class;
3018 kern_return_t error;
3019
3020 /* Set if a buffer allocation needs allocation of multiple pages */
3021 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3022 PAGE_SIZE < M16KCLBYTES);
3023 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3024 bufsize == m_maxsize(MC_16KCL));
3025
3026 VERIFY((bufsize == PAGE_SIZE) ||
3027 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3028
3029 if (bufsize == m_size(MC_BIGCL)) {
3030 class = MC_BIGCL;
3031 } else {
3032 class = MC_16KCL;
3033 }
3034
3035 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3036
3037 /*
3038 * Multiple threads may attempt to populate the cluster map one
3039 * after another. Since we drop the lock below prior to acquiring
3040 * the physical page(s), our view of the cluster map may no longer
3041 * be accurate, and we could end up over-committing the pages beyond
3042 * the maximum allowed for each class. To prevent it, this entire
3043 * operation (including the page mapping) is serialized.
3044 */
3045 while (mb_clalloc_busy) {
3046 mb_clalloc_waiters++;
3047 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3048 (PZERO - 1), "m_clalloc", NULL);
3049 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3050 }
3051
3052 /* We are busy now; tell everyone else to go away */
3053 mb_clalloc_busy = TRUE;
3054
3055 /*
3056 * Honor the caller's wish to block or not block. We have a way
3057 * to grow the pool asynchronously using the mbuf worker thread.
3058 */
3059 i = m_howmany(num, bufsize);
3060 if (i <= 0 || (wait & M_DONTWAIT)) {
3061 goto out;
3062 }
3063
3064 lck_mtx_unlock(mbuf_mlock);
3065
3066 size = round_page(i * bufsize);
3067 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3068
3069 /*
3070 * If we did ask for "n" 16KB physically contiguous chunks
3071 * and didn't get them, then please try again without this
3072 * restriction.
3073 */
3074 net_update_uptime();
3075 if (large_buffer && page == 0) {
3076 m_vm_error_stats(&mb_kmem_contig_failed,
3077 &mb_kmem_contig_failed_ts,
3078 &mb_kmem_contig_failed_size,
3079 size, error);
3080 page = kmem_mb_alloc(mb_map, size, 0, &error);
3081 }
3082
3083 if (page == 0) {
3084 m_vm_error_stats(&mb_kmem_failed,
3085 &mb_kmem_failed_ts,
3086 &mb_kmem_failed_size,
3087 size, error);
3088 #if PAGE_SIZE == 4096
3089 if (bufsize == m_maxsize(MC_BIGCL)) {
3090 #else
3091 if (bufsize >= m_maxsize(MC_BIGCL)) {
3092 #endif
3093 /* Try for 1 page if failed */
3094 size = PAGE_SIZE;
3095 page = kmem_mb_alloc(mb_map, size, 0, &error);
3096 if (page == 0) {
3097 m_vm_error_stats(&mb_kmem_one_failed,
3098 &mb_kmem_one_failed_ts,
3099 NULL, size, error);
3100 }
3101 }
3102
3103 if (page == 0) {
3104 lck_mtx_lock(mbuf_mlock);
3105 goto out;
3106 }
3107 }
3108
3109 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3110 numpages = size / PAGE_SIZE;
3111
3112 /* If auditing is enabled, allocate the audit structures now */
3113 if (mclaudit != NULL) {
3114 int needed;
3115
3116 /*
3117 * Yes, I realize this is a waste of memory for clusters
3118 * that never get transformed into mbufs, as we may end
3119 * up with NMBPG-1 unused audit structures per cluster.
3120 * But doing so tremendously simplifies the allocation
3121 * strategy, since at this point we are not holding the
3122 * mbuf lock and the caller is okay to be blocked.
3123 */
3124 if (bufsize == PAGE_SIZE) {
3125 needed = numpages * NMBPG;
3126
3127 i = mcache_alloc_ext(mcl_audit_con_cache,
3128 &con_list, needed, MCR_SLEEP);
3129
3130 VERIFY(con_list != NULL && i == needed);
3131 } else {
3132 /*
3133 * if multiple 4K pages are being used for a
3134 * 16K cluster
3135 */
3136 needed = numpages / NSLABSP16KB;
3137 }
3138
3139 i = mcache_alloc_ext(mcache_audit_cache,
3140 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3141
3142 VERIFY(mca_list != NULL && i == needed);
3143 }
3144
3145 lck_mtx_lock(mbuf_mlock);
3146
3147 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3148 ppnum_t offset =
3149 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3150 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3151
3152 /*
3153 * If there is a mapper the appropriate I/O page is
3154 * returned; zero out the page to discard its past
3155 * contents to prevent exposing leftover kernel memory.
3156 */
3157 VERIFY(offset < mcl_pages);
3158 if (mcl_paddr_base != 0) {
3159 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3160 new_page = IOMapperInsertPage(mcl_paddr_base,
3161 offset, new_page);
3162 }
3163 mcl_paddr[offset] = new_page;
3164
3165 /* Pattern-fill this fresh page */
3166 if (mclverify) {
3167 mcache_set_pattern(MCACHE_FREE_PATTERN,
3168 (caddr_t)page, PAGE_SIZE);
3169 }
3170 if (bufsize == PAGE_SIZE) {
3171 mcache_obj_t *buf;
3172 /* One for the entire page */
3173 sp = slab_get((void *)page);
3174 if (mclaudit != NULL) {
3175 mcl_audit_init((void *)page,
3176 &mca_list, &con_list,
3177 AUDIT_CONTENTS_SIZE, NMBPG);
3178 }
3179 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3180 slab_init(sp, class, SLF_MAPPED, (void *)page,
3181 (void *)page, PAGE_SIZE, 0, 1);
3182 buf = (mcache_obj_t *)page;
3183 buf->obj_next = NULL;
3184
3185 /* Insert this slab */
3186 slab_insert(sp, class);
3187
3188 /* Update stats now since slab_get drops the lock */
3189 ++m_infree(class);
3190 ++m_total(class);
3191 VERIFY(m_total(class) <= m_maxlimit(class));
3192 if (class == MC_BIGCL) {
3193 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3194 m_infree(MC_MBUF_BIGCL);
3195 mbstat.m_bigclusters = m_total(MC_BIGCL);
3196 }
3197 ++count;
3198 } else if ((bufsize > PAGE_SIZE) &&
3199 (i % NSLABSP16KB) == 0) {
3200 union m16kcluster *m16kcl = (union m16kcluster *)page;
3201 mcl_slab_t *nsp;
3202 int k;
3203
3204 /* One for the entire 16KB */
3205 sp = slab_get(m16kcl);
3206 if (mclaudit != NULL) {
3207 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3208 }
3209
3210 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3211 slab_init(sp, MC_16KCL, SLF_MAPPED,
3212 m16kcl, m16kcl, bufsize, 0, 1);
3213 m16kcl->m16kcl_next = NULL;
3214
3215 /*
3216 * 2nd-Nth page's slab is part of the first one,
3217 * where N is NSLABSP16KB.
3218 */
3219 for (k = 1; k < NSLABSP16KB; k++) {
3220 nsp = slab_get(((union mbigcluster *)page) + k);
3221 VERIFY(nsp->sl_refcnt == 0 &&
3222 nsp->sl_flags == 0);
3223 slab_init(nsp, MC_16KCL,
3224 SLF_MAPPED | SLF_PARTIAL,
3225 m16kcl, NULL, 0, 0, 0);
3226 }
3227 /* Insert this slab */
3228 slab_insert(sp, MC_16KCL);
3229
3230 /* Update stats now since slab_get drops the lock */
3231 ++m_infree(MC_16KCL);
3232 ++m_total(MC_16KCL);
3233 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3234 ++count;
3235 }
3236 }
3237 VERIFY(mca_list == NULL && con_list == NULL);
3238
3239 if (!mb_peak_newreport && mbuf_report_usage(class)) {
3240 mb_peak_newreport = TRUE;
3241 }
3242
3243 /* We're done; let others enter */
3244 mb_clalloc_busy = FALSE;
3245 if (mb_clalloc_waiters > 0) {
3246 mb_clalloc_waiters = 0;
3247 wakeup(mb_clalloc_waitchan);
3248 }
3249
3250 return count;
3251 out:
3252 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3253
3254 mtracelarge_register(size);
3255
3256 /* We're done; let others enter */
3257 mb_clalloc_busy = FALSE;
3258 if (mb_clalloc_waiters > 0) {
3259 mb_clalloc_waiters = 0;
3260 wakeup(mb_clalloc_waitchan);
3261 }
3262
3263 /*
3264 * When non-blocking we kick a thread if we have to grow the
3265 * pool or if the number of free clusters is less than requested.
3266 */
3267 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3268 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3269 m_cname(class), i);
3270 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3271 mbuf_worker_needs_wakeup = FALSE;
3272 }
3273 if (class == MC_BIGCL) {
3274 if (i > 0) {
3275 /*
3276 * Remember total number of 4KB clusters needed
3277 * at this time.
3278 */
3279 i += m_total(MC_BIGCL);
3280 if (i > m_region_expand(MC_BIGCL)) {
3281 m_region_expand(MC_BIGCL) = i;
3282 }
3283 }
3284 if (m_infree(MC_BIGCL) >= num) {
3285 return 1;
3286 }
3287 } else {
3288 if (i > 0) {
3289 /*
3290 * Remember total number of 16KB clusters needed
3291 * at this time.
3292 */
3293 i += m_total(MC_16KCL);
3294 if (i > m_region_expand(MC_16KCL)) {
3295 m_region_expand(MC_16KCL) = i;
3296 }
3297 }
3298 if (m_infree(MC_16KCL) >= num) {
3299 return 1;
3300 }
3301 }
3302 return 0;
3303 }
3304
/*
 * Populate the global freelist of the corresponding buffer class.
 *
 * For classes whose object size is at least a page, this defers to
 * m_clalloc().  For sub-page classes, whole pages are allocated from
 * the page-sized "super" class and sliced into smaller objects whose
 * slabs are re-initialized and inserted into the class freelist.
 * Called with the mbuf lock held.  Returns nonzero if at least one
 * page was converted.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
	mcache_obj_t *o = NULL;
	int i, numpages = 0, count;
	mbuf_class_t super_class;

	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
	    class == MC_16KCL);

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
	    PAGE_SIZE == m_maxsize(MC_16KCL));

	/* Page-or-larger classes go straight to the page allocator */
	if (m_maxsize(class) >= PAGE_SIZE) {
		return m_clalloc(num, wait, m_maxsize(class)) != 0;
	}

	/*
	 * The rest of the function will allocate pages and will slice
	 * them up into the right size
	 */

	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;

	/* Currently assume that pages are 4K or 16K */
	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
		super_class = MC_BIGCL;
	} else {
		super_class = MC_16KCL;
	}

	/*
	 * Grow the page pool; the return value is intentionally unused
	 * since availability is re-checked via slab_alloc() below.
	 */
	i = m_clalloc(numpages, wait, m_maxsize(super_class));

	/* how many objects will we cut the page into? */
	int numobj = PAGE_SIZE / m_maxsize(class);

	for (count = 0; count < numpages; count++) {
		/* respect totals, minlimit, maxlimit */
		if (m_total(super_class) <= m_minlimit(super_class) ||
		    m_total(class) >= m_maxlimit(class)) {
			break;
		}

		if ((o = slab_alloc(super_class, wait)) == NULL) {
			break;
		}

		struct mbuf *m = (struct mbuf *)o;
		union mcluster *c = (union mcluster *)o;
		union mbigcluster *mbc = (union mbigcluster *)o;
		mcl_slab_t *sp = slab_get(o);
		mcache_audit_t *mca = NULL;

		/*
		 * since one full page will be converted to MC_MBUF or
		 * MC_CL, verify that the reference count will match that
		 * assumption
		 */
		VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
		VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
		/*
		 * Make sure that the cluster is unmolested
		 * while in freelist
		 */
		if (mclverify) {
			mca = mcl_audit_buf2mca(super_class,
			    (mcache_obj_t *)o);
			mcache_audit_free_verify(mca,
			    (mcache_obj_t *)o, 0, m_maxsize(super_class));
		}

		/* Reinitialize it as an mbuf or 2K or 4K slab */
		slab_init(sp, class, sp->sl_flags,
		    sp->sl_base, NULL, PAGE_SIZE, 0, numobj);

		VERIFY(sp->sl_head == NULL);

		/* The page moves from super_class's books to class's */
		VERIFY(m_total(super_class) >= 1);
		m_total(super_class)--;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
		}

		m_total(class) += numobj;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_infree(class) += numobj;

		if (!mb_peak_newreport && mbuf_report_usage(class)) {
			mb_peak_newreport = TRUE;
		}

		/* Thread the new objects onto the slab's freelist */
		i = numobj;
		if (class == MC_MBUF) {
			mbstat.m_mbufs = m_total(MC_MBUF);
			mtype_stat_add(MT_FREE, NMBPG);
			while (i--) {
				/*
				 * If auditing is enabled, construct the
				 * shadow mbuf in the audit structure
				 * instead of the actual one.
				 * mbuf_slab_audit() will take care of
				 * restoring the contents after the
				 * integrity check.
				 */
				if (mclaudit != NULL) {
					struct mbuf *ms;
					mca = mcl_audit_buf2mca(MC_MBUF,
					    (mcache_obj_t *)m);
					ms = MCA_SAVED_MBUF_PTR(mca);
					ms->m_type = MT_FREE;
				} else {
					m->m_type = MT_FREE;
				}
				m->m_next = sp->sl_head;
				sp->sl_head = (void *)m++;
			}
		} else if (class == MC_CL) { /* MC_CL */
			mbstat.m_clfree =
			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
			mbstat.m_clusters = m_total(MC_CL);
			while (i--) {
				c->mcl_next = sp->sl_head;
				sp->sl_head = (void *)c++;
			}
		} else {
			VERIFY(class == MC_BIGCL);
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
			while (i--) {
				mbc->mbc_next = sp->sl_head;
				sp->sl_head = (void *)mbc++;
			}
		}

		/* Insert into the mbuf or 2k or 4k slab list */
		slab_insert(sp, class);

		/* Wake up any threads blocked waiting for mbufs */
		if ((i = mb_waiters) > 0) {
			mb_waiters = 0;
		}
		if (i != 0) {
			mbwdog_logger("waking up all threads");
			wakeup(mb_waitchan);
		}
	}
	return count != 0;
}
3459
3460 /*
3461 * For each class, initialize the freelist to hold m_minlimit() objects.
3462 */
3463 static void
3464 freelist_init(mbuf_class_t class)
3465 {
3466 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3467
3468 VERIFY(class == MC_CL || class == MC_BIGCL);
3469 VERIFY(m_total(class) == 0);
3470 VERIFY(m_minlimit(class) > 0);
3471
3472 while (m_total(class) < m_minlimit(class)) {
3473 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3474 }
3475
3476 VERIFY(m_total(class) >= m_minlimit(class));
3477 }
3478
3479 /*
3480 * (Inaccurately) check if it might be worth a trip back to the
3481 * mcache layer due the availability of objects there. We'll
3482 * end up back here if there's nothing up there.
3483 */
3484 static boolean_t
3485 mbuf_cached_above(mbuf_class_t class, int wait)
3486 {
3487 switch (class) {
3488 case MC_MBUF:
3489 if (wait & MCR_COMP) {
3490 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3491 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3492 }
3493 break;
3494
3495 case MC_CL:
3496 if (wait & MCR_COMP) {
3497 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
3498 }
3499 break;
3500
3501 case MC_BIGCL:
3502 if (wait & MCR_COMP) {
3503 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3504 }
3505 break;
3506
3507 case MC_16KCL:
3508 if (wait & MCR_COMP) {
3509 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
3510 }
3511 break;
3512
3513 case MC_MBUF_CL:
3514 case MC_MBUF_BIGCL:
3515 case MC_MBUF_16KCL:
3516 break;
3517
3518 default:
3519 VERIFY(0);
3520 /* NOTREACHED */
3521 }
3522
3523 return !mcache_bkt_isempty(m_cache(class));
3524 }
3525
3526 /*
3527 * If possible, convert constructed objects to raw ones.
3528 */
3529 static boolean_t
3530 mbuf_steal(mbuf_class_t class, unsigned int num)
3531 {
3532 mcache_obj_t *top = NULL;
3533 mcache_obj_t **list = ⊤
3534 unsigned int tot = 0;
3535
3536 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3537
3538 switch (class) {
3539 case MC_MBUF:
3540 case MC_CL:
3541 case MC_BIGCL:
3542 case MC_16KCL:
3543 return FALSE;
3544
3545 case MC_MBUF_CL:
3546 case MC_MBUF_BIGCL:
3547 case MC_MBUF_16KCL:
3548 /* Get the required number of constructed objects if possible */
3549 if (m_infree(class) > m_minlimit(class)) {
3550 tot = cslab_alloc(class, &list,
3551 MIN(num, m_infree(class)));
3552 }
3553
3554 /* And destroy them to get back the raw objects */
3555 if (top != NULL) {
3556 (void) cslab_free(class, top, 1);
3557 }
3558 break;
3559
3560 default:
3561 VERIFY(0);
3562 /* NOTREACHED */
3563 }
3564
3565 return tot == num;
3566 }
3567
/*
 * Reclaim memory on behalf of the given class: mark related classes
 * for purging, first trying to steal raw objects back from composite
 * freelists, and only purging (and temporarily disabling) the per-CPU
 * caches of classes where stealing fell short.  Called with the mbuf
 * lock held; the lock is dropped while draining domains and purging
 * caches, and re-acquired before returning.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		/* "comp" spares the class's own composite counterpart */
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class. If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes. Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < NELEM(mbuf_table); m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	lck_mtx_lock(mbuf_mlock);
}
3667
3668 static inline struct mbuf *
3669 m_get_common(int wait, short type, int hdr)
3670 {
3671 struct mbuf *m;
3672 int mcflags = MSLEEPF(wait);
3673
3674 /* Is this due to a non-blocking retry? If so, then try harder */
3675 if (mcflags & MCR_NOSLEEP) {
3676 mcflags |= MCR_TRYHARD;
3677 }
3678
3679 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3680 if (m != NULL) {
3681 MBUF_INIT(m, hdr, type);
3682 mtype_stat_inc(type);
3683 mtype_stat_dec(MT_FREE);
3684 }
3685 return m;
3686 }
3687
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 *
 * _M_GET/_M_GETHDR allocate a plain or packet-header mbuf via
 * m_get_common(); _M_RETRY/_M_RETRYHDR are aliases for the same,
 * and _MGET/_MGETHDR assign the result into the caller's `m'.
 */
#define _M_GET(wait, type) m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
#define _M_RETRY(wait, type) _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type) ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3698
struct mbuf *
m_get(int wait, int type)
{
	/* Same as _M_GET(): allocate an mbuf without a pkthdr */
	return m_get_common(wait, type, 0);
}
3704
struct mbuf *
m_gethdr(int wait, int type)
{
	/* Same as _M_GETHDR(): allocate an mbuf with a pkthdr */
	return m_get_common(wait, type, 1);
}
3710
struct mbuf *
m_retry(int wait, int type)
{
	/* _M_RETRY is an alias for _M_GET; kept for historical callers */
	return m_get_common(wait, type, 0);
}
3716
struct mbuf *
m_retryhdr(int wait, int type)
{
	/* _M_RETRYHDR is an alias for _M_GETHDR; kept for historical callers */
	return m_get_common(wait, type, 1);
}
3722
3723 struct mbuf *
3724 m_getclr(int wait, int type)
3725 {
3726 struct mbuf *m;
3727
3728 _MGET(m, wait, type);
3729 if (m != NULL) {
3730 bzero(MTOD(m, caddr_t), MLEN);
3731 }
3732 return m;
3733 }
3734
/*
 * Drop the paired-mbuf reference on an EXTF_PAIRED cluster.
 *
 * Returns 1 when the caller must NOT free the mbuf/cluster itself:
 * either other paired references remain, or this call consumed the
 * last one and already invoked the ext free routine.  Returns 0 once
 * the pair has been divorced (or was already divorced by another
 * thread), telling the caller to drop the cluster reference held for
 * the paired mbuf as usual.
 */
static int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	membar_sync();
	if (MEXT_PMBUF(m) == m) {
		volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
		int16_t oprefcnt, prefcnt;

		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		do {
			oprefcnt = *addr;
			prefcnt = oprefcnt - 1;
		} while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));

		if (prefcnt > 1) {
			/* Other paired references remain; nothing to do */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired reference: run the ext free routine */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable. atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			atomic_set_ptr(&MEXT_PMBUF(m), NULL);

			/* Restore the default free routine for the size */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
3808
/*
 * Free a single mbuf, releasing any attached packet-header metadata
 * and external storage.  Returns the chain successor (m_next) so
 * callers can iterate: while (m != NULL) m = m_free(m);
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m, NULL);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* Paired mbufs may be consumed entirely by m_free_paired() */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference: return the cluster to its cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			return n;
		}
	}

	/* Plain mbuf (or divorced from its cluster): free it by itself */
	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return n;
}
3901
/*
 * Attach external storage to an mbuf.  If "m" is NULL a fresh pkthdr
 * mbuf is allocated; if "m" already carries external storage, that
 * storage is released first (re-using its ext_ref where possible).
 * When "pair" is set the mbuf and cluster are bound together with
 * EXTF_PAIRED so the cluster holds a back-reference to the mbuf.
 * Returns NULL on allocation failure; a caller-supplied "m" is freed
 * only on the ext_ref allocation failure path.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster. Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference: return the old cluster to its cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/* Need a fresh ext_ref unless one was recycled above */
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		/* Paired: cluster keeps a back-pointer to this mbuf */
		MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
4000
4001 /*
4002 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
4003 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
4004 */
4005 struct mbuf *
4006 m_getcl(int wait, int type, int flags)
4007 {
4008 struct mbuf *m;
4009 int mcflags = MSLEEPF(wait);
4010 int hdr = (flags & M_PKTHDR);
4011
4012 /* Is this due to a non-blocking retry? If so, then try harder */
4013 if (mcflags & MCR_NOSLEEP) {
4014 mcflags |= MCR_TRYHARD;
4015 }
4016
4017 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4018 if (m != NULL) {
4019 u_int16_t flag;
4020 struct ext_ref *rfa;
4021 void *cl;
4022
4023 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4024 cl = m->m_ext.ext_buf;
4025 rfa = m_get_rfa(m);
4026
4027 ASSERT(cl != NULL && rfa != NULL);
4028 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4029
4030 flag = MEXT_FLAGS(m);
4031
4032 MBUF_INIT(m, hdr, type);
4033 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4034
4035 mtype_stat_inc(type);
4036 mtype_stat_dec(MT_FREE);
4037 }
4038 return m;
4039 }
4040
4041 /* m_mclget() add an mbuf cluster to a normal mbuf */
4042 struct mbuf *
4043 m_mclget(struct mbuf *m, int wait)
4044 {
4045 struct ext_ref *rfa;
4046
4047 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4048 return m;
4049 }
4050
4051 m->m_ext.ext_buf = m_mclalloc(wait);
4052 if (m->m_ext.ext_buf != NULL) {
4053 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4054 } else {
4055 mcache_free(ref_cache, rfa);
4056 }
4057 return m;
4058 }
4059
4060 /* Allocate an mbuf cluster */
4061 caddr_t
4062 m_mclalloc(int wait)
4063 {
4064 int mcflags = MSLEEPF(wait);
4065
4066 /* Is this due to a non-blocking retry? If so, then try harder */
4067 if (mcflags & MCR_NOSLEEP) {
4068 mcflags |= MCR_TRYHARD;
4069 }
4070
4071 return mcache_alloc(m_cache(MC_CL), mcflags);
4072 }
4073
/* Free an mbuf cluster back to the 2KB cluster cache */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
4080
4081 /*
4082 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
4083 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4084 */
4085 int
4086 m_mclhasreference(struct mbuf *m)
4087 {
4088 if (!(m->m_flags & M_EXT)) {
4089 return 0;
4090 }
4091
4092 ASSERT(m_get_rfa(m) != NULL);
4093
4094 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4095 }
4096
4097 __private_extern__ caddr_t
4098 m_bigalloc(int wait)
4099 {
4100 int mcflags = MSLEEPF(wait);
4101
4102 /* Is this due to a non-blocking retry? If so, then try harder */
4103 if (mcflags & MCR_NOSLEEP) {
4104 mcflags |= MCR_TRYHARD;
4105 }
4106
4107 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4108 }
4109
/* Free a 4KB mbuf cluster back to its cache; size and arg are unused */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
4115
4116 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
4117 __private_extern__ struct mbuf *
4118 m_mbigget(struct mbuf *m, int wait)
4119 {
4120 struct ext_ref *rfa;
4121
4122 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4123 return m;
4124 }
4125
4126 m->m_ext.ext_buf = m_bigalloc(wait);
4127 if (m->m_ext.ext_buf != NULL) {
4128 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4129 } else {
4130 mcache_free(ref_cache, rfa);
4131 }
4132 return m;
4133 }
4134
4135 __private_extern__ caddr_t
4136 m_16kalloc(int wait)
4137 {
4138 int mcflags = MSLEEPF(wait);
4139
4140 /* Is this due to a non-blocking retry? If so, then try harder */
4141 if (mcflags & MCR_NOSLEEP) {
4142 mcflags |= MCR_TRYHARD;
4143 }
4144
4145 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4146 }
4147
/* Free a 16KB mbuf cluster back to its cache; size and arg are unused */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
4153
4154 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
4155 __private_extern__ struct mbuf *
4156 m_m16kget(struct mbuf *m, int wait)
4157 {
4158 struct ext_ref *rfa;
4159
4160 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4161 return m;
4162 }
4163
4164 m->m_ext.ext_buf = m_16kalloc(wait);
4165 if (m->m_ext.ext_buf != NULL) {
4166 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4167 } else {
4168 mcache_free(ref_cache, rfa);
4169 }
4170 return m;
4171 }
4172
/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 *
 * Ownership of the packet header contents (including the tag chain)
 * passes to "to"; "from"'s classifier state, tags and scratch area are
 * purged afterwards.  "from"'s m_flags are not modified here.
 */
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
	VERIFY(from->m_flags & M_PKTHDR);

	/* Check for scratch area overflow */
	m_redzone_verify(from);

	if (to->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(to);
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to, NULL);
	}
	/* Struct copy must happen before the source is purged below */
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	m_classifier_init(from, 0);		/* purge classifier info */
	m_tag_init(from, 1);			/* purge all tags from src */
	m_scratch_init(from);			/* clear src scratch area */
	/* Carry over the copyable flags; keep dst's own M_EXT setting */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* No external storage: point data at the internal buffer */
		to->m_data = to->m_pktdat;
	}
	m_redzone_init(to);			/* setup red zone on dst */
}
4201
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 *
 * Unlike m_copy_pkthdr(), "from" is left intact.  Returns the result
 * of m_tag_copy_chain() on the tag chain duplication.
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	VERIFY(from->m_flags & M_PKTHDR);

	/* Check for scratch area overflow */
	m_redzone_verify(from);

	if (to->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(to);
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to, NULL);
	}
	/* Carry over the copyable flags; keep dst's own M_EXT setting */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* No external storage: point data at the internal buffer */
		to->m_data = to->m_pktdat;
	}
	to->m_pkthdr = from->m_pkthdr;
	/* clear TX completion flag so the callback is not called in the copy */
	to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
	m_redzone_init(to);			/* setup red zone on dst */
	m_tag_init(to, 0);			/* preserve dst static tags */
	return m_tag_copy_chain(to, from, how);
}
4232
4233 void
4234 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4235 {
4236 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4237 #if PF_ECN
4238 m_pftag(to)->pftag_hdr = NULL;
4239 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
4240 #endif /* PF_ECN */
4241 }
4242
/* Copy the NECP mtag area from one mbuf's packet header to another's */
void
m_copy_necptag(struct mbuf *to, struct mbuf *from)
{
	memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
}
4248
4249 void
4250 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4251 {
4252 VERIFY(m->m_flags & M_PKTHDR);
4253
4254 m->m_pkthdr.pkt_proto = 0;
4255 m->m_pkthdr.pkt_flowsrc = 0;
4256 m->m_pkthdr.pkt_flowid = 0;
4257 m->m_pkthdr.pkt_ext_flags = 0;
4258 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
4259 /* preserve service class and interface info for loopback packets */
4260 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
4261 (void) m_set_service_class(m, MBUF_SC_BE);
4262 }
4263 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4264 m->m_pkthdr.pkt_ifainfo = 0;
4265 }
4266 /*
4267 * Preserve timestamp if requested
4268 */
4269 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
4270 m->m_pkthdr.pkt_timestamp = 0;
4271 }
4272 }
4273
/*
 * Copy the classifier state (flow info, flags, service class, interface
 * info) from one mbuf's packet header to another's.
 */
void
m_copy_classifier(struct mbuf *to, struct mbuf *from)
{
	VERIFY(to->m_flags & M_PKTHDR);
	VERIFY(from->m_flags & M_PKTHDR);

	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
	/* Go through the setter rather than assigning pkt_svc directly */
	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
}
4288
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.  On return,
 * *num_needed holds the number of packets actually built.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
	struct mbuf *m;
	struct mbuf **np, *top;
	unsigned int pnum, needed = *num_needed;
	mcache_obj_t *mp_list = NULL;
	int mcflags = MSLEEPF(wait);
	u_int16_t flag;
	struct ext_ref *rfa;
	mcache_t *cp;
	void *cl;

	/* Only the three supported cluster sizes are accepted */
	ASSERT(bufsize == m_maxsize(MC_CL) ||
	    bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	/*
	 * Caller must first check for njcl because this
	 * routine is internal and not exposed/used via KPI.
	 */
	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);

	top = NULL;
	np = &top;
	pnum = 0;

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP)) {
		mcflags |= MCR_TRYHARD;
	}

	/* Allocate the composite mbuf + cluster elements from the cache */
	if (bufsize == m_maxsize(MC_CL)) {
		cp = m_cache(MC_MBUF_CL);
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		cp = m_cache(MC_MBUF_BIGCL);
	} else {
		cp = m_cache(MC_MBUF_16KCL);
	}
	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);

	for (pnum = 0; pnum < needed; pnum++) {
		m = (struct mbuf *)mp_list;
		mp_list = mp_list->obj_next;

		/* Composite objects come back pre-attached to a cluster */
		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m));

		flag = MEXT_FLAGS(m);

		/* While num_with_pkthdrs > 0, initialize as a packet header */
		MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
		if (bufsize == m_maxsize(MC_16KCL)) {
			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
		} else if (bufsize == m_maxsize(MC_BIGCL)) {
			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
		} else {
			MBUF_CL_INIT(m, cl, rfa, 1, flag);
		}

		if (num_with_pkthdrs > 0) {
			--num_with_pkthdrs;
		}

		/*
		 * Chain via m_nextpkt while packet headers remain; the
		 * rest is chained via m_next off the last packet header.
		 */
		*np = m;
		if (num_with_pkthdrs > 0) {
			np = &m->m_nextpkt;
		} else {
			np = &m->m_next;
		}
	}
	ASSERT(pnum != *num_needed || mp_list == NULL);
	if (mp_list != NULL) {
		mcache_free_ext(cp, mp_list);
	}

	/* Batch the mbuf-type statistics updates */
	if (pnum > 0) {
		mtype_stat_add(MT_DATA, pnum);
		mtype_stat_sub(MT_FREE, pnum);
	}

	/* All-or-nothing: release everything if we fell short */
	if (wantall && (pnum != *num_needed)) {
		if (top != NULL) {
			m_freem_list(top);
		}
		return NULL;
	}

	if (pnum > *num_needed) {
		printf("%s: File a radar related to <rdar://10146739>. \
		    needed = %u, pnum = %u, num_needed = %u \n",
		    __func__, needed, pnum, *num_needed);
	}

	*num_needed = pnum;
	return top;
}
4404
/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not null, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is zero or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of a mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
__private_extern__ struct mbuf *
m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
{
	struct mbuf **np, *top, *first = NULL;
	size_t bufsize, r_bufsize;
	unsigned int num = 0;
	unsigned int nsegs = 0;
	unsigned int needed, resid;
	int mcflags = MSLEEPF(wait);
	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
	mcache_t *cp = NULL, *rcp = NULL;

	if (*numlist == 0) {
		return NULL;
	}

	top = NULL;
	np = &top;

	/* Pick a buffer size: the smallest that fits, or the caller's */
	if (wantsize == 0) {
		if (packetlen <= MINCLSIZE) {
			bufsize = packetlen;
		} else if (packetlen > m_maxsize(MC_CL)) {
			/* Use 4KB if jumbo cluster pool isn't available */
			if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
				bufsize = m_maxsize(MC_BIGCL);
			} else {
				bufsize = m_maxsize(MC_16KCL);
			}
		} else {
			bufsize = m_maxsize(MC_CL);
		}
	} else if (wantsize == m_maxsize(MC_CL) ||
	    wantsize == m_maxsize(MC_BIGCL) ||
	    (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
		bufsize = wantsize;
	} else {
		/* Unsupported wantsize */
		*numlist = 0;
		return NULL;
	}

	/* Work out how many segments each chain will need */
	if (bufsize <= MHLEN) {
		nsegs = 1;
	} else if (bufsize <= MINCLSIZE) {
		if (maxsegments != NULL && *maxsegments == 1) {
			/* Caller allows one segment only; use a cluster */
			bufsize = m_maxsize(MC_CL);
			nsegs = 1;
		} else {
			/* Header mbuf plus a second mbuf for the residual */
			nsegs = 2;
		}
	} else if (bufsize == m_maxsize(MC_16KCL)) {
		VERIFY(njcl > 0);
		nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
	} else {
		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
	}
	if (maxsegments != NULL) {
		if (*maxsegments && nsegs > *maxsegments) {
			/* Report the required segment count and fail */
			*maxsegments = nsegs;
			*numlist = 0;
			return NULL;
		}
		*maxsegments = nsegs;
	}

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP)) {
		mcflags |= MCR_TRYHARD;
	}

	/*
	 * Simple case where all elements in the lists/chains are mbufs.
	 * Unless bufsize is greater than MHLEN, each segment chain is made
	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
	 * of 2 mbufs; the second one is used for the residual data, i.e.
	 * the remaining data that cannot fit into the first mbuf.
	 */
	if (bufsize <= MINCLSIZE) {
		/* Allocate the elements in one shot from the mbuf cache */
		ASSERT(bufsize <= MHLEN || nsegs == 2);
		cp = m_cache(MC_MBUF);
		needed = mcache_alloc_ext(cp, &mp_list,
		    (*numlist) * nsegs, mcflags);

		/*
		 * The number of elements must be even if we are to use an
		 * mbuf (instead of a cluster) to store the residual data.
		 * If we couldn't allocate the requested number of mbufs,
		 * trim the number down (if it's odd) in order to avoid
		 * creating a partial segment chain.
		 */
		if (bufsize > MHLEN && (needed & 0x1)) {
			needed--;
		}

		while (num < needed) {
			struct mbuf *m;

			m = (struct mbuf *)mp_list;
			mp_list = mp_list->obj_next;
			ASSERT(m != NULL);

			/* First (possibly only) mbuf is the packet header */
			MBUF_INIT(m, 1, MT_DATA);
			num++;
			if (bufsize > MHLEN) {
				/* A second mbuf for this segment chain */
				m->m_next = (struct mbuf *)mp_list;
				mp_list = mp_list->obj_next;
				ASSERT(m->m_next != NULL);

				MBUF_INIT(m->m_next, 0, MT_DATA);
				num++;
			}
			*np = m;
			np = &m->m_nextpkt;
		}
		ASSERT(num != *numlist || mp_list == NULL);

		/* Batch the mbuf-type statistics updates */
		if (num > 0) {
			mtype_stat_add(MT_DATA, num);
			mtype_stat_sub(MT_FREE, num);
		}
		/* Convert mbuf count to packet count */
		num /= nsegs;

		/* We've got them all; return to caller */
		if (num == *numlist) {
			return top;
		}

		goto fail;
	}

	/*
	 * Complex cases where elements are made up of one or more composite
	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
	 * be illustrated as follows:
	 *
	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
	 *
	 * Every composite mbuf + cluster element comes from the intermediate
	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
	 * the last composite element will come from the MC_MBUF_CL cache,
	 * unless the residual data is larger than 2KB where we use the
	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
	 * data is defined as extra data beyond the first element that cannot
	 * fit into the previous element, i.e. there is no residual data if
	 * the chain only has 1 segment.
	 */
	r_bufsize = bufsize;
	resid = packetlen > bufsize ? packetlen % bufsize : 0;
	if (resid > 0) {
		/* There is residual data; figure out the cluster size */
		if (wantsize == 0 && packetlen > MINCLSIZE) {
			/*
			 * Caller didn't request that all of the segments
			 * in the chain use the same cluster size; use the
			 * smaller of the cluster sizes.
			 */
			if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
				r_bufsize = m_maxsize(MC_16KCL);
			} else if (resid > m_maxsize(MC_CL)) {
				r_bufsize = m_maxsize(MC_BIGCL);
			} else {
				r_bufsize = m_maxsize(MC_CL);
			}
		} else {
			/* Use the same cluster size as the other segments */
			resid = 0;
		}
	}

	needed = *numlist;
	if (resid > 0) {
		/*
		 * Attempt to allocate composite mbuf + cluster elements for
		 * the residual data in each chain; record the number of such
		 * elements that can be allocated so that we know how many
		 * segment chains we can afford to create.
		 */
		if (r_bufsize <= m_maxsize(MC_CL)) {
			rcp = m_cache(MC_MBUF_CL);
		} else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
			rcp = m_cache(MC_MBUF_BIGCL);
		} else {
			rcp = m_cache(MC_MBUF_16KCL);
		}
		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);

		if (needed == 0) {
			goto fail;
		}

		/* This is temporarily reduced for calculation */
		ASSERT(nsegs > 1);
		nsegs--;
	}

	/*
	 * Attempt to allocate the rest of the composite mbuf + cluster
	 * elements for the number of segment chains that we need.
	 */
	if (bufsize <= m_maxsize(MC_CL)) {
		cp = m_cache(MC_MBUF_CL);
	} else if (bufsize <= m_maxsize(MC_BIGCL)) {
		cp = m_cache(MC_MBUF_BIGCL);
	} else {
		cp = m_cache(MC_MBUF_16KCL);
	}
	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);

	/* Round it down to avoid creating a partial segment chain */
	needed = (needed / nsegs) * nsegs;
	if (needed == 0) {
		goto fail;
	}

	if (resid > 0) {
		/*
		 * We're about to construct the chain(s); take into account
		 * the number of segments we have created above to hold the
		 * residual data for each chain, as well as restore the
		 * original count of segments per chain.
		 */
		ASSERT(nsegs > 0);
		needed += needed / nsegs;
		nsegs++;
	}

	for (;;) {
		struct mbuf *m;
		u_int16_t flag;
		struct ext_ref *rfa;
		void *cl;
		int pkthdr;
		m_ext_free_func_t m_free_func;

		++num;
		/* The last segment of a chain draws from the residual list */
		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
			m = (struct mbuf *)mp_list;
			mp_list = mp_list->obj_next;
		} else {
			m = (struct mbuf *)rmp_list;
			rmp_list = rmp_list->obj_next;
		}
		m_free_func = m_get_ext_free(m);
		ASSERT(m != NULL);
		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
		    m_free_func == m_16kfree);

		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m));

		flag = MEXT_FLAGS(m);

		/* The first segment of each chain becomes a packet header */
		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
		if (pkthdr) {
			first = m;
		}
		MBUF_INIT(m, pkthdr, MT_DATA);
		/* The free routine identifies which cluster size is attached */
		if (m_free_func == m_16kfree) {
			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
		} else if (m_free_func == m_bigfree) {
			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
		} else {
			MBUF_CL_INIT(m, cl, rfa, 1, flag);
		}

		*np = m;
		if ((num % nsegs) == 0) {
			/* Chain complete; next mbuf starts a new packet */
			np = &first->m_nextpkt;
		} else {
			np = &m->m_next;
		}

		if (num == needed) {
			break;
		}
	}

	/* Batch the mbuf-type statistics updates */
	if (num > 0) {
		mtype_stat_add(MT_DATA, num);
		mtype_stat_sub(MT_FREE, num);
	}

	/* Convert segment count to packet count */
	num /= nsegs;

	/* We've got them all; return to caller */
	if (num == *numlist) {
		ASSERT(mp_list == NULL && rmp_list == NULL);
		return top;
	}

fail:
	/* Free up what's left of the above */
	if (mp_list != NULL) {
		mcache_free_ext(cp, mp_list);
	}
	if (rmp_list != NULL) {
		mcache_free_ext(rcp, rmp_list);
	}
	if (wantall && top != NULL) {
		/* All-or-nothing: release the partial result */
		m_freem_list(top);
		*numlist = 0;
		return NULL;
	}
	*numlist = num;
	return top;
}
4737
4738 /*
4739 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4740 * packets on receive ring.
4741 */
4742 __private_extern__ struct mbuf *
4743 m_getpacket_how(int wait)
4744 {
4745 unsigned int num_needed = 1;
4746
4747 return m_getpackets_internal(&num_needed, 1, wait, 1,
4748 m_maxsize(MC_CL));
4749 }
4750
4751 /*
4752 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4753 * packets on receive ring.
4754 */
4755 struct mbuf *
4756 m_getpacket(void)
4757 {
4758 unsigned int num_needed = 1;
4759
4760 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4761 m_maxsize(MC_CL));
4762 }
4763
4764 /*
4765 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4766 * if this can't be met, return whatever number were available. Set up the
4767 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4768 * are chained on the m_nextpkt field. Any packets requested beyond this are
4769 * chained onto the last packet header's m_next field.
4770 */
4771 struct mbuf *
4772 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4773 {
4774 unsigned int n = num_needed;
4775
4776 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4777 m_maxsize(MC_CL));
4778 }
4779
4780 /*
4781 * Return a list of mbuf hdrs set up as packet hdrs chained together
4782 * on the m_nextpkt field
4783 */
4784 struct mbuf *
4785 m_getpackethdrs(int num_needed, int how)
4786 {
4787 struct mbuf *m;
4788 struct mbuf **np, *top;
4789
4790 top = NULL;
4791 np = ⊤
4792
4793 while (num_needed--) {
4794 m = _M_RETRYHDR(how, MT_DATA);
4795 if (m == NULL) {
4796 break;
4797 }
4798
4799 *np = m;
4800 np = &m->m_nextpkt;
4801 }
4802
4803 return top;
4804 }
4805
/*
 * Free a list of mbuf chains: each packet (linked via m_nextpkt) is
 * freed by walking its m_next chain.  Returns the number of packets
 * freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
	struct mbuf *nextpkt;
	/* Per-cache batches, each released in a single call at the end */
	mcache_obj_t *mp_list = NULL;
	mcache_obj_t *mcl_list = NULL;
	mcache_obj_t *mbc_list = NULL;
	mcache_obj_t *m16k_list = NULL;
	mcache_obj_t *m_mcl_list = NULL;
	mcache_obj_t *m_mbc_list = NULL;
	mcache_obj_t *m_m16k_list = NULL;
	mcache_obj_t *ref_list = NULL;
	int pktcount = 0;
	/* Deferred mbuf-type statistics, applied once at the end */
	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

	while (m != NULL) {
		pktcount++;

		nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/* Inner loop walks one packet's m_next chain */
		while (m != NULL) {
			struct mbuf *next = m->m_next;
			mcache_obj_t *o, *rfa;

			if (m->m_type == MT_FREE) {
				panic("m_free: freeing an already freed mbuf");
			}

			if (m->m_flags & M_PKTHDR) {
				/* Check for scratch area overflow */
				m_redzone_verify(m);
				/* Free the aux data and tags if there is any */
				m_tag_delete_chain(m, NULL);
				m_do_tx_compl_callback(m, NULL);
			}

			if (!(m->m_flags & M_EXT)) {
				/* Plain mbuf, no external storage */
				mt_free++;
				goto simple_free;
			}

			/* Paired mbufs may be fully handled by m_free_paired() */
			if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
				m = next;
				continue;
			}

			mt_free++;

			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
			/*
			 * Make sure that we don't touch any ext_ref
			 * member after we decrement the reference count
			 * since that may lead to use-after-free
			 * when we do not hold the last reference.
			 */
			const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
			const m_ext_free_func_t m_free_func = m_get_ext_free(m);
			const uint16_t minref = MEXT_MINREF(m);
			const uint16_t refcnt = m_decref(m);

			if (refcnt == minref && !composite) {
				/* Last reference to a non-composite cluster */
				if (m_free_func == NULL) {
					o->obj_next = mcl_list;
					mcl_list = o;
				} else if (m_free_func == m_bigfree) {
					o->obj_next = mbc_list;
					mbc_list = o;
				} else if (m_free_func == m_16kfree) {
					o->obj_next = m16k_list;
					m16k_list = o;
				} else {
					/* Custom free routine runs immediately */
					(*(m_free_func))((caddr_t)o,
					    m->m_ext.ext_size,
					    m_get_ext_arg(m));
				}
				/* Batch the reference structure as well */
				rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
				rfa->obj_next = ref_list;
				ref_list = rfa;
				m_set_ext(m, NULL, NULL, NULL);
			} else if (refcnt == minref && composite) {
				VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
				VERIFY(m->m_type != MT_FREE);
				/*
				 * Amortize the costs of atomic operations
				 * by doing them at the end, if possible.
				 */
				if (m->m_type == MT_DATA) {
					mt_data++;
				} else if (m->m_type == MT_HEADER) {
					mt_header++;
				} else if (m->m_type == MT_SONAME) {
					mt_soname++;
				} else if (m->m_type == MT_TAG) {
					mt_tag++;
				} else {
					mtype_stat_dec(m->m_type);
				}

				m->m_type = MT_FREE;
				m->m_flags = M_EXT;
				m->m_len = 0;
				m->m_next = m->m_nextpkt = NULL;

				/*
				 * MEXT_FLAGS is safe to access here
				 * since we are now sure that we held
				 * the last reference to ext_ref.
				 */
				MEXT_FLAGS(m) &= ~EXTF_READONLY;

				/* "Free" into the intermediate cache */
				o = (mcache_obj_t *)m;
				if (m_free_func == NULL) {
					o->obj_next = m_mcl_list;
					m_mcl_list = o;
				} else if (m_free_func == m_bigfree) {
					o->obj_next = m_mbc_list;
					m_mbc_list = o;
				} else {
					VERIFY(m_free_func == m_16kfree);
					o->obj_next = m_m16k_list;
					m_m16k_list = o;
				}
				m = next;
				continue;
			}
simple_free:
			/*
			 * Amortize the costs of atomic operations
			 * by doing them at the end, if possible.
			 */
			if (m->m_type == MT_DATA) {
				mt_data++;
			} else if (m->m_type == MT_HEADER) {
				mt_header++;
			} else if (m->m_type == MT_SONAME) {
				mt_soname++;
			} else if (m->m_type == MT_TAG) {
				mt_tag++;
			} else if (m->m_type != MT_FREE) {
				mtype_stat_dec(m->m_type);
			}

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Batch the bare mbuf for the MC_MBUF cache */
			((mcache_obj_t *)m)->obj_next = mp_list;
			mp_list = (mcache_obj_t *)m;

			m = next;
		}

		m = nextpkt;
	}

	/* Apply the deferred statistics updates */
	if (mt_free > 0) {
		mtype_stat_add(MT_FREE, mt_free);
	}
	if (mt_data > 0) {
		mtype_stat_sub(MT_DATA, mt_data);
	}
	if (mt_header > 0) {
		mtype_stat_sub(MT_HEADER, mt_header);
	}
	if (mt_soname > 0) {
		mtype_stat_sub(MT_SONAME, mt_soname);
	}
	if (mt_tag > 0) {
		mtype_stat_sub(MT_TAG, mt_tag);
	}

	/* Release each batch to its owning cache in one shot */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (mcl_list != NULL) {
		mcache_free_ext(m_cache(MC_CL), mcl_list);
	}
	if (mbc_list != NULL) {
		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	}
	if (m16k_list != NULL) {
		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	}
	if (m_mcl_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	}
	if (m_mbc_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	}
	if (m_m16k_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return pktcount;
}
5010
5011 void
5012 m_freem(struct mbuf *m)
5013 {
5014 while (m != NULL) {
5015 m = m_free(m);
5016 }
5017 }
5018
5019 /*
5020 * Mbuffer utility routines.
5021 */
5022 /*
5023 * Set the m_data pointer of a newly allocated mbuf to place an object of the
5024 * specified size at the end of the mbuf, longword aligned.
5025 *
5026 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
5027 * separate macros, each asserting that it was called at the proper moment.
5028 * This required callers to themselves test the storage type and call the
5029 * right one. Rather than require callers to be aware of those layout
5030 * decisions, we centralize here.
5031 */
5032 void
5033 m_align(struct mbuf *m, int len)
5034 {
5035 int adjust = 0;
5036
5037 /* At this point data must point to start */
5038 VERIFY(m->m_data == M_START(m));
5039 VERIFY(len >= 0);
5040 VERIFY(len <= M_SIZE(m));
5041 adjust = M_SIZE(m) - len;
5042 m->m_data += adjust & ~(sizeof(long) - 1);
5043 }
5044
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 *
 * On allocation failure the entire chain is freed and NULL is returned.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		/* Could not get a new mbuf; release the whole chain */
		m_freem(m);
		return NULL;
	}
	if (m->m_flags & M_PKTHDR) {
		/* Move the packet header to the new front mbuf */
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	/* Position m_data so the len bytes sit at the end of the buffer */
	if (m->m_flags & M_PKTHDR) {
		VERIFY(len <= MHLEN);
		MH_ALIGN(m, len);
	} else {
		VERIFY(len <= MLEN);
		M_ALIGN(m, len);
	}
	m->m_len = len;
	return m;
}
5075
5076 /*
5077 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
5078 * chain, copy junk along, and adjust length.
5079 */
5080 struct mbuf *
5081 m_prepend_2(struct mbuf *m, int len, int how, int align)
5082 {
5083 if (M_LEADINGSPACE(m) >= len &&
5084 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
5085 m->m_data -= len;
5086 m->m_len += len;
5087 } else {
5088 m = m_prepend(m, len, how);
5089 }
5090 if ((m) && (m->m_flags & M_PKTHDR)) {
5091 m->m_pkthdr.len += len;
5092 }
5093 return m;
5094 }
5095
5096 /*
5097 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
5098 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
5099 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
5100 */
int MCFail;	/* NOTE(review): presumably a legacy m_copym failure counter; not updated in the code visible here — confirm its use elsewhere */
5102
struct mbuf *
m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
{
	struct mbuf *n, *mhdr = NULL, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	if (off < 0 || len < 0) {
		panic("m_copym: invalid offset %d or len %d", off, len);
	}

	/* The MUST_* header modes only make sense on a packet-header mbuf */
	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));

	/*
	 * Remember the packet-header mbuf now, since the offset scan
	 * below may advance "m" past it.
	 */
	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
		mhdr = m;
		copyhdr = 1;
	}

	/* Advance to the mbuf containing the starting offset */
	while (off >= m->m_len) {
		if (m->m_next == NULL) {
			panic("m_copym: invalid mbuf chain");
		}
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;

	while (len > 0) {
		if (m == NULL) {
			/* Running off the end is only legal for M_COPYALL */
			if (len != M_COPYALL) {
				panic("m_copym: len != M_COPYALL");
			}
			break;
		}

		if (copyhdr) {
			n = _M_RETRYHDR(wait, m->m_type);
		} else {
			n = _M_RETRY(wait, m->m_type);
		}
		*np = n;

		if (n == NULL) {
			goto nospace;
		}

		if (copyhdr != 0) {
			/* Move or duplicate the packet header as requested */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, mhdr);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, mhdr, wait) == 0) {
					goto nospace;
				}
			}
			if (len == M_COPYALL) {
				n->m_pkthdr.len -= off0;
			} else {
				n->m_pkthdr.len = len;
			}
			copyhdr = 0;
			/*
			 * There is data to copy from the packet header mbuf
			 * if it is empty or it is before the starting offset
			 */
			if (mhdr != m) {
				np = &n->m_next;
				continue;
			}
		}
		n->m_len = MIN(len, (m->m_len - off));
		if (m->m_flags & M_EXT) {
			/* Share the cluster instead of copying the bytes */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/*
			 * Limit to the capacity of the destination
			 */
			if (n->m_flags & M_PKTHDR) {
				n->m_len = MIN(n->m_len, MHLEN);
			} else {
				n->m_len = MIN(n->m_len, MLEN);
			}

			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL) {
			len -= n->m_len;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	if (top == NULL) {
		MCFail++;
	}

	return top;
nospace:
	/* Allocation failed part-way: free whatever was built so far */
	m_freem(top);
	MCFail++;
	return NULL;
}
5221
5222
/*
 * Classic m_copym(): see m_copym_mode().  The packet header, if present
 * and off0 == 0, is moved to the copy (M_COPYM_MOVE_HDR).
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	return m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR);
}
5228
5229 /*
5230 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5231 * within this routine also, the last mbuf and offset accessed are passed
5232 * out and can be passed back in to avoid having to rescan the entire mbuf
5233 * list (normally hung off of the socket)
5234 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
	int mcflags = MSLEEPF(wait);
	int copyhdr = 0;
	int type = 0;
	mcache_obj_t *list = NULL;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/*
	 * Resume from the caller-cached position when available,
	 * otherwise scan from the head to the starting offset.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		m = *m_lastm;
		off = *m_off;
	} else {
		while (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		}
	}

	/* First pass: count how many mbufs the copy will need */
	n = m;
	while (len > 0) {
		needed++;
		ASSERT(n != NULL);
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;	/* one extra for the leading mbuf built first below */
	len = len0;

	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	/* Batch-allocate every mbuf up front; all-or-nothing */
	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}

	needed = 0;
	while (len > 0) {
		/* Pop the next raw mbuf off the batch list */
		n = (struct mbuf *)list;
		list = list->obj_next;
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);

		if (top == NULL) {
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			/* Move or duplicate the packet header as requested */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the cluster rather than copying the data */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Done: cache the resume position for the next call */
			if (m_lastm != NULL && m_off != NULL) {
				if ((off + n->m_len) == m->m_len) {
					*m_lastm = m->m_next;
					*m_off = 0;
				} else {
					*m_lastm = m;
					*m_off = off + n->m_len;
				}
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(list == NULL);
	return top;

nospace:
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
	if (top != NULL) {
		m_freem(top);
	}
	MCFail++;
	return NULL;
}
5369
5370 /*
5371 * Copy data from an mbuf chain starting "off" bytes from the beginning,
5372 * continuing for "len" bytes, into the indicated buffer.
5373 */
5374 void
5375 m_copydata(struct mbuf *m, int off, int len, void *vp)
5376 {
5377 int off0 = off, len0 = len;
5378 struct mbuf *m0 = m;
5379 unsigned count;
5380 char *cp = vp;
5381
5382 if (__improbable(off < 0 || len < 0)) {
5383 panic("%s: invalid offset %d or len %d", __func__, off, len);
5384 /* NOTREACHED */
5385 }
5386
5387 while (off > 0) {
5388 if (__improbable(m == NULL)) {
5389 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5390 __func__, m0, off0, len0);
5391 /* NOTREACHED */
5392 }
5393 if (off < m->m_len) {
5394 break;
5395 }
5396 off -= m->m_len;
5397 m = m->m_next;
5398 }
5399 while (len > 0) {
5400 if (__improbable(m == NULL)) {
5401 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5402 __func__, m0, off0, len0);
5403 /* NOTREACHED */
5404 }
5405 count = MIN(m->m_len - off, len);
5406 bcopy(MTOD(m, caddr_t) + off, cp, count);
5407 len -= count;
5408 cp += count;
5409 off = 0;
5410 m = m->m_next;
5411 }
5412 }
5413
5414 /*
5415 * Concatenate mbuf chain n to m. Both chains must be of the same type
5416 * (e.g. MT_DATA). Any m_pkthdr is not updated.
5417 */
5418 void
5419 m_cat(struct mbuf *m, struct mbuf *n)
5420 {
5421 while (m->m_next) {
5422 m = m->m_next;
5423 }
5424 while (n) {
5425 if ((m->m_flags & M_EXT) ||
5426 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5427 /* just join the two chains */
5428 m->m_next = n;
5429 return;
5430 }
5431 /* splat the data from one into the other */
5432 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5433 (u_int)n->m_len);
5434 m->m_len += n->m_len;
5435 n = m_free(n);
5436 }
5437 }
5438
/*
 * Trim |req_len| bytes of data from the chain: from the head when
 * req_len >= 0, from the tail when req_len < 0.  A packet header's
 * length field is kept in sync with the trimmed amount.
 */
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL) {
		return;
	}
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				/* this mbuf is consumed entirely */
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			/* len now holds any residue we could not trim */
			m->m_pkthdr.len -= (req_len - len);
		}
	} else {
		/*
		 * Trim from tail. Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return. Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0) {
				break;
			}
			m = m->m_next;
		}
		if (m->m_len >= len) {
			/* fast path: the last mbuf absorbs the whole trim */
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len -= len;
			}
			return;
		}
		count -= len;
		if (count < 0) {
			count = 0;
		}
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len = count;
		}
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		/* zero out the lengths of any trailing mbufs */
		while ((m = m->m_next)) {
			m->m_len = 0;
		}
	}
}
5518
5519 /*
 * Rearrange an mbuf chain so that len bytes are contiguous
5521 * and in the data area of an mbuf (so that mtod and dtom
5522 * will work for a structure of size len). Returns the resulting
5523 * mbuf chain on success, frees it and returns null on failure.
5524 * If there is room, it will add up to max_protohdr-len extra bytes to the
5525 * contiguous region in an attempt to avoid being called next time.
5526 */
5527 int MPFail;
5528
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/* check invalid arguments */
	if (n == NULL) {
		panic("%s: n == NULL", __func__);
	}
	if (len < 0) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
		    __func__, len);
		goto bad;
	}
	if (len > MLEN) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
		    __func__, len);
		goto bad;
	}
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data >= &n->m_dat[MLEN]) {
		os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
		    __func__);
		goto bad;
	}

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
		if (n->m_len >= len) {
			/* already contiguous; nothing to do */
			return n;
		}
		m = n;
		n = n->m_next;
		len -= m->m_len;	/* bytes still to be pulled in */
	} else {
		if (len > MHLEN) {
			goto bad;
		}
		_MGET(m, M_DONTWAIT, n->m_type);
		if (m == 0) {
			goto bad;
		}
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			/* migrate the packet header onto the new head */
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	/* room remaining in m's internal data area */
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		/*
		 * Pull at least len bytes (opportunistically up to
		 * max_protohdr), limited by available space and source.
		 */
		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len != 0) {
			n->m_data += count;
		} else {
			/* source mbuf fully drained; free it and advance */
			n = m_free(n);
		}
	} while (len > 0 && n != NULL);
	if (len > 0) {
		/* could not gather len contiguous bytes */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	MPFail++;
	return 0;
}
5610
5611 /*
5612 * Like m_pullup(), except a new mbuf is always allocated, and we allow
5613 * the amount of empty space before the data in the new mbuf to be specified
5614 * (in the event that the caller expects to prepend later).
5615 */
5616 __private_extern__ int MSFail = 0;
5617
__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	VERIFY(len >= 0 && dstoff >= 0);

	/* len plus the requested leading gap must fit in one mbuf */
	if (len > (MHLEN - dstoff)) {
		goto bad;
	}
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL) {
		goto bad;
	}
	m->m_len = 0;
	if (n->m_flags & M_PKTHDR) {
		/* migrate the packet header onto the new head */
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	/* leave dstoff bytes of empty space for a later prepend */
	m->m_data += dstoff;
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		/* pull at least len (up to max_protohdr) bytes per pass */
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len) {
			n->m_data += count;
		} else {
			/* source mbuf drained; free it and advance */
			n = m_free(n);
		}
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	MSFail++;
	return NULL;
}
5665
5666 /*
5667 * Partition an mbuf chain in two pieces, returning the tail --
5668 * all but the first len0 bytes. In case of failure, it returns NULL and
5669 * attempts to restore the chain to its original state.
5670 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	/* copyhdr = 1: the tail gets its own pkthdr when m0 has one */
	return m_split0(m0, len0, wait, 1);
}
5676
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		/* len0 lies beyond the end of the chain */
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			/* recurse to split m itself; n carries the header */
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* split falls exactly on an mbuf boundary: just unlink */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* both halves share the cluster at different offsets */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
5780
5781 /*
5782 * Routine to copy from device local memory into mbufs.
5783 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		/*
		 * If 'off' is non-zero, packet is trailer-encapsulated,
		 * so we have to skip the type and length fields.
		 */
		cp += off + 2 * sizeof(u_int16_t);
		totlen -= 2 * sizeof(u_int16_t);
	}
	/* The first mbuf carries the packet header */
	_MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		return NULL;
	}
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top != NULL) {
			/* subsequent mbufs are plain data mbufs */
			_MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return NULL;
			}
			m->m_len = MLEN;
		}
		len = MIN(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			/* large chunk: back the mbuf with a cluster */
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT) {
				m->m_len = len = MIN(len, m_maxsize(MC_CL));
			} else {
				/* give up when it's out of cluster mbufs */
				if (top != NULL) {
					m_freem(top);
				}
				m_freem(m);
				return NULL;
			}
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL &&
				    len + max_linkhdr <= m->m_len) {
					m->m_data += max_linkhdr;
				}
				m->m_len = len;
			} else {
				len = m->m_len;
			}
		}
		if (copy) {
			/* use the device-supplied copy routine if given */
			copy(cp, MTOD(m, caddr_t), (unsigned)len);
		} else {
			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
		}
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt) {
			/* wrap around the device buffer */
			cp = buf;
		}
	}
	return top;
}
5863
5864 #ifndef MBUF_GROWTH_NORMAL_THRESH
5865 #define MBUF_GROWTH_NORMAL_THRESH 25
5866 #endif
5867
5868 /*
5869 * Cluster freelist allocation check.
5870 */
/*
 * Decide how many more clusters of size "bufsize" (big or 16K) should be
 * added to the pool to satisfy a request for "num", based on current
 * totals, free counts, and pool/kmem utilization.  Returns the count to
 * allocate (possibly 0).  Called with mbuf_mlock held.
 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/* pool utilization and kmem-map utilization, in percent */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	return i;
}
5992 /*
5993 * Return the number of bytes in the mbuf chain, m.
5994 */
5995 unsigned int
5996 m_length(struct mbuf *m)
5997 {
5998 struct mbuf *m0;
5999 unsigned int pktlen;
6000
6001 if (m->m_flags & M_PKTHDR) {
6002 return m->m_pkthdr.len;
6003 }
6004
6005 pktlen = 0;
6006 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
6007 pktlen += m0->m_len;
6008 }
6009 return pktlen;
6010 }
6011
6012 /*
6013 * Copy data from a buffer back into the indicated mbuf chain,
6014 * starting "off" bytes from the beginning, extending the mbuf
6015 * chain if necessary.
6016 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL) {
		return;
	}

	/* copy in, extending the chain if it is too short */
#if DEBUG
	error =
#endif /* DEBUG */
	m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	/* extension must never replace the head mbuf */
	if (error != 0 || (m0 != NULL && origm != m0)) {
		panic("m_copyback");
	}
#endif /* DEBUG */
}
6041
6042 struct mbuf *
6043 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
6044 {
6045 int error;
6046
6047 /* don't support chain expansion */
6048 VERIFY(off + len <= m_length(m0));
6049
6050 error = m_copyback0(&m0, off, len, cp,
6051 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
6052 if (error) {
6053 /*
6054 * no way to recover from partial success.
6055 * just free the chain.
6056 */
6057 m_freem(m0);
6058 return NULL;
6059 }
6060 return m0;
6061 }
6062
6063 /*
6064 * m_makewritable: ensure the specified range writable.
6065 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

#if 0 /* M_COPYALL is large enough */
	if (len == M_COPYALL) {
		len = m_length(*mp) - off; /* XXX */
	}
#endif

	/* copy-on-write the range, preserving existing data (no copy-in) */
	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	/* the rewrite must not change the total chain length */
	reslen = 0;
	for (n = *mp; n; n = n->m_next) {
		reslen += n->m_len;
	}
	if (origlen != reslen) {
		panic("m_makewritable: length changed");
	}
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
		panic("m_makewritable: inconsist");
	}
#endif /* DEBUG */

	return error;
}
6101
/*
 * Workhorse behind m_copyback(), m_copyback_cow() and m_makewritable().
 * Walks to offset "off", optionally extending the chain
 * (M_COPYBACK0_EXTEND) or copy-on-writing shared mbufs (M_COPYBACK0_COW),
 * then either copies "len" bytes in from vp (M_COPYBACK0_COPYBACK) or
 * preserves the existing bytes (M_COPYBACK0_PRESERVE).
 * Returns 0 on success or ENOBUFS on allocation failure.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	mp = mp0;
	m = *mp;
	/* walk to the mbuf containing the starting offset */
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* zero-fill the gap up to the offset */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW)) {
				panic("m_copyback0: read-only");
			}
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf.  copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* consume read-only mbufs covered by the new one */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		/* we extended the chain; sync the packet header length */
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
6309
6310 uint64_t
6311 mcl_to_paddr(char *addr)
6312 {
6313 vm_offset_t base_phys;
6314
6315 if (!MBUF_IN_MAP(addr)) {
6316 return 0;
6317 }
6318 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
6319
6320 if (base_phys == 0) {
6321 return 0;
6322 }
6323 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
6324 }
6325
6326 /*
6327 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
6328 * And really copy the thing. That way, we don't "precompute" checksums
6329 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
6330 * small packets, don't dup into a cluster. That way received packets
6331 * don't take up too much room in the sockbuf (cf. sbspace()).
6332 */
6333 int MDFail;
6334
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR) {
		copyhdr = 1;
	}

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
					return NULL;
				}
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return n;
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL) {
				return NULL;
			}
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return n;
		}
	}
	/* General case: deep-copy the chain mbuf by mbuf */
	while (m != NULL) {
#if BLUE_DEBUG
		printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr) {
			n = _M_GETHDR(how, m->m_type);
		} else {
			n = _M_GET(how, m->m_type);
		}
		if (n == NULL) {
			goto nospace;
		}
		if (m->m_flags & M_EXT) {
			/* match the smallest cluster size that fits m_len */
			if (m->m_len <= m_maxsize(MC_CL)) {
				MCLGET(n, how);
			} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
				n = m_mbigget(n, how);
			} else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
				n = m_m16kget(n, how);
			}
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		} else {
			VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
			    (copyhdr == 0 && m->m_len <= MLEN));
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT)) {
				n->m_data = n->m_pktdat;
			}
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	if (top == NULL) {
		MDFail++;
	}
	return top;

nospace:
	m_freem(top);
	MDFail++;
	return NULL;
}
6436
/*
 * True when an mbuf's external data spans more than one physical page:
 * either it starts on a page boundary and is longer than a page, or it
 * starts mid-page and extends past the next page boundary.
 */
#define MBUF_MULTIPAGES(m)                                              \
	(((m)->m_flags & M_EXT) &&                                      \
	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)                          \
	&& (m)->m_len > PAGE_SIZE) ||                                   \
	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                       \
	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6443
/*
 * Split a single mbuf whose external data spans multiple pages into a
 * chain of mbufs whose data each lie within a single page.  The original
 * mbuf becomes the first segment; additional segments are fresh mbufs
 * that share the same external buffer (reference taken via m_incref()).
 * On allocation failure the partial chain is freed and NULL is returned.
 * On return, *last points to the final mbuf of the chain (or NULL).
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			/* Page-aligned start: this segment gets one full page. */
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			/* Unaligned start: take bytes up to the next page boundary. */
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			/* Remainder fits entirely within the current page. */
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		/* Share the original external buffer with the new segment. */
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
6500
6501 struct mbuf *
6502 m_normalize(struct mbuf *m)
6503 {
6504 struct mbuf *top = NULL;
6505 struct mbuf **nm = ⊤
6506 boolean_t expanded = FALSE;
6507
6508 while (m != NULL) {
6509 struct mbuf *n;
6510
6511 n = m->m_next;
6512 m->m_next = NULL;
6513
6514 /* Does the data cross one or more page boundaries? */
6515 if (MBUF_MULTIPAGES(m)) {
6516 struct mbuf *last;
6517 if ((m = m_expand(m, &last)) == NULL) {
6518 m_freem(n);
6519 m_freem(top);
6520 top = NULL;
6521 break;
6522 }
6523 *nm = m;
6524 nm = &last->m_next;
6525 expanded = TRUE;
6526 } else {
6527 *nm = m;
6528 nm = &m->m_next;
6529 }
6530 m = n;
6531 }
6532 if (expanded) {
6533 atomic_add_32(&mb_normalized, 1);
6534 }
6535 return top;
6536 }
6537
6538 /*
6539 * Append the specified data to the indicated mbuf chain,
6540 * Extend the mbuf chain if the new data does not fit in
6541 * existing space.
6542 *
6543 * Return 1 if able to complete the job; otherwise 0.
6544 */
6545 int
6546 m_append(struct mbuf *m0, int len, caddr_t cp)
6547 {
6548 struct mbuf *m, *n;
6549 int remainder, space;
6550
6551 for (m = m0; m->m_next != NULL; m = m->m_next) {
6552 ;
6553 }
6554 remainder = len;
6555 space = M_TRAILINGSPACE(m);
6556 if (space > 0) {
6557 /*
6558 * Copy into available space.
6559 */
6560 if (space > remainder) {
6561 space = remainder;
6562 }
6563 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6564 m->m_len += space;
6565 cp += space;
6566 remainder -= space;
6567 }
6568 while (remainder > 0) {
6569 /*
6570 * Allocate a new mbuf; could check space
6571 * and allocate a cluster instead.
6572 */
6573 n = m_get(M_WAITOK, m->m_type);
6574 if (n == NULL) {
6575 break;
6576 }
6577 n->m_len = min(MLEN, remainder);
6578 bcopy(cp, mtod(n, caddr_t), n->m_len);
6579 cp += n->m_len;
6580 remainder -= n->m_len;
6581 m->m_next = n;
6582 m = n;
6583 }
6584 if (m0->m_flags & M_PKTHDR) {
6585 m0->m_pkthdr.len += len - remainder;
6586 }
6587 return remainder == 0;
6588 }
6589
6590 struct mbuf *
6591 m_last(struct mbuf *m)
6592 {
6593 while (m->m_next != NULL) {
6594 m = m->m_next;
6595 }
6596 return m;
6597 }
6598
6599 unsigned int
6600 m_fixhdr(struct mbuf *m0)
6601 {
6602 u_int len;
6603
6604 VERIFY(m0->m_flags & M_PKTHDR);
6605
6606 len = m_length2(m0, NULL);
6607 m0->m_pkthdr.len = len;
6608 return len;
6609 }
6610
6611 unsigned int
6612 m_length2(struct mbuf *m0, struct mbuf **last)
6613 {
6614 struct mbuf *m;
6615 u_int len;
6616
6617 len = 0;
6618 for (m = m0; m != NULL; m = m->m_next) {
6619 len += m->m_len;
6620 if (m->m_next == NULL) {
6621 break;
6622 }
6623 }
6624 if (last != NULL) {
6625 *last = m;
6626 }
6627 return len;
6628 }
6629
6630 /*
6631 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6632 * and clusters. If allocation fails and this cannot be completed, NULL will
6633 * be returned, but the passed in chain will be unchanged. Upon success,
6634 * the original chain will be freed, and the new chain will be returned.
6635 *
6636 * If a non-packet header is passed in, the original mbuf (chain?) will
6637 * be returned unharmed.
6638 *
6639 * If offset is specfied, the first mbuf in the chain will have a leading
6640 * space of the amount stated by the "off" parameter.
6641 *
6642 * This routine requires that the m_pkthdr.header field of the original
6643 * mbuf chain is cleared by the caller.
6644 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	/* Non-packet-header chains are returned untouched. */
	if (!(m0->m_flags & M_PKTHDR)) {
		return m0;
	}

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	/* Bytes the first mbuf must cover, including the leading space. */
	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN) {
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	} else {
		m_final = m_gethdr(how, MT_DATA);
	}

	if (m_final == NULL) {
		goto nospace;
	}

	if (off > 0) {
		/* Reserve the requested leading space in the first mbuf. */
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0) {
		goto nospace;
	}

	m_new = m_final;

	/* Copy the payload into as few mbufs/clusters as possible. */
	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES) {
			length = MCLBYTES;
		}
		/* The first mbuf has "off" bytes less room available. */
		length -= ((m_new == m_final) ? off : 0);
		if (length < 0) {
			goto nospace;
		}

		if (m_new == NULL) {
			if (length > MLEN) {
				m_new = m_getcl(how, MT_DATA, 0);
			} else {
				m_new = m_get(how, MT_DATA);
			}
			if (m_new == NULL) {
				goto nospace;
			}
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final) {
			m_cat(m_final, m_new);
		}
		m_new = NULL;
	}
	/* Success: free the original chain and hand back the new one. */
	m_freem(m0);
	m0 = m_final;
	return m0;
nospace:
	/* Failure: the caller's original chain is left untouched. */
	if (m_final) {
		m_freem(m_final);
	}
	return NULL;
}
6725
/*
 * Defragment an mbuf chain with no leading space; see m_defrag_offset().
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
6731
6732 void
6733 m_mchtype(struct mbuf *m, int t)
6734 {
6735 mtype_stat_inc(t);
6736 mtype_stat_dec(m->m_type);
6737 (m)->m_type = t;
6738 }
6739
/*
 * Function form of MTOD(): return a pointer to the mbuf's data area.
 */
void *
m_mtod(struct mbuf *m)
{
	return MTOD(m, void *);
}
6745
/*
 * Map a data pointer back to its containing mbuf by rounding down to
 * MSIZE alignment.  Only meaningful when "x" points into an mbuf's
 * internal data area (not into an external cluster).
 */
struct mbuf *
m_dtom(void *x)
{
	return (struct mbuf *)((uintptr_t)(x) & ~(MSIZE - 1));
}
6751
/*
 * Exported wrapper around the _MCHECK() sanity check for an mbuf.
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
6757
6758 /*
6759 * Return a pointer to mbuf/offset of location in mbuf chain.
6760 */
6761 struct mbuf *
6762 m_getptr(struct mbuf *m, int loc, int *off)
6763 {
6764 while (loc >= 0) {
6765 /* Normal end of search. */
6766 if (m->m_len > loc) {
6767 *off = loc;
6768 return m;
6769 } else {
6770 loc -= m->m_len;
6771 if (m->m_next == NULL) {
6772 if (loc == 0) {
6773 /* Point at the end of valid data. */
6774 *off = m->m_len;
6775 return m;
6776 }
6777 return NULL;
6778 }
6779 m = m->m_next;
6780 }
6781 }
6782 return NULL;
6783 }
6784
6785 /*
6786 * Inform the corresponding mcache(s) that there's a waiter below.
6787 */
6788 static void
6789 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6790 {
6791 mcache_waiter_inc(m_cache(class));
6792 if (comp) {
6793 if (class == MC_CL) {
6794 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6795 } else if (class == MC_BIGCL) {
6796 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6797 } else if (class == MC_16KCL) {
6798 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6799 } else {
6800 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6801 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6802 }
6803 }
6804 }
6805
6806 /*
6807 * Inform the corresponding mcache(s) that there's no more waiter below.
6808 */
6809 static void
6810 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6811 {
6812 mcache_waiter_dec(m_cache(class));
6813 if (comp) {
6814 if (class == MC_CL) {
6815 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6816 } else if (class == MC_BIGCL) {
6817 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6818 } else if (class == MC_16KCL) {
6819 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6820 } else {
6821 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6822 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6823 }
6824 }
6825 }
6826
6827 static bool mbuf_watchdog_defunct_active = false;
6828
6829 static uint32_t
6830 mbuf_watchdog_socket_space(struct socket *so)
6831 {
6832 uint32_t space = 0;
6833
6834 if (so == NULL) {
6835 return 0;
6836 }
6837
6838 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
6839
6840 #if INET
6841 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6842 SOCK_PROTO(so) == IPPROTO_TCP) {
6843 space += tcp_reass_qlen_space(so);
6844 }
6845 #endif /* INET */
6846
6847 return space;
6848 }
6849
/*
 * Accumulator passed through proc_iterate() to find the process whose
 * sockets consume the most mbuf space.
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;           /* current worst offender (ref held) */
	uint32_t top_app_space_used;    /* its socket mbuf space, in bytes */
	bool non_blocking;              /* use trylock on each proc fd lock */
};
6855
/*
 * Attempt to take a process's file-descriptor table lock without
 * blocking; returns true if the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
6861
/*
 * proc_iterate() callback: sum the mbuf space used by every socket open
 * in process "p"; if it beats the current maximum, record the process in
 * the args (returning PROC_CLAIMED keeps a reference on it, and any
 * previously recorded process is released).
 */
static int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so = NULL;

		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* New maximum: swap the held reference to this process. */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
6912
6913 extern char *proc_name_address(void *p);
6914
/*
 * Thread call handler: find the process whose sockets use the most mbuf
 * space and defunct all of its sockets, as a last resort to relieve
 * mbuf exhaustion before the watchdog panics.  Clears
 * mbuf_watchdog_defunct_active when done.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets we cannot lock without blocking. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference claimed by the iterator. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
	}
	mbuf_watchdog_defunct_active = false;
}
6964
6965 /*
6966 * Called during slab (blocking and non-blocking) allocation. If there
6967 * is at least one waiter, and the time since the first waiter is blocked
6968 * is greater than the watchdog timeout, panic the system.
6969 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do unless someone is waiting and the watchdog is on. */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
7025
7026 /*
7027 * Called during blocking allocation. Returns TRUE if one or more objects
7028 * are available at the per-CPU caches layer and that allocation should be
7029 * retried at that level.
7030 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing? Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* Caller asked us to keep retrying rather than sleep. */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Sleeping is not an option for MCR_NOSLEEP callers. */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer. Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Ask the worker thread to grow this class's population. */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
7104
/*
 * Worker thread: services expansion requests posted by mbuf_sleep()
 * (via m_region_expand counters), reclaims memory when the cluster map
 * is close to exhaustion, and keeps the mbuf population at least as
 * large as the cluster population.  Loops forever, sleeping on
 * mbuf_worker_needs_wakeup between rounds.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;  /* NOTE(review): set but otherwise unused here */

	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Grow the 2 KB cluster pool if requested. */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Grow the 4 KB cluster pool if requested. */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Grow the 16 KB cluster pool if requested. */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated. Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
7237
/*
 * Entry point for the mbuf worker thread: mark the worker as ready
 * (so mbuf_sleep() will post wakeups) and enter the worker loop.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
7245
/*
 * Return the slab descriptor for the buffer "buf", allocating a new
 * slab group (NSLABSPMB descriptors) on first reference.  Called with
 * mbuf_mlock held; the lock is dropped and reacquired around the
 * permanent-zone allocation, which is safe because expansion is
 * serialized by mb_clalloc_busy.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the 1 MB region (slab group) containing the buffer. */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of the page (slab) within its group. */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
7309
/*
 * Initialize a slab descriptor with the given class, flags, buffer
 * range, freelist head and accounting, and start it out detached from
 * any slab freelist.
 */
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}
7323
/*
 * Attach a slab to its class's slab freelist, clearing SLF_DETACHED.
 */
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;

	/*
	 * If a buffer spans multiple contiguous pages then mark them as
	 * attached too (only the first slab goes on the list).
	 */
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL && slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
7346
7347 static void
7348 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
7349 {
7350 int k;
7351 VERIFY(!slab_is_detached(sp));
7352 VERIFY(m_slab_cnt(class) > 0);
7353 m_slab_cnt(class)--;
7354 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
7355 slab_detach(sp);
7356 if (class == MC_16KCL) {
7357 for (k = 1; k < NSLABSP16KB; k++) {
7358 sp = sp->sl_next;
7359 /* Next slab must already be present */
7360 VERIFY(sp != NULL);
7361 VERIFY(!slab_is_detached(sp));
7362 slab_detach(sp);
7363 }
7364 }
7365 }
7366
7367 static boolean_t
7368 slab_inrange(mcl_slab_t *sp, void *buf)
7369 {
7370 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
7371 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
7372 }
7373
7374 #undef panic
7375
/*
 * Walk every chunk of the slab looking for a freelist next pointer that
 * matches "addr", and validate it: panic directly when the pointer is
 * out of the mbuf map (non-verifying mode), or hand it to the audit
 * layer's next-pointer check (verifying mode).
 */
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
7404
/*
 * Mark a slab as detached: poison its list linkage with -1 sentinels
 * (so stray list use is caught) and set SLF_DETACHED.
 */
static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}
7412
/*
 * A slab is detached iff both list links hold the -1 poison value set
 * by slab_detach() and the SLF_DETACHED flag is set.
 */
static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return (intptr_t)sp->sl_link.tqe_next == -1 &&
	       (intptr_t)sp->sl_link.tqe_prev == -1 &&
	       (sp->sl_flags & SLF_DETACHED);
}
7420
/*
 * Set up audit structures for "num" objects in the page containing
 * "buf": consume one mcache_audit_t per object from *mca_list (and one
 * saved-contents buffer from *con_list when provided), record them in
 * mclaudit[], and advance both lists past the consumed elements.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Detach the consumed run from the caller's list. */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
7479
7480 static void
7481 mcl_audit_free(void *buf, unsigned int num)
7482 {
7483 unsigned int i, ix;
7484 mcache_audit_t *mca, *mca_list;
7485
7486 ix = MTOPG(buf);
7487 VERIFY(ix < maxclaudit);
7488
7489 if (mclaudit[ix].cl_audit[0] != NULL) {
7490 mca_list = mclaudit[ix].cl_audit[0];
7491 for (i = 0; i < num; i++) {
7492 mca = mclaudit[ix].cl_audit[i];
7493 mclaudit[ix].cl_audit[i] = NULL;
7494 if (mca->mca_contents) {
7495 mcache_free(mcl_audit_con_cache,
7496 mca->mca_contents);
7497 }
7498 }
7499 mcache_free_ext(mcache_audit_cache,
7500 (mcache_obj_t *)mca_list);
7501 }
7502 }
7503
7504 /*
7505 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
7506 * the corresponding audit structure for that buffer.
7507 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOPG(mobj), m_idx = 0;
	unsigned char *page_addr;

	VERIFY(ix < maxclaudit);
	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));

	/* Base address of the page containing the object. */
	page_addr = PGTOM(ix);

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page. Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		m_idx = MBPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NMBPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		m_idx = CLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_BIGCL:
		m_idx = BCLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NBCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element;
		 * a 16 KB cluster is covered by a single audit entry.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return mca;
}
7562
/*
 * Audit an mbuf at transition to/from the freelist.  On free (!alloc):
 * save the constructed mbuf fields and fill the mbuf with the free
 * pattern.  On alloc: verify the free pattern was not disturbed, then
 * restore the saved fields.
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Re-link the freelist pointer clobbered by the pattern. */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
7595
/*
 * Restore the mbuf fields saved by mcl_audit_save_mbuf().  For a
 * composite mbuf the whole saved header is copied back (preserving the
 * current m_next); for a plain mbuf only the type is restored.
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache. Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}
7624
/*
 * Save the constructed mbuf header into the audit contents area so it
 * can be restored by mcl_audit_restore_mbuf() at next allocation.
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
7632
/*
 * Audit a cluster buffer at transition to/from the freelist.  On free:
 * fill with the free pattern (and, when save_next, validate and re-link
 * the freelist next pointer).  On alloc: verify the pattern survived.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Re-link the pointer clobbered by the pattern. */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
7653
/*
 * Record a scratch-area access in the audit structure: rotate the
 * current thread/stack/timestamp into the "previous" (msa_p*) slots,
 * then capture the current thread, a fresh backtrace, and a millisecond
 * timestamp relative to mb_start.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Shift the current record into the "previous" slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Capture one extra frame, then drop frame 0 (this function) */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
7680
/*
 * Panic with full audit details when a supposedly-free mbuf does not
 * carry the MT_FREE type (indicating use-after-free or corruption).
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
7695
/*
 * Panic with full audit details when a freed buffer's freelist linkage
 * points outside the mbuf map (see mcl_audit_verify_nextptr()).
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
7706
/*
 * Validate a freed buffer's obj_next linkage.  The pointer is legal if
 * it is NULL, falls within the mbuf map, or (when pattern verification
 * is enabled) equals the free-fill pattern; anything else means the
 * buffer was modified after being freed.
 */
static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
	if (next != NULL && !MBUF_IN_MAP(next) &&
	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
		mcl_audit_verify_nextptr_panic(next, mca);
	}
}
7715
/*
 * Scramble the bits of a word so that nearby inputs map to well-spread
 * outputs; used as the final mixing step of the mleak hash functions
 * below.  The add/xor/shift sequence is sized to the pointer width.
 */
static uintptr_t
hash_mix(uintptr_t h)
{
#ifndef __LP64__
	h += ~(h << 15);
	h ^= (h >> 10);
	h += (h << 3);
	h ^= (h >> 6);
	h += ~(h << 11);
	h ^= (h >> 16);
#else
	h += ~(h << 32);
	h ^= (h >> 22);
	h += ~(h << 13);
	h ^= (h >> 8);
	h += (h << 3);
	h ^= (h >> 15);
	h += ~(h << 27);
	h ^= (h >> 31);
#endif
	return h;
}
7738
/*
 * Hash a captured backtrace into a bucket index in [0, max_size).
 * max_size must be a power of two, since the result is masked.
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t sum = 0;
	uintptr_t mask = max_size - 1;
	uint32_t i;

	/* Order-independent sum of the frames (unsigned wrap is fine) */
	for (i = 0; i < depth; i++) {
		sum += bt[i];
	}

	uintptr_t bucket = hash_mix(sum) & mask;

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
7755
/*
 * Hash a single address into a bucket index in [0, max_size).
 * max_size must be a power of two, since the result is masked.
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t bucket;

	bucket = hash_mix(pt) & ((uintptr_t)max_size - 1);
	assert(bucket < max_size);

	return (uint32_t) bucket;
}
7768
/* This function turns on mbuf leak detection */
static void
mleak_activate(void)
{
	/* Sample one allocation in every mleak_sample_factor; boot-arg tunable */
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	/* A zero sample factor disables leak detection entirely */
	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	/* Permanently allocate the allocation/trace tables and stat buffer */
	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
7800
/*
 * Leak-detection entry point from the mbuf allocator.  Frees are always
 * processed (to clear any matching records); allocations are sampled at
 * a rate of one per mleak_sample_factor, with a backtrace captured for
 * each sampled allocation.
 */
static void
mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
{
	int temp;

	if (mclfindleak == 0) {
		return;
	}

	if (!alloc) {
		return mleak_free(addr);
	}

	/* Global sample counter; wrap/races are harmless for sampling */
	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);

	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
		uintptr_t bt[MLEAK_STACK_DEPTH];
		unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
		mleak_log(bt, addr, logged, num);
	}
}
7822
7823 /*
7824 * This function records the allocation in the mleak_allocations table
7825 * and the backtrace in the mleak_traces table; if allocation slot is in use,
7826 * replace old allocation with new one if the trace slot is in use, return
7827 * (or increment refcount if same trace).
7828 */
/*
 * Record one sampled allocation: bump the hit counts on its hashed
 * allocation and trace buckets, store the backtrace (or bump its
 * refcount if the identical trace is already there), then store the
 * allocation record.  Returns FALSE only on lock contention; hash
 * collisions are counted in the stats but still reported as success.
 */
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		return FALSE;
	}

	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	}

	/*
	 * Step 1: Store the backtrace in the traces array;
	 * Size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash! Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation. No need to preserve
		 * because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return TRUE;
}
7911
/*
 * Remove leak-tracking state for each object in a freed chain: if the
 * allocation bucket still records this address, drop the refcount on
 * its trace bucket (clearing the bucket's depth at zero) and release
 * the allocation bucket.  An unlocked peek avoids taking the lock for
 * untracked objects; the state is re-validated under the lock.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		/* Unlocked peek; re-checked under the lock below */
		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
7942
7943 static void
7944 mleak_sort_traces()
7945 {
7946 int i, j, k;
7947 struct mtrace *swap;
7948
7949 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7950 mleak_top_trace[i] = NULL;
7951 }
7952
7953 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
7954 if (mleak_traces[i].allocs <= 0) {
7955 continue;
7956 }
7957
7958 mleak_top_trace[j] = &mleak_traces[i];
7959 for (k = j; k > 0; k--) {
7960 if (mleak_top_trace[k]->allocs <=
7961 mleak_top_trace[k - 1]->allocs) {
7962 break;
7963 }
7964
7965 swap = mleak_top_trace[k - 1];
7966 mleak_top_trace[k - 1] = mleak_top_trace[k];
7967 mleak_top_trace[k] = swap;
7968 }
7969 j++;
7970 }
7971
7972 j--;
7973 for (; i < mleak_trace_buckets; i++) {
7974 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
7975 continue;
7976 }
7977
7978 mleak_top_trace[j] = &mleak_traces[i];
7979
7980 for (k = j; k > 0; k--) {
7981 if (mleak_top_trace[k]->allocs <=
7982 mleak_top_trace[k - 1]->allocs) {
7983 break;
7984 }
7985
7986 swap = mleak_top_trace[k - 1];
7987 mleak_top_trace[k - 1] = mleak_top_trace[k];
7988 mleak_top_trace[k] = swap;
7989 }
7990 }
7991 }
7992
7993 static void
7994 mleak_update_stats()
7995 {
7996 mleak_trace_stat_t *mltr;
7997 int i;
7998
7999 VERIFY(mleak_stat != NULL);
8000 #ifdef __LP64__
8001 VERIFY(mleak_stat->ml_isaddr64);
8002 #else
8003 VERIFY(!mleak_stat->ml_isaddr64);
8004 #endif /* !__LP64__ */
8005 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
8006
8007 mleak_sort_traces();
8008
8009 mltr = &mleak_stat->ml_trace[0];
8010 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
8011 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
8012 int j;
8013
8014 if (mleak_top_trace[i] == NULL ||
8015 mleak_top_trace[i]->allocs == 0) {
8016 continue;
8017 }
8018
8019 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
8020 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
8021 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
8022 mltr->mltr_depth = mleak_top_trace[i]->depth;
8023
8024 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
8025 for (j = 0; j < mltr->mltr_depth; j++) {
8026 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
8027 }
8028
8029 mltr++;
8030 }
8031 }
8032
/*
 * Human-readable names for the mbuf allocation types (MT_*), used by
 * mbuf_dump() to report per-type usage; terminated by a NULL name.
 */
static struct mbtypes {
	int mt_type;			/* MT_* type code */
	const char *mt_name;		/* description for the report */
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }
};
8054
/*
 * Advance the mbuf_dump() cursor after each scnprintf(): subtract the
 * bytes written (k) from the remaining length (clen) and advance the
 * buffer pointer (c); bail out to "done" once the buffer is full.
 */
#define MBUF_DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}
8061
8062 static char *
8063 mbuf_dump(void)
8064 {
8065 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
8066 totreturned = 0;
8067 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
8068 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
8069 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
8070 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
8071 uint8_t seen[256];
8072 struct mbtypes *mp;
8073 mb_class_stat_t *sp;
8074 mleak_trace_stat_t *mltr;
8075 char *c = mbuf_dump_buf;
8076 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
8077 struct mbuf_watchdog_defunct_args args = {};
8078
8079 mbuf_dump_buf[0] = '\0';
8080
8081 /* synchronize all statistics in the mbuf table */
8082 mbuf_stat_sync();
8083 mbuf_mtypes_sync(TRUE);
8084
8085 sp = &mb_stat->mbs_class[0];
8086 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
8087 u_int32_t mem;
8088
8089 if (m_class(i) == MC_MBUF) {
8090 m_mbufs = sp->mbcl_active;
8091 } else if (m_class(i) == MC_CL) {
8092 m_clfree = sp->mbcl_total - sp->mbcl_active;
8093 } else if (m_class(i) == MC_BIGCL) {
8094 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
8095 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
8096 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
8097 m_16kclusters = sp->mbcl_total;
8098 } else if (m_class(i) == MC_MBUF_CL) {
8099 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
8100 } else if (m_class(i) == MC_MBUF_BIGCL) {
8101 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
8102 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
8103 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
8104 }
8105
8106 mem = sp->mbcl_ctotal * sp->mbcl_size;
8107 totmem += mem;
8108 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
8109 sp->mbcl_size;
8110 totreturned += sp->mbcl_release_cnt;
8111 }
8112
8113 /* adjust free counts to include composite caches */
8114 m_clfree += m_mbufclfree;
8115 m_bigclfree += m_mbufbigclfree;
8116 m_16kclfree += m_mbuf16kclfree;
8117
8118 totmbufs = 0;
8119 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8120 totmbufs += mbstat.m_mtypes[mp->mt_type];
8121 }
8122 if (totmbufs > m_mbufs) {
8123 totmbufs = m_mbufs;
8124 }
8125 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
8126 MBUF_DUMP_BUF_CHK();
8127
8128 bzero(&seen, sizeof(seen));
8129 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8130 if (mbstat.m_mtypes[mp->mt_type] != 0) {
8131 seen[mp->mt_type] = 1;
8132 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
8133 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
8134 MBUF_DUMP_BUF_CHK();
8135 }
8136 }
8137 seen[MT_FREE] = 1;
8138 for (i = 0; i < nmbtypes; i++) {
8139 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
8140 k = scnprintf(c, clen, "\t%u mbufs allocated to "
8141 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
8142 MBUF_DUMP_BUF_CHK();
8143 }
8144 }
8145 if ((m_mbufs - totmbufs) > 0) {
8146 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
8147 m_mbufs - totmbufs);
8148 MBUF_DUMP_BUF_CHK();
8149 }
8150 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
8151 "%u/%u mbuf 4KB clusters in use\n",
8152 (unsigned int)(mbstat.m_clusters - m_clfree),
8153 (unsigned int)mbstat.m_clusters,
8154 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
8155 (unsigned int)mbstat.m_bigclusters);
8156 MBUF_DUMP_BUF_CHK();
8157
8158 if (njcl > 0) {
8159 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
8160 m_16kclusters - m_16kclfree, m_16kclusters,
8161 njclbytes / 1024);
8162 MBUF_DUMP_BUF_CHK();
8163 }
8164 totused = totmem - totfree;
8165 if (totmem == 0) {
8166 totpct = 0;
8167 } else if (totused < (ULONG_MAX / 100)) {
8168 totpct = (totused * 100) / totmem;
8169 } else {
8170 u_long totmem1 = totmem / 100;
8171 u_long totused1 = totused / 100;
8172 totpct = (totused1 * 100) / totmem1;
8173 }
8174 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
8175 "in use)\n", totmem / 1024, totpct);
8176 MBUF_DUMP_BUF_CHK();
8177 k = scnprintf(c, clen, "%lu KB returned to the system\n",
8178 totreturned / 1024);
8179 MBUF_DUMP_BUF_CHK();
8180
8181 net_update_uptime();
8182
8183 k = scnprintf(c, clen,
8184 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
8185 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
8186 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
8187 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
8188 mb_expand_16kcl_total);
8189 MBUF_DUMP_BUF_CHK();
8190 if (mbuf_worker_last_runtime != 0) {
8191 k = scnprintf(c, clen, "worker thread last run time: "
8192 "%llu (%llu seconds ago)\n",
8193 mbuf_worker_last_runtime,
8194 net_uptime() - mbuf_worker_last_runtime);
8195 MBUF_DUMP_BUF_CHK();
8196 }
8197 if (mbuf_drain_last_runtime != 0) {
8198 k = scnprintf(c, clen, "drain routine last run time: "
8199 "%llu (%llu seconds ago)\n",
8200 mbuf_drain_last_runtime,
8201 net_uptime() - mbuf_drain_last_runtime);
8202 MBUF_DUMP_BUF_CHK();
8203 }
8204
8205 /*
8206 * Log where the most mbufs have accumulated:
8207 * - Process socket buffers
8208 * - TCP reassembly queue
8209 * - Interface AQM queue (output) and DLIL input queue
8210 */
8211 args.non_blocking = true;
8212 proc_iterate(PROC_ALLPROCLIST,
8213 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
8214 if (args.top_app != NULL) {
8215 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
8216 args.top_app_space_used,
8217 proc_name_address(args.top_app),
8218 proc_pid(args.top_app));
8219 proc_rele(args.top_app);
8220 }
8221 MBUF_DUMP_BUF_CHK();
8222
8223 #if INET
8224 k = dump_tcp_reass_qlen(c, clen);
8225 MBUF_DUMP_BUF_CHK();
8226 #endif /* INET */
8227
8228 #if MPTCP
8229 k = dump_mptcp_reass_qlen(c, clen);
8230 MBUF_DUMP_BUF_CHK();
8231 #endif /* MPTCP */
8232
8233 #if NETWORKING
8234 k = dlil_dump_top_if_qlen(c, clen);
8235 MBUF_DUMP_BUF_CHK();
8236 #endif /* NETWORKING */
8237
8238 /* mbuf leak detection statistics */
8239 mleak_update_stats();
8240
8241 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
8242 MBUF_DUMP_BUF_CHK();
8243 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
8244 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
8245 mleak_table.mleak_sample_factor);
8246 MBUF_DUMP_BUF_CHK();
8247 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
8248 mleak_table.outstanding_allocs);
8249 MBUF_DUMP_BUF_CHK();
8250 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
8251 mleak_table.alloc_recorded, mleak_table.trace_recorded);
8252 MBUF_DUMP_BUF_CHK();
8253 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
8254 mleak_table.alloc_collisions, mleak_table.trace_collisions);
8255 MBUF_DUMP_BUF_CHK();
8256 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
8257 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
8258 MBUF_DUMP_BUF_CHK();
8259 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
8260 mleak_table.total_conflicts);
8261 MBUF_DUMP_BUF_CHK();
8262
8263 k = scnprintf(c, clen, "top %d outstanding traces:\n",
8264 mleak_stat->ml_cnt);
8265 MBUF_DUMP_BUF_CHK();
8266 for (i = 0; i < mleak_stat->ml_cnt; i++) {
8267 mltr = &mleak_stat->ml_trace[i];
8268 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
8269 "%llu hit(s), %llu collision(s)\n", (i + 1),
8270 mltr->mltr_allocs, mltr->mltr_hitcount,
8271 mltr->mltr_collisions);
8272 MBUF_DUMP_BUF_CHK();
8273 }
8274
8275 if (mleak_stat->ml_isaddr64) {
8276 k = scnprintf(c, clen, MB_LEAK_HDR_64);
8277 } else {
8278 k = scnprintf(c, clen, MB_LEAK_HDR_32);
8279 }
8280 MBUF_DUMP_BUF_CHK();
8281
8282 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
8283 k = scnprintf(c, clen, "%2d: ", (i + 1));
8284 MBUF_DUMP_BUF_CHK();
8285 for (j = 0; j < mleak_stat->ml_cnt; j++) {
8286 mltr = &mleak_stat->ml_trace[j];
8287 if (i < mltr->mltr_depth) {
8288 if (mleak_stat->ml_isaddr64) {
8289 k = scnprintf(c, clen, "0x%0llx ",
8290 (uint64_t)VM_KERNEL_UNSLIDE(
8291 mltr->mltr_addr[i]));
8292 } else {
8293 k = scnprintf(c, clen,
8294 "0x%08x ",
8295 (uint32_t)VM_KERNEL_UNSLIDE(
8296 mltr->mltr_addr[i]));
8297 }
8298 } else {
8299 if (mleak_stat->ml_isaddr64) {
8300 k = scnprintf(c, clen,
8301 MB_LEAK_SPACING_64);
8302 } else {
8303 k = scnprintf(c, clen,
8304 MB_LEAK_SPACING_32);
8305 }
8306 }
8307 MBUF_DUMP_BUF_CHK();
8308 }
8309 k = scnprintf(c, clen, "\n");
8310 MBUF_DUMP_BUF_CHK();
8311 }
8312
8313 done:
8314 return mbuf_dump_buf;
8315 }
8316
8317 #undef MBUF_DUMP_BUF_CHK
8318
8319 /*
8320 * Convert between a regular and a packet header mbuf. Caller is responsible
8321 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
8322 */
/*
 * hdr != 0: convert a plain mbuf into a packet header mbuf; fails with
 * EBUSY if existing data would overlap the packet header area.
 * hdr == 0: strip the packet header, releasing tags and aux data first.
 * Returns 0 on success, EBUSY on failure.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
			    "m_data %llx (expected %llx), "
			    "m_len %d (expected 0)\n",
			    __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
			m->m_flags |= M_PKTHDR;
			MBUF_INIT_PKTHDR(m);
		}
	} else {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m, NULL);
		m_do_tx_compl_callback(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return ret;
}
8363
/*
 * Atomically update the external cluster's private property word from
 * o to n via atomic_test_set_32(); requires an attached cluster (M_EXT).
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return atomic_test_set_32(&MEXT_PRIV(m), o, n);
}
8370
/*
 * Read the external cluster's private property word (set via
 * m_ext_set_prop()); requires an attached cluster (M_EXT).
 */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
8377
/*
 * Report whether a paired mbuf is currently active, i.e. its pairing
 * refcount (MEXT_PREF) has been bumped above the minimum.  Non-paired
 * mbufs always report active.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
8383
/*
 * (Re)activate a paired mbuf whose pairing refcount has dropped back to
 * the minimum: reinitialize the mbuf and its external-ref state while
 * preserving the buffer, free routine, size, rfa and private property.
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Stash the fields that must survive MBUF_INIT() below */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time.  Similar to
	 * what's done in m_clattach() for the cluster.  Bump
	 * up MEXT_PREF to indicate activation.
	 */
	MBUF_INIT(m, hdr, type);
	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
8416
/*
 * Zero the module-private scratch area (pkt_mpriv) in the packet
 * header.  Panics if the area is currently guarded by an in-kernel
 * owner (PKTF_PRIV_GUARDED), since it must not be modified out from
 * under that owner.
 */
void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
8433
8434 /*
8435 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
8436 * xnu that intend on utilizing the module-private area should directly
8437 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
8438 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
8439 * to handing it off to another module, respectively.
8440 */
/*
 * Return (via p) a pointer to the packet header's module-private
 * scratch area and its size.  Panics if the area is guarded
 * (PKTF_PRIV_GUARDED); see the block comment above.
 */
u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	/* When tracing is enabled, record this access in the audit log */
	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID) {
			mcl_audit_scratch(mca);
		}
		lck_mtx_unlock(mbuf_mlock);
	}

	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
8469
8470 void
8471 m_add_crumb(struct mbuf *m, uint16_t crumb)
8472 {
8473 VERIFY(m->m_flags & M_PKTHDR);
8474
8475 m->m_pkthdr.pkt_crumbs |= crumb;
8476 }
8477
/*
 * Arm the packet-header red zone, later checked by m_redzone_verify()
 * (e.g. when stripping the packet header in m_reinit()).
 */
static void
m_redzone_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/*
	 * Each mbuf has a unique red zone pattern, which is a XOR
	 * of the red zone cookie and the address of the mbuf.
	 */
	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
}
8488
/*
 * Verify the packet-header red zone; panics if it no longer matches the
 * per-mbuf pattern set by m_redzone_init() (e.g. after a scratch-area
 * overflow).
 */
static void
m_redzone_verify(struct mbuf *m)
{
	u_int32_t mb_redzone;

	VERIFY(m->m_flags & M_PKTHDR);

	/* Recompute the expected per-mbuf pattern and compare */
	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	if (m->m_pkthdr.redzone != mb_redzone) {
		panic("mbuf %p redzone violation with value 0x%x "
		    "(instead of 0x%x, using cookie 0x%x)\n",
		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
		/* NOTREACHED */
	}
}
8504
/*
 * Install the external-buffer bookkeeping for an M_EXT mbuf, obscuring
 * the stored pointers by XOR-ing them with cookies: ext_refflags with
 * the global mb_obscure_extref, and ext_free/ext_arg with the per-rfa
 * ext_token (or the global mb_obscure_extfree when no rfa is given).
 * Read them back with m_get_rfa()/m_get_ext_free()/m_get_ext_arg().
 */
__private_extern__ inline void
m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
    caddr_t ext_arg)
{
	VERIFY(m->m_flags & M_EXT);
	if (rfa != NULL) {
		m->m_ext.ext_refflags =
		    (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
		if (ext_free != NULL) {
			/* Per-rfa token derived from its own address */
			rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
			    mb_obscure_extfree;
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			rfa->ext_token = 0;
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
	} else {
		/*
		 * If we are going to lose the cookie in ext_token by
		 * resetting the rfa, we should use the global cookie
		 * to obscure the ext_free and ext_arg pointers.
		 */
		if (ext_free != NULL) {
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)((uintptr_t)ext_arg ^
				    mb_obscure_extfree);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
		m->m_ext.ext_refflags = NULL;
	}
}
8552
8553 __private_extern__ inline struct ext_ref *
8554 m_get_rfa(struct mbuf *m)
8555 {
8556 if (m->m_ext.ext_refflags == NULL) {
8557 return NULL;
8558 } else {
8559 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
8560 }
8561 }
8562
/*
 * Return the de-obscured external-free routine for an M_EXT mbuf, or
 * NULL if none is set; undoes the XOR applied by m_set_ext(), using
 * the rfa's ext_token when an rfa exists, else the global cookie.
 */
__private_extern__ inline m_ext_free_func_t
m_get_ext_free(struct mbuf *m)
{
	struct ext_ref *rfa;
	if (m->m_ext.ext_free == NULL) {
		return NULL;
	}

	rfa = m_get_rfa(m);
	if (rfa == NULL) {
		uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
		return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	} else {
		uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
		return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	}
}
8580
/*
 * Return the de-obscured external-free argument for an M_EXT mbuf, or
 * NULL if none is set; undoes the XOR applied by m_set_ext(), using
 * the rfa's ext_token when an rfa exists, else the global cookie.
 */
__private_extern__ inline caddr_t
m_get_ext_arg(struct mbuf *m)
{
	struct ext_ref *rfa;
	if (m->m_ext.ext_arg == NULL) {
		return NULL;
	}

	rfa = m_get_rfa(m);
	if (rfa == NULL) {
		return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
	} else {
		return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
		       rfa->ext_token);
	}
}
8597
8598 /*
8599 * Send a report of mbuf usage if the usage is at least 6% of max limit
8600 * or if there has been at least 3% increase since the last report.
8601 *
8602 * The values 6% and 3% are chosen so that we can do simple arithmetic
8603 * with shift operations.
8604 */
8605 static boolean_t
8606 mbuf_report_usage(mbuf_class_t cl)
8607 {
8608 /* if a report is already in progress, nothing to do */
8609 if (mb_peak_newreport) {
8610 return TRUE;
8611 }
8612
8613 if (m_total(cl) > m_peak(cl) &&
8614 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
8615 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) {
8616 return TRUE;
8617 }
8618 return FALSE;
8619 }
8620
/*
 * Publish peak mbuf usage via nstat.  Forces an initial report after
 * one week of uptime; otherwise reports only when mb_peak_newreport is
 * set (see mbuf_report_usage()).  Per-class peaks are updated under
 * the mbuf lock, then the snapshot is sent outside the lock.
 */
__private_extern__ void
mbuf_report_peak_usage(void)
{
	int i = 0;
	u_int64_t uptime;
	struct nstat_sysinfo_data ns_data;
	uint32_t memreleased = 0;
	static uint32_t prevmemreleased;

	uptime = net_uptime();
	lck_mtx_lock(mbuf_mlock);

	/* Generate an initial report after 1 week of uptime */
	if (!mb_peak_firstreport &&
	    uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
		mb_peak_newreport = TRUE;
		mb_peak_firstreport = TRUE;
	}

	if (!mb_peak_newreport) {
		lck_mtx_unlock(mbuf_mlock);
		return;
	}

	/*
	 * Since a report is being generated before 1 week,
	 * we do not need to force another one later
	 */
	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
		mb_peak_firstreport = TRUE;
	}

	/* Record new per-class peaks; report releases as a delta */
	for (i = 0; i < NELEM(mbuf_table); i++) {
		m_peak(m_class(i)) = m_total(m_class(i));
		memreleased += m_release_cnt(i);
	}
	memreleased = memreleased - prevmemreleased;
	prevmemreleased = memreleased;
	mb_peak_newreport = FALSE;
	lck_mtx_unlock(mbuf_mlock);

	bzero(&ns_data, sizeof(ns_data));
	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
	ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
	ns_data.u.mb_stats.memreleased = memreleased;
	ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;

	nstat_sysinfo_send_data(&ns_data);

	/*
	 * Reset the floor whenever we report a new
	 * peak to track the trend (increase peek usage
	 * is not a leak if mbufs get released
	 * between reports and the floor stays low)
	 */
	total_sbmb_cnt_floor = total_sbmb_cnt_peak;
}
8684
8685 /*
8686 * Simple routine to avoid taking the lock when we can't run the
8687 * mbuf drain.
8688 */
8689 static int
8690 mbuf_drain_checks(boolean_t ignore_waiters)
8691 {
8692 if (mb_drain_maxint == 0) {
8693 return 0;
8694 }
8695 if (!ignore_waiters && mb_waiters != 0) {
8696 return 0;
8697 }
8698
8699 return 1;
8700 }
8701
8702 /*
8703 * Called by the VM when there's memory pressure or when we exhausted
8704 * the 4k/16k reserved space.
8705 */
8706 static void
8707 mbuf_drain_locked(boolean_t ignore_waiters)
8708 {
8709 mbuf_class_t mc;
8710 mcl_slab_t *sp, *sp_tmp, *nsp;
8711 unsigned int num, k, interval, released = 0;
8712 unsigned long total_mem = 0, use_mem = 0;
8713 boolean_t ret, purge_caches = FALSE;
8714 ppnum_t offset;
8715 mcache_obj_t *obj;
8716 unsigned long per;
8717 static unsigned char scratch[32];
8718 static ppnum_t scratch_pa = 0;
8719
8720 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8721 if (!mbuf_drain_checks(ignore_waiters)) {
8722 return;
8723 }
8724 if (scratch_pa == 0) {
8725 bzero(scratch, sizeof(scratch));
8726 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8727 VERIFY(scratch_pa);
8728 } else if (mclverify) {
8729 /*
8730 * Panic if a driver wrote to our scratch memory.
8731 */
8732 for (k = 0; k < sizeof(scratch); k++) {
8733 if (scratch[k]) {
8734 panic("suspect DMA to freed address");
8735 }
8736 }
8737 }
8738 /*
8739 * Don't free memory too often as that could cause excessive
8740 * waiting times for mbufs. Purge caches if we were asked to drain
8741 * in the last 5 minutes.
8742 */
8743 if (mbuf_drain_last_runtime != 0) {
8744 interval = net_uptime() - mbuf_drain_last_runtime;
8745 if (interval <= mb_drain_maxint) {
8746 return;
8747 }
8748 if (interval <= mb_drain_maxint * 5) {
8749 purge_caches = TRUE;
8750 }
8751 }
8752 mbuf_drain_last_runtime = net_uptime();
8753 /*
8754 * Don't free any memory if we're using 60% or more.
8755 */
8756 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8757 total_mem += m_total(mc) * m_maxsize(mc);
8758 use_mem += m_active(mc) * m_maxsize(mc);
8759 }
8760 per = (use_mem * 100) / total_mem;
8761 if (per >= 60) {
8762 return;
8763 }
8764 /*
8765 * Purge all the caches. This effectively disables
8766 * caching for a few seconds, but the mbuf worker thread will
8767 * re-enable them again.
8768 */
8769 if (purge_caches == TRUE) {
8770 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8771 if (m_total(mc) < m_avgtotal(mc)) {
8772 continue;
8773 }
8774 lck_mtx_unlock(mbuf_mlock);
8775 ret = mcache_purge_cache(m_cache(mc), FALSE);
8776 lck_mtx_lock(mbuf_mlock);
8777 if (ret == TRUE) {
8778 m_purge_cnt(mc)++;
8779 }
8780 }
8781 }
8782 /*
8783 * Move the objects from the composite class freelist to
8784 * the rudimentary slabs list, but keep at least 10% of the average
8785 * total in the freelist.
8786 */
8787 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8788 while (m_cobjlist(mc) &&
8789 m_total(mc) < m_avgtotal(mc) &&
8790 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8791 obj = m_cobjlist(mc);
8792 m_cobjlist(mc) = obj->obj_next;
8793 obj->obj_next = NULL;
8794 num = cslab_free(mc, obj, 1);
8795 VERIFY(num == 1);
8796 m_free_cnt(mc)++;
8797 m_infree(mc)--;
8798 /* cslab_free() handles m_total */
8799 }
8800 }
8801 /*
8802 * Free the buffers present in the slab list up to 10% of the total
8803 * average per class.
8804 *
8805 * We walk the list backwards in an attempt to reduce fragmentation.
8806 */
8807 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8808 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8809 /*
8810 * Process only unused slabs occupying memory.
8811 */
8812 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8813 sp->sl_base == NULL) {
8814 continue;
8815 }
8816 if (m_total(mc) < m_avgtotal(mc) ||
8817 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8818 break;
8819 }
8820 slab_remove(sp, mc);
8821 switch (mc) {
8822 case MC_MBUF:
8823 m_infree(mc) -= NMBPG;
8824 m_total(mc) -= NMBPG;
8825 if (mclaudit != NULL) {
8826 mcl_audit_free(sp->sl_base, NMBPG);
8827 }
8828 break;
8829 case MC_CL:
8830 m_infree(mc) -= NCLPG;
8831 m_total(mc) -= NCLPG;
8832 if (mclaudit != NULL) {
8833 mcl_audit_free(sp->sl_base, NMBPG);
8834 }
8835 break;
8836 case MC_BIGCL:
8837 {
8838 m_infree(mc) -= NBCLPG;
8839 m_total(mc) -= NBCLPG;
8840 if (mclaudit != NULL) {
8841 mcl_audit_free(sp->sl_base, NMBPG);
8842 }
8843 break;
8844 }
8845 case MC_16KCL:
8846 m_infree(mc)--;
8847 m_total(mc)--;
8848 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8849 nsp = nsp->sl_next;
8850 VERIFY(nsp->sl_refcnt == 0 &&
8851 nsp->sl_base != NULL &&
8852 nsp->sl_len == 0);
8853 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8854 0);
8855 nsp->sl_flags = 0;
8856 }
8857 if (mclaudit != NULL) {
8858 if (sp->sl_len == PAGE_SIZE) {
8859 mcl_audit_free(sp->sl_base,
8860 NMBPG);
8861 } else {
8862 mcl_audit_free(sp->sl_base, 1);
8863 }
8864 }
8865 break;
8866 default:
8867 /*
8868 * The composite classes have their own
8869 * freelist (m_cobjlist), so we only
8870 * process rudimentary classes here.
8871 */
8872 VERIFY(0);
8873 }
8874 m_release_cnt(mc) += m_size(mc);
8875 released += m_size(mc);
8876 VERIFY(sp->sl_base != NULL &&
8877 sp->sl_len >= PAGE_SIZE);
8878 offset = MTOPG(sp->sl_base);
8879 /*
8880 * Make sure the IOMapper points to a valid, but
8881 * bogus, address. This should prevent further DMA
8882 * accesses to freed memory.
8883 */
8884 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8885 mcl_paddr[offset] = 0;
8886 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8887 sp->sl_len);
8888 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8889 sp->sl_flags = 0;
8890 }
8891 }
8892 mbstat.m_drain++;
8893 mbstat.m_bigclusters = m_total(MC_BIGCL);
8894 mbstat.m_clusters = m_total(MC_CL);
8895 mbstat.m_mbufs = m_total(MC_MBUF);
8896 mbuf_stat_sync();
8897 mbuf_mtypes_sync(TRUE);
8898 }
8899
8900 __private_extern__ void
8901 mbuf_drain(boolean_t ignore_waiters)
8902 {
8903 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
8904 if (!mbuf_drain_checks(ignore_waiters)) {
8905 return;
8906 }
8907 lck_mtx_lock(mbuf_mlock);
8908 mbuf_drain_locked(ignore_waiters);
8909 lck_mtx_unlock(mbuf_mlock);
8910 }
8911
8912
8913 static int
8914 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8915 {
8916 #pragma unused(arg1, arg2)
8917 int val = 0, err;
8918
8919 err = sysctl_handle_int(oidp, &val, 0, req);
8920 if (err != 0 || req->newptr == USER_ADDR_NULL) {
8921 return err;
8922 }
8923 if (val) {
8924 mbuf_drain(TRUE);
8925 }
8926
8927 return err;
8928 }
8929
#if DEBUG || DEVELOPMENT
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	/*
	 * Append a timestamped, caller-tagged message to the mbuf watchdog
	 * log buffer.  Must be called with mbuf_mlock held, which also
	 * serializes access to mbwdog_logging / mbwdog_logging_used.
	 */
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Render the caller's message, then prepend time/pid/thread/site. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		/*
		 * Buffer would overflow: drop the older half of the log by
		 * sliding the newer half to the front, keeping recent entries.
		 */
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	/*
	 * NOTE(review): if strlcat() still truncates here, _used can drift
	 * past the actual string length until the next halving — looks
	 * benign since _used is only a fill estimate; confirm.
	 */
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}

#endif // DEBUG || DEVELOPMENT
8973
8974 static void
8975 mtracelarge_register(size_t size)
8976 {
8977 int i;
8978 struct mtracelarge *trace;
8979 uintptr_t bt[MLEAK_STACK_DEPTH];
8980 unsigned int depth;
8981
8982 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
8983 /* Check if this entry is already on the list. */
8984 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8985 trace = &mtracelarge_table[i];
8986 if (trace->size == size && trace->depth == depth &&
8987 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8988 return;
8989 }
8990 }
8991 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8992 trace = &mtracelarge_table[i];
8993 if (size > trace->size) {
8994 trace->depth = depth;
8995 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8996 trace->size = size;
8997 break;
8998 }
8999 }
9000 }
9001
9002 #if DEBUG || DEVELOPMENT
9003
9004 static int
9005 mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
9006 {
9007 char *str;
9008
9009 ifnet_head_lock_shared();
9010 lck_mtx_lock(mbuf_mlock);
9011
9012 str = mbuf_dump();
9013
9014 lck_mtx_unlock(mbuf_mlock);
9015 ifnet_head_done();
9016
9017 return sysctl_io_string(req, str, 0, 0, NULL);
9018 }
9019
9020 #endif /* DEBUG || DEVELOPMENT */
9021
SYSCTL_DECL(_kern_ipc);
#if DEBUG || DEVELOPMENT
#if SKYWALK
/* Tunable: scale-down factor applied to mbuf cache thresholds. */
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK */
/* Read-only: textual dump of mbuf state from mbuf_wd_dump_sysctl(). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif
/* Legacy BSD mbuf statistics (struct mbstat). */
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
/* Per-class mbuf cache statistics. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
/* Top allocation backtraces tracked by the mbuf leak detector. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
/* Full mbuf leak-detector table. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
/* Tunable: sampling factor for the leak detector. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
/* Write non-zero to force a drain via m_drain_force_sysctl(). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
/* Tunable read by mbuf_drain_checks()/mbuf_drain_locked(); 0 disables. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
    "Percentage of when we trigger memory-pressure for an mbuf-class");
9062