1 /*
2 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <dev/random/randomdev.h>
87
88 #include <kern/kern_types.h>
89 #include <kern/simple_lock.h>
90 #include <kern/queue.h>
91 #include <kern/sched_prim.h>
92 #include <kern/backtrace.h>
93 #include <kern/percpu.h>
94 #include <kern/zalloc.h>
95
96 #include <libkern/OSAtomic.h>
97 #include <libkern/OSDebug.h>
98 #include <libkern/libkern.h>
99
100 #include <os/log.h>
101 #include <os/ptrtools.h>
102
103 #include <IOKit/IOMapper.h>
104
105 #include <machine/limits.h>
106 #include <machine/machine_routines.h>
107
108 #include <sys/mcache.h>
109 #include <net/ntstat.h>
110
/*
 * Helpers implemented outside this file, declared here because no shared
 * header provides them.  Each declaration is only visible when the matching
 * feature is configured.
 * NOTE(review): the (char *, int) pairs look like a buffer and its size --
 * confirm against the definitions.
 */
#if INET
extern int dump_tcp_reass_qlen(char *, int);
extern int tcp_reass_qlen_space(struct socket *);
#endif /* INET */

#if MPTCP
extern int dump_mptcp_reass_qlen(char *, int);
#endif /* MPTCP */


#if NETWORKING
extern int dlil_dump_top_if_qlen(char *, int);
#endif /* NETWORKING */
124
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *      This is a cache of rudimentary objects of MSIZE in size; each
 *      object represents an mbuf structure.  This cache preserves only
 *      the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *      This is a cache of rudimentary objects of MCLBYTES in size; each
 *      object represents a mcluster structure.  This cache does not
 *      preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *      object represents a mbigcluster structure.  This cache does not
 *      preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *      This is a cache of mbufs each having a cluster attached to it.
 *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *      fields of the mbuf related to the external cluster are preserved
 *      during transactions.
 *
 * MC_MBUF_BIGCL:
 *      This is a cache of mbufs each having a big cluster attached to it.
 *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *      fields of the mbuf related to the external cluster are preserved
 *      during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *      { m_get_common(), m_clattach(), m_mclget(),
 *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *        composite object allocation }
 *                      |       ^
 *                      |       |
 *                      |       +-----------------------+
 *                      v                               |
 *       mcache_alloc/mcache_alloc_ext()        mbuf_slab_audit()
 *                      |                               ^
 *                      v                               |
 *                 [CPU cache] -------> (found?) -------+
 *                      |                               |
 *                      v                               |
 *              mbuf_slab_alloc()                       |
 *                      |                               |
 *                      v                               |
 *      +---------> [freelist] -------> (found?) -------+
 *      |               |
 *      |               v
 *      |          m_clalloc()
 *      |               |
 *      |               v
 *      +---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *      { m_getpackets_internal(), m_allocpacket_internal() }
 *                      |       ^
 *                      |       |
 *                      |       +------ (done) ---------+
 *                      v                               |
 *       mcache_alloc/mcache_alloc_ext()       mbuf_cslab_audit()
 *                      |                               ^
 *                      v                               |
 *                 [CPU cache] -------> (found?) -------+
 *                      |                               |
 *                      v                               |
 *             mbuf_cslab_alloc()                       |
 *                      |                               |
 *                      v                               |
 *                  [freelist] -------> (found?) -------+
 *                      |                               |
 *                      v                               |
 *            (rudimentary object)                      |
 *       mcache_alloc/mcache_alloc_ext() -------->>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against DEADBEEF (free) pattern before returning them to caller.
 * As part of this step, the routine will also record the transaction and
 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 * also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *      { m_free(), m_freem_list(), composite object deallocation }
 *                      |       ^
 *                      |       |
 *                      |       +------ (done) ---------+
 *                      v                               |
 *        mcache_free/mcache_free_ext()                 |
 *                      |                               |
 *                      v                               |
 *              mbuf_slab_audit()                       |
 *                      |                               |
 *                      v                               |
 *                 [CPU cache] ---> (not purging?) -----+
 *                      |                               |
 *                      v                               |
 *              mbuf_slab_free()                        |
 *                      |                               |
 *                      v                               |
 *                  [freelist] ----------->>------------+
 *      (objects get purged to VM only on demand)
 *
 * b. Composite object:
 *
 *      { m_free(), m_freem_list() }
 *                      |       ^
 *                      |       |
 *                      |       +------ (done) ---------+
 *                      v                               |
 *        mcache_free/mcache_free_ext()                 |
 *                      |                               |
 *                      v                               |
 *             mbuf_cslab_audit()                       |
 *                      |                               |
 *                      v                               |
 *                 [CPU cache] ---> (not purging?) -----+
 *                      |                               |
 *                      v                               |
 *              mbuf_cslab_free()                       |
 *                      |                               |
 *                      v                               |
 *                  [freelist] ---> (not purging?) -----+
 *                      |                               |
 *                      v                               |
 *            (rudimentary object)                      |
 *        mcache_free/mcache_free_ext() -------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *       +------------+                 +=============+
 *       | mbuf addr  |                 | mclaudit[i] |
 *       +------------+                 +=============+
 *             |                        | cl_audit[0] |
 *      i = MTOBG(addr)                 +-------------+
 *             |                +-----> | cl_audit[1] | -----> mcache_audit_t
 *      b = BGTOM(i)            |       +-------------+
 *             |                |       |     ...     |
 *      x = MCLIDX(b, addr)     |       +-------------+
 *             |                |       | cl_audit[7] |
 *             +----------------+       +-------------+
 *               (e.g. x == 1)
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPG number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For 16KB cluster, only one entry from the first
 * page is allocated and used for the entire object.
 */
320
321 /* TODO: should be in header file */
322 /* kernel translater */
323 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
324 extern vm_map_t mb_map; /* special map */
325
326 static uint32_t mb_kmem_contig_failed;
327 static uint32_t mb_kmem_failed;
328 static uint32_t mb_kmem_one_failed;
329 /* Timestamp of allocation failures. */
330 static uint64_t mb_kmem_contig_failed_ts;
331 static uint64_t mb_kmem_failed_ts;
332 static uint64_t mb_kmem_one_failed_ts;
333 static uint64_t mb_kmem_contig_failed_size;
334 static uint64_t mb_kmem_failed_size;
335 static uint32_t mb_kmem_stats[6];
336
337 /* Global lock */
338 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
339 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
340 static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
341
342 /* Back-end (common) layer */
343 static uint64_t mb_expand_cnt;
344 static uint64_t mb_expand_cl_cnt;
345 static uint64_t mb_expand_cl_total;
346 static uint64_t mb_expand_bigcl_cnt;
347 static uint64_t mb_expand_bigcl_total;
348 static uint64_t mb_expand_16kcl_cnt;
349 static uint64_t mb_expand_16kcl_total;
350 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
351 static uint32_t mbuf_worker_run_cnt;
352 static uint64_t mbuf_worker_last_runtime;
353 static uint64_t mbuf_drain_last_runtime;
354 static int mbuf_worker_ready; /* worker thread is runnable */
355 static unsigned int ncpu; /* number of CPUs */
356 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
357 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
358 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
359 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
360 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
361 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
362 static unsigned int mb_normalized; /* number of packets "normalized" */
363
#define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
#define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */

/*
 * Object classes managed by the allocator.  The four rudimentary classes
 * come first; the composite (mbuf + cluster) classes follow, which is what
 * lets MBUF_CLASS_COMPOSITE() be a simple "greater than MBUF_CLASS_LAST"
 * comparison.  Keep that ordering when adding classes.
 */
typedef enum {
	MC_MBUF = 0,    /* Regular mbuf */
	MC_CL,          /* Cluster */
	MC_BIGCL,       /* Large (4KB) cluster */
	MC_16KCL,       /* Jumbo (16KB) cluster */
	MC_MBUF_CL,     /* mbuf + cluster */
	MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define MBUF_CLASS_MIN          MC_MBUF         /* first class */
#define MBUF_CLASS_MAX          MC_MBUF_16KCL   /* last class */
#define MBUF_CLASS_LAST         MC_16KCL        /* last rudimentary class */
/* True when c lies within [MBUF_CLASS_MIN, MBUF_CLASS_MAX]. */
#define MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
/* True when c is one of the composite classes. */
#define MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
384
385
386 /*
387 * mbuf specific mcache allocation request flags.
388 */
389 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
390
391 /*
392 * Per-cluster slab structure.
393 *
394 * A slab is a cluster control structure that contains one or more object
395 * chunks; the available chunks are chained in the slab's freelist (sl_head).
396 * Each time a chunk is taken out of the slab, the slab's reference count
397 * gets incremented. When all chunks have been taken out, the empty slab
398 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
399 * returned to a slab causes the slab's reference count to be decremented;
400 * it also causes the slab to be reinserted back to class's slab list, if
401 * it's not already done.
402 *
403 * Compartmentalizing of the object chunks into slabs allows us to easily
404 * merge one or more slabs together when the adjacent slabs are idle, as
405 * well as to convert or move a slab from one class to another; e.g. the
406 * mbuf cluster slab can be converted to a regular cluster slab when all
407 * mbufs in the slab have been freed.
408 *
409 * A slab may also span across multiple clusters for chunks larger than
410 * a cluster's size. In this case, only the slab of the first cluster is
411 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
412 * that they are part of the larger slab.
413 *
414 * Each slab controls a page of memory.
415 */
416 typedef struct mcl_slab {
417 struct mcl_slab *sl_next; /* neighboring slab */
418 u_int8_t sl_class; /* controlling mbuf class */
419 int8_t sl_refcnt; /* outstanding allocations */
420 int8_t sl_chunks; /* chunks (bufs) in this slab */
421 u_int16_t sl_flags; /* slab flags (see below) */
422 u_int16_t sl_len; /* slab length */
423 void *sl_base; /* base of allocated memory */
424 void *sl_head; /* first free buffer */
425 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
426 } mcl_slab_t;
427
428 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
429 #define SLF_PARTIAL 0x0002 /* part of another slab */
430 #define SLF_DETACHED 0x0004 /* not in slab freelist */
431
432 /*
433 * The array of slabs are broken into groups of arrays per 1MB of kernel
434 * memory to reduce the footprint. Each group is allocated on demand
435 * whenever a new piece of memory mapped in from the VM crosses the 1MB
436 * boundary.
437 */
438 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
439
440 typedef struct mcl_slabg {
441 mcl_slab_t *slg_slab; /* group of slabs */
442 } mcl_slabg_t;
443
444 /*
445 * Number of slabs needed to control a 16KB cluster object.
446 */
447 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
448
449 /*
450 * Per-cluster audit structure.
451 */
452 typedef struct {
453 mcache_audit_t **cl_audit; /* array of audits */
454 } mcl_audit_t;
455
456 typedef struct {
457 struct thread *msa_thread; /* thread doing transaction */
458 struct thread *msa_pthread; /* previous transaction thread */
459 uint32_t msa_tstamp; /* transaction timestamp (ms) */
460 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
461 uint16_t msa_depth; /* pc stack depth */
462 uint16_t msa_pdepth; /* previous transaction pc stack */
463 void *msa_stack[MCACHE_STACK_DEPTH];
464 void *msa_pstack[MCACHE_STACK_DEPTH];
465 } mcl_scratch_audit_t;
466
467 typedef struct {
468 /*
469 * Size of data from the beginning of an mbuf that covers m_hdr,
470 * pkthdr and m_ext structures. If auditing is enabled, we allocate
471 * a shadow mbuf structure of this size inside each audit structure,
472 * and the contents of the real mbuf gets copied into it when the mbuf
473 * is freed. This allows us to pattern-fill the mbuf for integrity
474 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
475 * cluster cache case). Note that we don't save the contents of
476 * clusters when they are freed; we simply pattern-fill them.
477 */
478 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof(_m_ext_t)];
479 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
480 } mcl_saved_contents_t;
481
482 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
483
484 #define MCA_SAVED_MBUF_PTR(_mca) \
485 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
486 (_mca)->mca_contents)->sc_mbuf)
487 #define MCA_SAVED_MBUF_SIZE \
488 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
489 #define MCA_SAVED_SCRATCH_PTR(_mca) \
490 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
491
/*
 * mbuf specific mcache audit flags
 */
#define MB_INUSE        0x01    /* object has not been returned to slab */
#define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
#define MB_SCVALID      0x04    /* object has valid saved contents */
498
499 /*
500 * Each of the following two arrays hold up to nmbclusters elements.
501 */
502 static mcl_audit_t *mclaudit; /* array of cluster audit information */
503 static unsigned int maxclaudit; /* max # of entries in audit table */
504 static mcl_slabg_t **slabstbl; /* cluster slabs table */
505 static unsigned int maxslabgrp; /* max # of entries in slabs table */
506 static unsigned int slabgrp; /* # of entries in slabs table */
507
508 /* Globals */
509 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
510 int njcl; /* # of clusters for jumbo sizes */
511 int njclbytes; /* size of a jumbo cluster */
512 unsigned char *mbutl; /* first mapped cluster address */
513 unsigned char *embutl; /* ending virtual address of mclusters */
514 int max_linkhdr; /* largest link-level header */
515 int max_protohdr; /* largest protocol header */
516 int max_hdr; /* largest link+protocol header */
517 int max_datalen; /* MHLEN - max_hdr */
518
519 static boolean_t mclverify; /* debug: pattern-checking */
520 static boolean_t mcltrace; /* debug: stack tracing */
521 static boolean_t mclfindleak; /* debug: leak detection */
522 static boolean_t mclexpleak; /* debug: expose leak info to user space */
523
524 static struct timeval mb_start; /* beginning of time */
525
526 /* mbuf leak detection variables */
527 static struct mleak_table mleak_table;
528 static mleak_stat_t *mleak_stat;
529
530 #define MLEAK_STAT_SIZE(n) \
531 __builtin_offsetof(mleak_stat_t, ml_trace[n])
532
533 struct mallocation {
534 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
535 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
536 u_int32_t count; /* How many objects were requested */
537 u_int64_t hitcount; /* for determining hash effectiveness */
538 };
539
540 struct mtrace {
541 u_int64_t collisions;
542 u_int64_t hitcount;
543 u_int64_t allocs;
544 u_int64_t depth;
545 uintptr_t addr[MLEAK_STACK_DEPTH];
546 };
547
/* Size must be a power of two for the zhash to be able to just mask off bits */
#define MLEAK_ALLOCATION_MAP_NUM        512
#define MLEAK_TRACE_MAP_NUM             256

/*
 * Sample factor for how often to record a trace.  This is overwritable
 * by the boot-arg mleak_sample_factor.
 */
#define MLEAK_SAMPLE_FACTOR             500

/*
 * Number of top leakers recorded.
 */
#define MLEAK_NUM_TRACES                5
562
/*
 * Padding strings and column headers for the five-trace leak report, in a
 * 32-bit and a 64-bit flavour.
 *
 * NOTE(review): this text was recovered from a listing that collapses runs
 * of whitespace, so the padding inside these literals is almost certainly
 * narrower than intended.  The listing's line numbers, which had ended up
 * inside the literals, are removed; the column widths must still be checked
 * against the upstream source before relying on the output layout.
 */
#define MB_LEAK_SPACING_64 " "
#define MB_LEAK_SPACING_32 " "


#define MB_LEAK_HDR_32 "\n\
trace [1] trace [2] trace [3] trace [4] trace [5] \n\
---------- ---------- ---------- ---------- ---------- \n\
"

#define MB_LEAK_HDR_64 "\n\
trace [1] trace [2] trace [3] \
trace [4] trace [5] \n\
------------------ ------------------ ------------------ \
------------------ ------------------ \n\
"
578
579 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
580 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
581
582 /* Hashmaps of allocations and their corresponding traces */
583 static struct mallocation *mleak_allocations;
584 static struct mtrace *mleak_traces;
585 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
586
587 /* Lock to protect mleak tables from concurrent modification */
588 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
589 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
590 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
591
592 /* *Failed* large allocations. */
593 struct mtracelarge {
594 uint64_t size;
595 uint64_t depth;
596 uintptr_t addr[MLEAK_STACK_DEPTH];
597 };
598
599 #define MTRACELARGE_NUM_TRACES 5
600 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
601
602 static void mtracelarge_register(size_t size);
603
604 /* Lock to protect the completion callback table */
605 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
606 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
607
608 extern u_int32_t high_sb_max;
609
/*
 * The minimum number of objects that are allocated, to start.  The big and
 * 16KB cluster minimums are derived from MINCL (one half and one quarter).
 */
#define MINCL           32
#define MINBIGCL        (MINCL >> 1)
#define MIN16KCL        (MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define MBIGCL_LOWAT    MINBIGCL
#define M16KCL_LOWAT    MIN16KCL
618
619 typedef struct {
620 mbuf_class_t mtbl_class; /* class type */
621 mcache_t *mtbl_cache; /* mcache for this buffer class */
622 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
623 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
624 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
625 u_int32_t mtbl_maxsize; /* maximum buffer size */
626 int mtbl_minlimit; /* minimum allowed */
627 int mtbl_maxlimit; /* maximum allowed */
628 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
629 uint32_t mtbl_avgtotal; /* average total on iOS */
630 u_int32_t mtbl_expand; /* worker should expand the class */
631 } mbuf_table_t;
632
633 #define m_class(c) mbuf_table[c].mtbl_class
634 #define m_cache(c) mbuf_table[c].mtbl_cache
635 #define m_slablist(c) mbuf_table[c].mtbl_slablist
636 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
637 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
638 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
639 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
640 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
641 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
642 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
643 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
644 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
645 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
646 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
647 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
648 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
649 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
650 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
651 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
652 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
653 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
654 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
655 #define m_region_expand(c) mbuf_table[c].mtbl_expand
656
657 static mbuf_table_t mbuf_table[] = {
658 /*
659 * The caches for mbufs, regular clusters and big clusters.
660 * The average total values were based on data gathered by actual
661 * usage patterns on iOS.
662 */
663 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
664 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
665 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
666 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
667 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
668 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
669 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
670 NULL, NULL, 0, 0, 0, 0, 200, 0 },
671 /*
672 * The following are special caches; they serve as intermediate
673 * caches backed by the above rudimentary caches. Each object
674 * in the cache is an mbuf with a cluster attached to it. Unlike
675 * the above caches, these intermediate caches do not directly
676 * deal with the slab structures; instead, the constructed
677 * cached elements are simply stored in the freelists.
678 */
679 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
680 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
681 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
682 };
683
/*
 * Number of elements in array a.  Only meaningful for true arrays; applied
 * to a pointer it silently yields sizeof(pointer) / sizeof(element).
 */
#define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
685
686 #if SKYWALK
687 #define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
688 static unsigned int mc_threshold_scale_down_factor =
689 MC_THRESHOLD_SCALE_DOWN_FACTOR;
690 #endif /* SKYWALK */
691
692 static uint32_t
m_avgtotal(mbuf_class_t c)693 m_avgtotal(mbuf_class_t c)
694 {
695 #if SKYWALK
696 return if_is_fsw_transport_netagent_enabled() ?
697 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
698 mbuf_table[c].mtbl_avgtotal;
699 #else /* !SKYWALK */
700 return mbuf_table[c].mtbl_avgtotal;
701 #endif /* SKYWALK */
702 }
703
704 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
705 static int mb_waiters; /* number of waiters */
706
707 boolean_t mb_peak_newreport = FALSE;
708 boolean_t mb_peak_firstreport = FALSE;
709
710 /* generate a report by default after 1 week of uptime */
711 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
712
713 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
714 static struct timeval mb_wdtstart; /* watchdog start timestamp */
715 static char *mbuf_dump_buf;
716
717 #define MBUF_DUMP_BUF_SIZE 4096
718
719 /*
720 * mbuf watchdog is enabled by default. It is also toggeable via the
721 * kern.ipc.mb_watchdog sysctl.
722 * Garbage collection is enabled by default on embedded platforms.
723 * mb_drain_maxint controls the amount of time to wait (in seconds) before
724 * consecutive calls to mbuf_drain().
725 */
726 static unsigned int mb_watchdog = 1;
727 #if !XNU_TARGET_OS_OSX
728 static unsigned int mb_drain_maxint = 60;
729 #else /* XNU_TARGET_OS_OSX */
730 static unsigned int mb_drain_maxint = 0;
731 #endif /* XNU_TARGET_OS_OSX */
732 static unsigned int mb_memory_pressure_percentage = 80;
733
734 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
735 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
736
737 /* Red zone */
738 static u_int32_t mb_redzone_cookie;
739 static void m_redzone_init(struct mbuf *);
740 static void m_redzone_verify(struct mbuf *m);
741
742 /* The following are used to serialize m_clalloc() */
743 static boolean_t mb_clalloc_busy;
744 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
745 static int mb_clalloc_waiters;
746
747 static void mbuf_mtypes_sync(boolean_t);
748 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
749 static void mbuf_stat_sync(void);
750 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
751 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
752 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
753 static char *mbuf_dump(void);
754 static void mbuf_table_init(void);
755 static inline void m_incref(struct mbuf *);
756 static inline u_int16_t m_decref(struct mbuf *);
757 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
758 static void mbuf_worker_thread_init(void);
759 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
/*
 * The mbuf_slab_* and mbuf_cslab_* routines, together with mleak_logger()
 * and mbuf_slab_notify(), are the callbacks that mbinit() hands to
 * mcache_create_ext() for each class (the cslab variants are used for the
 * composite MC_MBUF_CL / MC_MBUF_BIGCL / MC_MBUF_16KCL classes).
 */
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

/* Audit helpers (mcl_audit_* prefix). */
static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

/* Leak-detection helpers (mleak_* prefix); mleak_activate() is called from mbinit(). */
static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

/* Slab descriptor helpers (operate on mcl_slab_t). */
static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);
/*
 * Watchdog logging.  On DEBUG or DEVELOPMENT builds mbwdog_logger()
 * forwards to _mbwdog_logger(), prepending the caller's function name and
 * line number; on all other builds it expands to an empty statement, so
 * its arguments are never evaluated there.
 */
#if DEBUG || DEVELOPMENT
#define mbwdog_logger(fmt, ...)  _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
static char *mbwdog_logging;
/* NOTE(review): not static, unlike its neighbours -- confirm the external linkage is intended. */
const unsigned mbwdog_logging_size = 4096;
static size_t mbwdog_logging_used;
#else
#define mbwdog_logger(fmt, ...)  do { } while (0)
#endif
static void mbuf_drain_locked(boolean_t);
827
/*
 * flags for m_copyback0
 *
 * Each value is a distinct single bit, so the flags can be OR-ed together.
 */
#define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
#define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
#define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
#define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
833
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 *
 * (mbinit() also routes MC_MBUF_16KCL through the composite-slab callbacks.)
 */
#define EXTF_COMPOSITE  0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define EXTF_READONLY   0x2
/*
 * This flag indicates that the external cluster is paired with the mbuf.
 * Pairing implies an external free routine defined which will be invoked
 * when the reference count drops to the minimum at m_free time.  This
 * flag is never cleared.
 */
#define EXTF_PAIRED     0x4

/* Union of every EXTF_* flag bit defined above. */
#define EXTF_MASK       \
	(EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)

/*
 * Accessors for the members of the structure returned by m_get_rfa(m).
 * Each expands to the member itself, so it can be read, assigned to (as
 * MEXT_INIT does) or have its address taken (as m_incref/m_decref do).
 */
#define MEXT_MINREF(m)          ((m_get_rfa(m))->minref)
#define MEXT_REF(m)             ((m_get_rfa(m))->refcnt)
#define MEXT_PREF(m)            ((m_get_rfa(m))->prefcnt)
#define MEXT_FLAGS(m)           ((m_get_rfa(m))->flags)
#define MEXT_PRIV(m)            ((m_get_rfa(m))->priv)
#define MEXT_PMBUF(m)           ((m_get_rfa(m))->paired)
#define MEXT_TOKEN(m)           ((m_get_rfa(m))->ext_token)
/*
 * True when the reference count equals the minimum reference count and,
 * of the EXTF_MASK bits, exactly EXTF_COMPOSITE is set.  Unlike
 * MBUF_IS_PAIRED, this macro does not itself test M_EXT.
 */
#define MBUF_IS_COMPOSITE(m)                                            \
	(MEXT_REF(m) == MEXT_MINREF(m) &&                               \
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
/*
 * This macro can be used to test if the mbuf is paired to an external
 * cluster.  The test for MEXT_PMBUF being equal to the mbuf in subject
 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 * and thus survives calls to m_free_paired.
 */
#define MBUF_IS_PAIRED(m)                                               \
	(((m)->m_flags & M_EXT) &&                                      \
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED &&                   \
	MEXT_PMBUF(m) == (m))
884
/*
 * Macros used to verify the integrity of the mbuf.
 *
 * _MCHECK(m) does nothing when the mbuf's type is MT_FREE or when the
 * mbuf is paired (MBUF_IS_PAIRED).  Otherwise it calls panic() directly
 * if mclaudit is NULL, or mcl_audit_mcheck_panic() if mclaudit is set.
 *
 * Both _MCHECK and MRANGE expand to a bare brace block rather than a
 * do { } while (0) statement, so "if (x) MRANGE(p); else ..." does not
 * compile; wrap such uses in braces.
 */
#define _MCHECK(m) {                                                    \
	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) {             \
	        if (mclaudit == NULL)                                   \
	                panic("MCHECK: m_type=%d m=%p",                 \
	                    (u_int16_t)(m)->m_type, m);                 \
	        else                                                    \
	                mcl_audit_mcheck_panic(m);                      \
	}                                                               \
}

/* True when addr lies in the half-open range [mbutl, embutl). */
#define MBUF_IN_MAP(addr)                                               \
	((unsigned char *)(addr) >= mbutl &&                            \
	(unsigned char *)(addr) < embutl)

/* Panic if addr is outside the range tested by MBUF_IN_MAP. */
#define MRANGE(addr) {                                                  \
	if (!MBUF_IN_MAP(addr))                                         \
	        panic("MRANGE: address out of range 0x%p", addr);       \
}
906
/*
 * Macro version of mtod: the mbuf's m_data pointer cast to type t.
 */
#define MTOD(m, t)      ((t)((m)->m_data))

/*
 * Macros to convert between an address and a page index, both relative
 * to mbutl:
 *
 *   MTOPG(x)  byte offset of address x from mbutl, shifted right by
 *             PAGE_SHIFT.
 *   PGTOM(x)  mbutl plus (x shifted left by PAGE_SHIFT).
 *
 * The argument is fully parenthesized so that compound expressions such
 * as "a | b" or "cond ? p : q" are converted as a whole instead of being
 * split apart by the cast or the shift.
 */
#define MTOPG(x)        (((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
#define PGTOM(x)        (mbutl + ((x) << PAGE_SHIFT))

/*
 * Macro to find the mbuf index relative to a base: the byte distance
 * from c to m, shifted right by MSIZESHIFT.
 */
#define MBPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index (shift by MCLSHIFT).
 */
#define CLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base (shift by
 * MBIGCLSHIFT).
 */
#define BCLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
935
/*
 * Macros used during mbuf and cluster initialization.
 *
 * Like _MCHECK, the multi-statement macros below expand to bare brace
 * blocks, not do { } while (0) statements.
 */

/*
 * Reset the packet-header fields of m listed below to NULL/0, then run
 * the classifier, tag, scratch and red-zone initializers on it.
 */
#define MBUF_INIT_PKTHDR(m) {                                           \
	(m)->m_pkthdr.rcvif = NULL;                                     \
	(m)->m_pkthdr.pkt_hdr = NULL;                                   \
	(m)->m_pkthdr.len = 0;                                          \
	(m)->m_pkthdr.csum_flags = 0;                                   \
	(m)->m_pkthdr.csum_data = 0;                                    \
	(m)->m_pkthdr.vlan_tag = 0;                                     \
	(m)->m_pkthdr.comp_gencnt = 0;                                  \
	(m)->m_pkthdr.pkt_crumbs = 0;                                   \
	m_classifier_init(m, 0);                                        \
	m_tag_init(m, 1);                                               \
	m_scratch_init(m);                                              \
	m_redzone_init(m);                                              \
}

/*
 * Initialize mbuf m with the given type after running _MCHECK on it.
 * Clears m_next, m_nextpkt and m_len.  When pkthdr is 0 the data pointer
 * is set to m_dat and the flags are cleared; otherwise the data pointer
 * is set to m_pktdat, the flags become M_PKTHDR and the packet header is
 * initialized with MBUF_INIT_PKTHDR.
 */
#define MBUF_INIT(m, pkthdr, type) {                                    \
	_MCHECK(m);                                                     \
	(m)->m_next = (m)->m_nextpkt = NULL;                            \
	(m)->m_len = 0;                                                 \
	(m)->m_type = type;                                             \
	if ((pkthdr) == 0) {                                            \
	        (m)->m_data = (m)->m_dat;                               \
	        (m)->m_flags = 0;                                       \
	} else {                                                        \
	        (m)->m_data = (m)->m_pktdat;                            \
	        (m)->m_flags = M_PKTHDR;                                \
	        MBUF_INIT_PKTHDR(m);                                    \
	}                                                               \
}

/*
 * Attach external storage to mbuf m.
 *
 *   buf              stored in both m_data and m_ext.ext_buf
 *   size             stored in m_ext.ext_size (cast to u_int)
 *   free, arg, rfa   passed to m_set_ext()
 *   min, ref, pref   stored via MEXT_MINREF, MEXT_REF, MEXT_PREF
 *   flag, priv, pm   stored via MEXT_FLAGS, MEXT_PRIV, MEXT_PMBUF
 *
 * M_EXT is OR-ed into m_flags, so flags already set on m are preserved.
 * The MEXT_* stores come after m_set_ext() because they go through
 * m_get_rfa(m).
 */
#define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag,   \
	    priv, pm) {                                                 \
	(m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
	(m)->m_flags |= M_EXT;                                          \
	m_set_ext((m), (rfa), (free), (arg));                           \
	(m)->m_ext.ext_size = (u_int)(size);                            \
	MEXT_MINREF(m) = (min);                                         \
	MEXT_REF(m) = (ref);                                            \
	MEXT_PREF(m) = (pref);                                          \
	MEXT_FLAGS(m) = (flag);                                         \
	MEXT_PRIV(m) = (priv);                                          \
	MEXT_PMBUF(m) = (pm);                                           \
}

/*
 * Per-class wrappers around MEXT_INIT.  All of them pass a NULL free
 * argument, a minimum reference count of 0, a prefcnt of 0, a priv of 0
 * and no paired mbuf; they differ only in the size (m_maxsize of the
 * class) and in the free routine: NULL for 2KB clusters, m_bigfree for
 * 4KB clusters and m_16kfree for 16KB clusters.  mbuf_get_class() relies
 * on that free routine to identify the class.
 */
#define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0,         \
	    ref, 0, flag, 0, NULL)

#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
	    ref, 0, flag, 0, NULL)

#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
	    ref, 0, flag, 0, NULL)
994
/*
 * Translate a BSD malloc wait flag into the mcache equivalent:
 * MCR_NOSLEEP when M_DONTWAIT is set in f, MCR_SLEEP otherwise.
 */
#define MSLEEPF(f)      (((f) & M_DONTWAIT) ? MCR_NOSLEEP : MCR_SLEEP)
999
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 *
 * Both pointers are allocated in mbuf_table_init().  omb_stat is the
 * layout that mb_stat_sysctl() fills in and returns to processes that are
 * not 64-bit.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;      /* For backwards compatibility */

/*
 * Size in bytes of a stat structure holding n class entries, computed as
 * the offset of element n of the trailing mbs_class array.
 */
#define MB_STAT_SIZE(n) \
	__builtin_offsetof(mb_stat_t, mbs_class[n])
#define OMB_STAT_SIZE(n) \
	__builtin_offsetof(struct omb_stat, mbs_class[n])

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

/* Number of elements in the mbstat.m_mtypes array. */
#define MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1029
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater
 * than or equal to MT_MAX would be done atomically to the mbstat; this
 * slows down performance but is okay since the kernel uses only up to
 * MT_MAX-1 while anything beyond that (up to type 255) is considered a
 * corner case.
 */
typedef struct {
	unsigned int cpu_mtypes[MT_MAX];
} mbuf_mtypes_t;

static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);

/*
 * Add n to the counter for the given mbuf type.
 *
 *   type < MT_MAX                       current CPU's cpu_mtypes[type],
 *                                       via atomic_add_32
 *   MT_MAX <= type < MBSTAT_MTYPES_MAX  mbstat.m_mtypes[type] directly,
 *                                       via atomic_add_16
 *   anything else                       silently ignored
 *
 * type is compared as unsigned, so a negative value falls into the last
 * case.  The macro expands to a brace block that declares a local named
 * mbs, and it evaluates type more than once.
 */
#define mtype_stat_add(type, n) {                                       \
	if ((unsigned)(type) < MT_MAX) {                                \
	        mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes);           \
	        atomic_add_32(&mbs->cpu_mtypes[type], n);               \
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
	        atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
	}                                                               \
}

/* Convenience forms: subtract n, add one, subtract one. */
#define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
#define mtype_stat_inc(t)       mtype_stat_add(t, 1)
#define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
1059
1060 static void
mbuf_mtypes_sync(boolean_t locked)1061 mbuf_mtypes_sync(boolean_t locked)
1062 {
1063 mbuf_mtypes_t mtc;
1064
1065 if (locked) {
1066 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1067 }
1068
1069 mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
1070 percpu_foreach_secondary(mtype, mbuf_mtypes) {
1071 for (int n = 0; n < MT_MAX; n++) {
1072 mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
1073 }
1074 }
1075
1076 if (!locked) {
1077 lck_mtx_lock(mbuf_mlock);
1078 }
1079 for (int n = 0; n < MT_MAX; n++) {
1080 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1081 }
1082 if (!locked) {
1083 lck_mtx_unlock(mbuf_mlock);
1084 }
1085 }
1086
1087 static int
1088 mbstat_sysctl SYSCTL_HANDLER_ARGS
1089 {
1090 #pragma unused(oidp, arg1, arg2)
1091 mbuf_mtypes_sync(FALSE);
1092
1093 return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
1094 }
1095
1096 static void
mbuf_stat_sync(void)1097 mbuf_stat_sync(void)
1098 {
1099 mb_class_stat_t *sp;
1100 mcache_cpu_t *ccp;
1101 mcache_t *cp;
1102 int k, m, bktsize;
1103
1104 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1105
1106 for (k = 0; k < NELEM(mbuf_table); k++) {
1107 cp = m_cache(k);
1108 ccp = &cp->mc_cpu[0];
1109 bktsize = ccp->cc_bktsize;
1110 sp = mbuf_table[k].mtbl_stats;
1111
1112 if (cp->mc_flags & MCF_NOCPUCACHE) {
1113 sp->mbcl_mc_state = MCS_DISABLED;
1114 } else if (cp->mc_purge_cnt > 0) {
1115 sp->mbcl_mc_state = MCS_PURGING;
1116 } else if (bktsize == 0) {
1117 sp->mbcl_mc_state = MCS_OFFLINE;
1118 } else {
1119 sp->mbcl_mc_state = MCS_ONLINE;
1120 }
1121
1122 sp->mbcl_mc_cached = 0;
1123 for (m = 0; m < ncpu; m++) {
1124 ccp = &cp->mc_cpu[m];
1125 if (ccp->cc_objs > 0) {
1126 sp->mbcl_mc_cached += ccp->cc_objs;
1127 }
1128 if (ccp->cc_pobjs > 0) {
1129 sp->mbcl_mc_cached += ccp->cc_pobjs;
1130 }
1131 }
1132 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1133 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1134 sp->mbcl_infree;
1135
1136 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1137 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1138 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1139
1140 /* Calculate total count specific to each class */
1141 sp->mbcl_ctotal = sp->mbcl_total;
1142 switch (m_class(k)) {
1143 case MC_MBUF:
1144 /* Deduct mbufs used in composite caches */
1145 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1146 m_total(MC_MBUF_BIGCL));
1147 break;
1148
1149 case MC_CL:
1150 /* Deduct clusters used in composite cache */
1151 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1152 break;
1153
1154 case MC_BIGCL:
1155 /* Deduct clusters used in composite cache */
1156 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1157 break;
1158
1159 case MC_16KCL:
1160 /* Deduct clusters used in composite cache */
1161 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1162 break;
1163
1164 default:
1165 break;
1166 }
1167 }
1168 }
1169
1170 static int
1171 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1172 {
1173 #pragma unused(oidp, arg1, arg2)
1174 void *statp;
1175 int k, statsz, proc64 = proc_is64bit(req->p);
1176
1177 lck_mtx_lock(mbuf_mlock);
1178 mbuf_stat_sync();
1179
1180 if (!proc64) {
1181 struct omb_class_stat *oc;
1182 struct mb_class_stat *c;
1183
1184 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1185 oc = &omb_stat->mbs_class[0];
1186 c = &mb_stat->mbs_class[0];
1187 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1188 (void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
1189 "%s", c->mbcl_cname);
1190 oc->mbcl_size = c->mbcl_size;
1191 oc->mbcl_total = c->mbcl_total;
1192 oc->mbcl_active = c->mbcl_active;
1193 oc->mbcl_infree = c->mbcl_infree;
1194 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1195 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1196 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1197 oc->mbcl_notified = c->mbcl_notified;
1198 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1199 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1200 oc->mbcl_ctotal = c->mbcl_ctotal;
1201 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1202 oc->mbcl_mc_state = c->mbcl_mc_state;
1203 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1204 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1205 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1206 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1207 oc->mbcl_peak_reported = c->mbcl_peak_reported;
1208 }
1209 statp = omb_stat;
1210 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1211 } else {
1212 statp = mb_stat;
1213 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1214 }
1215
1216 lck_mtx_unlock(mbuf_mlock);
1217
1218 return SYSCTL_OUT(req, statp, statsz);
1219 }
1220
1221 static int
1222 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1223 {
1224 #pragma unused(oidp, arg1, arg2)
1225 int i;
1226
1227 /* Ensure leak tracing turned on */
1228 if (!mclfindleak || !mclexpleak) {
1229 return ENXIO;
1230 }
1231
1232 lck_mtx_lock(mleak_lock);
1233 mleak_update_stats();
1234 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1235 lck_mtx_unlock(mleak_lock);
1236
1237 return i;
1238 }
1239
1240 static int
1241 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1242 {
1243 #pragma unused(oidp, arg1, arg2)
1244 int i = 0;
1245
1246 /* Ensure leak tracing turned on */
1247 if (!mclfindleak || !mclexpleak) {
1248 return ENXIO;
1249 }
1250
1251 lck_mtx_lock(mleak_lock);
1252 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
1253 lck_mtx_unlock(mleak_lock);
1254
1255 return i;
1256 }
1257
/*
 * Atomically increment the external reference count of mbuf m.
 *
 * The increment is a compare-and-swap loop on the 16-bit count; VERIFY
 * fails if the new value would wrap around to 0.  Afterwards, if the new
 * count exceeds the minimum reference count by more than one and
 * EXTF_READONLY is not already set, the flag is atomically OR-ed in.
 */
static inline void
m_incref(struct mbuf *m)
{
	UInt16 old, new;
	volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);

	/* Retry until no one else changed the count between read and swap. */
	do {
		old = *addr;
		new = old + 1;
		VERIFY(new != 0);
	} while (!OSCompareAndSwap16(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		(void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
	}
}
1279
/*
 * Atomically decrement the external reference count of mbuf m and return
 * the new count.
 *
 * The decrement is a compare-and-swap loop on the 16-bit count; VERIFY
 * fails if the count was already 0, i.e. on underflow.
 */
static inline u_int16_t
m_decref(struct mbuf *m)
{
	UInt16 old, new;
	volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);

	/* Retry until no one else changed the count between read and swap. */
	do {
		old = *addr;
		new = old - 1;
		VERIFY(old != 0);
	} while (!OSCompareAndSwap16(old, new, addr));

	return new;
}
1294
/*
 * Set up the per-class table and the statistics structures.
 *
 * Allocates omb_stat and mb_stat, points every mbuf_table entry at its
 * slot in mb_stat, splits nmbclusters into the shared 2KB/4KB pool
 * (nclusters) and the jumbo pool (njcl), fills in the limits, sizes and
 * name of each class, and finally initializes the legacy mbstat structure.
 *
 * Reads nmbclusters, which mbinit() verifies to be non-zero and even
 * before calling this routine.
 */
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m, config_mbuf_jumbo = 0;

	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

#if CONFIG_MBUF_JUMBO
	config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

	if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
		/*
		 * Set aside 1/3 of the mbuf cluster map for jumbo
		 * clusters; we do this only on platforms where jumbo
		 * cluster pool is enabled.
		 */
		njcl = nmbclusters / 3;
		njclbytes = M16KCLBYTES;
	}

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size.  On these platforms, the remaining is used for 2KB
	 * and 4KB clusters.  On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
	 * cluster can either be split into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);               /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));               /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.  Each composite class
	 * inherits the maximum limit of its cluster class; its m_size is
	 * the sum of the mbuf and cluster sizes, while its m_maxsize is
	 * that of the cluster alone.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.  When njcl is 0 the maximum limits below
	 * are 0 as well.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1428
1429 int
mbuf_get_class(struct mbuf * m)1430 mbuf_get_class(struct mbuf *m)
1431 {
1432 if (m->m_flags & M_EXT) {
1433 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
1434 m_ext_free_func_t m_free_func = m_get_ext_free(m);
1435
1436 if (m_free_func == NULL) {
1437 if (composite) {
1438 return MC_MBUF_CL;
1439 } else {
1440 return MC_CL;
1441 }
1442 } else if (m_free_func == m_bigfree) {
1443 if (composite) {
1444 return MC_MBUF_BIGCL;
1445 } else {
1446 return MC_BIGCL;
1447 }
1448 } else if (m_free_func == m_16kfree) {
1449 if (composite) {
1450 return MC_MBUF_16KCL;
1451 } else {
1452 return MC_16KCL;
1453 }
1454 }
1455 }
1456
1457 return MC_MBUF;
1458 }
1459
1460 bool
mbuf_class_under_pressure(struct mbuf * m)1461 mbuf_class_under_pressure(struct mbuf *m)
1462 {
1463 int mclass = mbuf_get_class(m);
1464
1465 if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
1466 /*
1467 * The above computation does not include the per-CPU cached objects.
1468 * As a fast-path check this is good-enough. But now we do
1469 * the "slower" count of the cached objects to know exactly the
1470 * number of active mbufs in use.
1471 *
1472 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
1473 * might be slightly off but we don't try to be 100% accurate.
1474 * At worst, we drop a packet that we shouldn't have dropped or
1475 * we might go slightly above our memory-pressure threshold.
1476 */
1477 mcache_t *cp = m_cache(mclass);
1478 mcache_cpu_t *ccp = &cp->mc_cpu[0];
1479
1480 int bktsize = os_access_once(ccp->cc_bktsize);
1481 uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
1482 uint32_t cached = 0;
1483 int i;
1484
1485 for (i = 0; i < ncpu; i++) {
1486 ccp = &cp->mc_cpu[i];
1487
1488 int cc_objs = os_access_once(ccp->cc_objs);
1489 if (cc_objs > 0) {
1490 cached += cc_objs;
1491 }
1492
1493 int cc_pobjs = os_access_once(ccp->cc_pobjs);
1494 if (cc_pobjs > 0) {
1495 cached += cc_pobjs;
1496 }
1497 }
1498 cached += (bl_total * bktsize);
1499
1500 if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
1501 os_log(OS_LOG_DEFAULT,
1502 "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
1503 __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
1504 return true;
1505 }
1506 }
1507
1508 return false;
1509 }
1510
#if defined(__LP64__)
/*
 * One row of the table that maps physical memory size to the default
 * mbuf pool size on 64-bit kernels.
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

/*
 * Rows are listed in ascending nt_maxmem order and terminated by an
 * all-zero entry.  mbuf_default_ncl() uses the nt_mbpool of the last row
 * whose nt_maxmem does not exceed the memory size, and falls back to the
 * first row when the memory size is below every threshold.
 */
static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /*  1 GB */, (64 << MBSHIFT) /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (96 << MBSHIFT) /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
1526
1527 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)1528 mbuf_default_ncl(uint64_t mem)
1529 {
1530 #if !defined(__LP64__)
1531 unsigned int n;
1532 /*
1533 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1534 */
1535 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
1536 n = 32768;
1537 }
1538 #else
1539 unsigned int n, i;
1540 /*
1541 * 64-bit kernel (mbuf pool size based on table).
1542 */
1543 n = ncl_table[0].nt_mbpool;
1544 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
1545 if (mem < ncl_table[i].nt_maxmem) {
1546 break;
1547 }
1548 n = ncl_table[i].nt_mbpool;
1549 }
1550 n >>= MCLSHIFT;
1551 #endif /* !__LP64__ */
1552 return n;
1553 }
1554
1555 __private_extern__ void
mbinit(void)1556 mbinit(void)
1557 {
1558 unsigned int m;
1559 unsigned int initmcl = 0;
1560 thread_t thread = THREAD_NULL;
1561
1562 microuptime(&mb_start);
1563
1564 /*
1565 * These MBUF_ values must be equal to their private counterparts.
1566 */
1567 _CASSERT(MBUF_EXT == M_EXT);
1568 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1569 _CASSERT(MBUF_EOR == M_EOR);
1570 _CASSERT(MBUF_LOOP == M_LOOP);
1571 _CASSERT(MBUF_BCAST == M_BCAST);
1572 _CASSERT(MBUF_MCAST == M_MCAST);
1573 _CASSERT(MBUF_FRAG == M_FRAG);
1574 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1575 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1576 _CASSERT(MBUF_PROMISC == M_PROMISC);
1577 _CASSERT(MBUF_HASFCS == M_HASFCS);
1578
1579 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1580 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1581 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1582 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1583 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1584 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1585 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1586 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1587 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1588 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1589 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1590 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1591 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1592 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1593 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1594
1595 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1596 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1597 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1598 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1599 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1600 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1601 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1602 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1603 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1604 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1605 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1606 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1607 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1608 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1609
1610 _CASSERT(MBUF_WAITOK == M_WAIT);
1611 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1612 _CASSERT(MBUF_COPYALL == M_COPYALL);
1613
1614 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1615 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1616 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1617 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1618 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1619 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1620 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1621 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1622 _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
1623 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1624 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1625
1626 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1627 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1628 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1629 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1630
1631 /* Module specific scratch space (32-bit alignment requirement) */
1632 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1633 sizeof(uint32_t)));
1634
1635 /* pktdata needs to start at 128-bit offset! */
1636 _CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
1637
1638 /* Initialize random red zone cookie value */
1639 _CASSERT(sizeof(mb_redzone_cookie) ==
1640 sizeof(((struct pkthdr *)0)->redzone));
1641 read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
1642 read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
1643 read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
1644 mb_obscure_extref |= 0x3;
1645 mb_obscure_extfree |= 0x3;
1646
1647 /* Make sure we don't save more than we should */
1648 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
1649
1650 if (nmbclusters == 0) {
1651 nmbclusters = NMBCLUSTERS;
1652 }
1653
1654 /* This should be a sane (at least even) value by now */
1655 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1656
1657 /* Setup the mbuf table */
1658 mbuf_table_init();
1659
1660 /*
1661 * Allocate cluster slabs table:
1662 *
1663 * maxslabgrp = (N * 2048) / (1024 * 1024)
1664 *
1665 * Where N is nmbclusters rounded up to the nearest 512. This yields
1666 * mcl_slab_g_t units, each one representing a MB of memory.
1667 */
1668 maxslabgrp =
1669 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1670 slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
1671 ZALIGN(mcl_slabg_t));
1672
1673 /*
1674 * Allocate audit structures, if needed:
1675 *
1676 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1677 *
1678 * This yields mcl_audit_t units, each one representing a page.
1679 */
1680 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
1681 mbuf_debug |= mcache_getflags();
1682 if (mbuf_debug & MCF_DEBUG) {
1683 int l;
1684 mcl_audit_t *mclad;
1685 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1686 mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
1687 ZALIGN(mcl_audit_t));
1688 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1689 mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
1690 ZALIGN_PTR);
1691 }
1692
1693 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1694 AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
1695 VERIFY(mcl_audit_con_cache != NULL);
1696 }
1697 mclverify = (mbuf_debug & MCF_VERIFY);
1698 mcltrace = (mbuf_debug & MCF_TRACE);
1699 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1700 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1701
1702 /* Enable mbuf leak logging, with a lock to protect the tables */
1703
1704 mleak_activate();
1705
1706 /*
1707 * Allocate structure for per-CPU statistics that's aligned
1708 * on the CPU cache boundary; this code assumes that we never
1709 * uninitialize this framework, since the original address
1710 * before alignment is not saved.
1711 */
1712 ncpu = ml_wait_max_cpus();
1713
1714 /* Calculate the number of pages assigned to the cluster pool */
1715 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1716 mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
1717 ZALIGN(ppnum_t));
1718
1719 /* Register with the I/O Bus mapper */
1720 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1721
1722 embutl = (mbutl + (nmbclusters * MCLBYTES));
1723 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1724
1725 /* Prime up the freelist */
1726 PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
1727 if (initmcl != 0) {
1728 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1729 if (initmcl > m_maxlimit(MC_BIGCL)) {
1730 initmcl = m_maxlimit(MC_BIGCL);
1731 }
1732 }
1733 if (initmcl < m_minlimit(MC_BIGCL)) {
1734 initmcl = m_minlimit(MC_BIGCL);
1735 }
1736
1737 lck_mtx_lock(mbuf_mlock);
1738
1739 /*
1740 * For classes with non-zero minimum limits, populate their freelists
1741 * so that m_total(class) is at least m_minlimit(class).
1742 */
1743 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1744 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1745 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1746 freelist_init(m_class(MC_CL));
1747
1748 for (m = 0; m < NELEM(mbuf_table); m++) {
1749 /* Make sure we didn't miss any */
1750 VERIFY(m_minlimit(m_class(m)) == 0 ||
1751 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1752
1753 /* populate the initial sizes and report from there on */
1754 m_peak(m_class(m)) = m_total(m_class(m));
1755 }
1756 mb_peak_newreport = FALSE;
1757
1758 lck_mtx_unlock(mbuf_mlock);
1759
1760 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1761 NULL, &thread);
1762 thread_deallocate(thread);
1763
1764 ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
1765 0, 0, MCR_SLEEP);
1766
1767 /* Create the cache for each class */
1768 for (m = 0; m < NELEM(mbuf_table); m++) {
1769 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1770 u_int32_t flags;
1771
1772 flags = mbuf_debug;
1773 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1774 m_class(m) == MC_MBUF_16KCL) {
1775 allocfunc = mbuf_cslab_alloc;
1776 freefunc = mbuf_cslab_free;
1777 auditfunc = mbuf_cslab_audit;
1778 logfunc = mleak_logger;
1779 } else {
1780 allocfunc = mbuf_slab_alloc;
1781 freefunc = mbuf_slab_free;
1782 auditfunc = mbuf_slab_audit;
1783 logfunc = mleak_logger;
1784 }
1785
1786 /*
1787 * Disable per-CPU caches for jumbo classes if there
1788 * is no jumbo cluster pool available in the system.
1789 * The cache itself is still created (but will never
1790 * be populated) since it simplifies the code.
1791 */
1792 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1793 njcl == 0) {
1794 flags |= MCF_NOCPUCACHE;
1795 }
1796
1797 if (!mclfindleak) {
1798 flags |= MCF_NOLEAKLOG;
1799 }
1800
1801 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1802 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1803 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1804 }
1805
1806 /*
1807 * Set the max limit on sb_max to be 1/16 th of the size of
1808 * memory allocated for mbuf clusters.
1809 */
1810 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1811 if (high_sb_max < sb_max) {
1812 /* sb_max is too large for this configuration, scale it down */
1813 if (high_sb_max > (1 << MBSHIFT)) {
1814 /* We have atleast 16 M of mbuf pool */
1815 sb_max = high_sb_max;
1816 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1817 /*
1818 * If we have more than 1M of mbufpool, cap the size of
1819 * max sock buf at 1M
1820 */
1821 sb_max = high_sb_max = (1 << MBSHIFT);
1822 } else {
1823 sb_max = high_sb_max;
1824 }
1825 }
1826
1827 /* allocate space for mbuf_dump_buf */
1828 mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
1829
1830 if (mbuf_debug & MCF_DEBUG) {
1831 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1832 (int)_MLEN, (int)_MHLEN);
1833 }
1834
1835 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1836 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1837 (nclusters << MCLSHIFT) >> MBSHIFT,
1838 (njcl << MCLSHIFT) >> MBSHIFT);
1839 }
1840
/*
 * Obtain a slab of object(s) from the class's freelist.
 *
 * Pops one buffer off the free list of a slab belonging to the given
 * rudimentary class, takes a reference on that slab, updates the class's
 * free counters and, once the slab has no free buffer left, removes the
 * slab from the class's slab list.
 *
 * Parameters:
 *	class	MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL (anything that is not
 *		one of the three cluster classes is verified to be MC_MBUF).
 *	wait	mcache request flags; only MCR_COMP is examined here, to
 *		decide which end of the slab list to allocate from.
 *
 * Returns the buffer, or NULL when the class's slab list is empty.
 *
 * The caller must hold mbuf_mlock.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
	    && (wait & MCR_COMP)) {
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	} else {
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	}

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return NULL;
	}

	/*
	 * A slab on the list must be attached, belong to this class, be
	 * mapped and not be a "partial" slab, and the class must account
	 * for at least one free buffer.
	 */
	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	/* Take the first free buffer of the slab */
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	sp->sl_head = buf->obj_next;
	/* Increment slab reference */
	sp->sl_refcnt++;

	/* An exhausted free list implies every chunk is now referenced */
	VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);

	/* The next free buffer, if any, must also live inside this slab */
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* When auditing, reset the per-buffer audit flags for the new owner */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF) {
			mca->mca_uflags |= MB_SCVALID;
		}
	}

	/* Per-class accounting and invariant checks */
	if (class == MC_CL) {
		/*
		 * The reported free count also includes the free objects
		 * of the corresponding composite class.
		 */
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		/* As above, composite free objects are included in the stat */
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have NBCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
		    sp->sl_len == PAGE_SIZE &&
		    (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL) {
			_MCHECK((struct mbuf *)buf);
		}
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
		VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
		slab_remove(sp, class);
	}

	return buf;
}
1976
/*
 * Place a slab of object(s) back into a class's slab list.
 *
 * Returns one buffer to the slab it came from and drops one reference on
 * that slab.  When the last reference on a slab that had been split into
 * smaller objects goes away, and the limits of both the class and its
 * "super class" allow it, the slab is re-initialized as a single
 * PAGE_SIZE cluster of the super class (MC_BIGCL or MC_16KCL, whichever
 * has a maximum object size equal to PAGE_SIZE).
 *
 * Parameters:
 *	class	MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL.
 *	buf	the buffer being freed; its obj_next must be NULL.
 *
 * The caller must hold mbuf_mlock.  This routine may block in msleep()
 * for as long as mb_clalloc_busy is set.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;
	boolean_t reinit_supercl = false;
	mbuf_class_t super_class;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);

	/*
	 * Synchronizing with m_clalloc, as it reads m_total, while we here
	 * are modifying m_total.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;

	/* Locate the owning slab and check it matches the class */
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A slab that has been split for 2KB clusters can have
		 * at most NCLPG - 1 outstanding references at this point.
		 */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		/*
		 * If this was the first buffer returned to the slab, the
		 * slab must have been fully allocated: off the slab list
		 * and with an empty free list.
		 */
		VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));

		/* A 4KB cluster slab can have NBCLPG references at most */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
		VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		/* Drop the reference held on each of the trailing slabs */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * A slab that has been split for mbufs has at most
		 * NMBPG reference counts.  Since we have decremented
		 * one reference above, it must now be between 0 and
		 * NMBPG-1.
		 */
		VERIFY(class == MC_MBUF);
		VERIFY(sp->sl_refcnt >= 0 &&
		    sp->sl_refcnt <= (NMBPG - 1) &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		if (mclverify) {
			mcache_audit_free_verify(mca, buf, 0,
			    m_maxsize(class));
		}
		mca->mca_uflags &= ~MB_SCVALID;
	}

	/*
	 * Account for the freed buffer and push it onto the head of the
	 * slab's free list.  For MC_16KCL the slab's free list was verified
	 * to be empty above and buf->obj_next was verified to be NULL at
	 * entry, so there is nothing to chain.
	 */
	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/*
	 * If a slab has been split to either one which holds 2KB clusters,
	 * or one which holds mbufs, turn it back to one which holds a
	 * 4 or 16 KB cluster depending on the page size.
	 */
	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
		super_class = MC_BIGCL;
	} else {
		VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
		super_class = MC_16KCL;
	}
	/*
	 * Each branch below applies to a fully free slab (sl_refcnt == 0),
	 * and only when giving the page back keeps the class at or above
	 * its minimum and the super class is still below its maximum.
	 */
	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NMBPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NMBPG;

		m_total(MC_MBUF) -= NMBPG;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPG;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPG));

		/* Unlink all NMBPG mbufs from the slab's free list */
		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NCLPG;

		m_total(MC_CL) -= NCLPG;
		mbstat.m_clusters = m_total(MC_CL);
		m_infree(MC_CL) -= NCLPG;

		/* Unlink all NCLPG clusters from the slab's free list */
		while (i--) {
			union mcluster *c = sp->sl_head;
			VERIFY(c != NULL);
			sp->sl_head = c->mcl_next;
			c->mcl_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
	    sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NBCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NBCLPG;

		VERIFY(super_class == MC_16KCL);
		m_total(MC_BIGCL) -= NBCLPG;
		mbstat.m_bigclusters = m_total(MC_BIGCL);
		m_infree(MC_BIGCL) -= NBCLPG;

		/* Unlink all NBCLPG clusters from the slab's free list */
		while (i--) {
			union mbigcluster *bc = sp->sl_head;
			VERIFY(bc != NULL);
			sp->sl_head = bc->mbc_next;
			bc->mbc_next = NULL;
		}
		reinit_supercl = true;
	}

	if (reinit_supercl) {
		VERIFY(sp->sl_head == NULL);
		VERIFY(m_total(class) >= m_minlimit(class));
		/* Take the slab off the list of the class it used to serve */
		slab_remove(sp, class);

		/* Reinitialize it as a cluster for the super class */
		m_total(super_class)++;
		m_infree(super_class)++;
		VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
		    sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);

		slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
		    sp->sl_base, PAGE_SIZE, 0, 1);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_base, sp->sl_len);
		}
		/* The whole page is now the single free object of the slab */
		((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
		}

		VERIFY(slab_is_detached(sp));
		VERIFY(m_total(super_class) <= m_maxlimit(super_class));

		/* And finally switch class */
		class = super_class;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp)) {
		slab_insert(sp, class);
	}

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
}
2206
2207 /*
2208 * Common allocator for rudimentary objects called by the CPU cache layer
2209 * during an allocation request whenever there is no available element in the
2210 * bucket layer. It returns one or more elements from the appropriate global
2211 * freelist. If the freelist is empty, it will attempt to populate it and
2212 * retry the allocation.
2213 */
2214 static unsigned int
mbuf_slab_alloc(void * arg,mcache_obj_t *** plist,unsigned int num,int wait)2215 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2216 {
2217 mbuf_class_t class = (mbuf_class_t)arg;
2218 unsigned int need = num;
2219 mcache_obj_t **list = *plist;
2220
2221 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2222 ASSERT(need > 0);
2223
2224 lck_mtx_lock(mbuf_mlock);
2225
2226 for (;;) {
2227 if ((*list = slab_alloc(class, wait)) != NULL) {
2228 (*list)->obj_next = NULL;
2229 list = *plist = &(*list)->obj_next;
2230
2231 if (--need == 0) {
2232 /*
2233 * If the number of elements in freelist has
2234 * dropped below low watermark, asynchronously
2235 * populate the freelist now rather than doing
2236 * it later when we run out of elements.
2237 */
2238 if (!mbuf_cached_above(class, wait) &&
2239 m_infree(class) < (m_total(class) >> 5)) {
2240 (void) freelist_populate(class, 1,
2241 M_DONTWAIT);
2242 }
2243 break;
2244 }
2245 } else {
2246 VERIFY(m_infree(class) == 0 || class == MC_CL);
2247
2248 (void) freelist_populate(class, 1,
2249 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2250
2251 if (m_infree(class) > 0) {
2252 continue;
2253 }
2254
2255 /* Check if there's anything at the cache layer */
2256 if (mbuf_cached_above(class, wait)) {
2257 break;
2258 }
2259
2260 /* watchdog checkpoint */
2261 mbuf_watchdog();
2262
2263 /* We have nothing and cannot block; give up */
2264 if (wait & MCR_NOSLEEP) {
2265 if (!(wait & MCR_TRYHARD)) {
2266 m_fail_cnt(class)++;
2267 mbstat.m_drops++;
2268 break;
2269 }
2270 }
2271
2272 /*
2273 * If the freelist is still empty and the caller is
2274 * willing to be blocked, sleep on the wait channel
2275 * until an element is available. Otherwise, if
2276 * MCR_TRYHARD is set, do our best to satisfy the
2277 * request without having to go to sleep.
2278 */
2279 if (mbuf_worker_ready &&
2280 mbuf_sleep(class, need, wait)) {
2281 break;
2282 }
2283
2284 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2285 }
2286 }
2287
2288 m_alloc_cnt(class) += num - need;
2289 lck_mtx_unlock(mbuf_mlock);
2290
2291 return num - need;
2292 }
2293
2294 /*
2295 * Common de-allocator for rudimentary objects called by the CPU cache
2296 * layer when one or more elements need to be returned to the appropriate
2297 * global freelist.
2298 */
2299 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)2300 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2301 {
2302 mbuf_class_t class = (mbuf_class_t)arg;
2303 mcache_obj_t *nlist;
2304 unsigned int num = 0;
2305 int w;
2306
2307 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2308
2309 lck_mtx_lock(mbuf_mlock);
2310
2311 for (;;) {
2312 nlist = list->obj_next;
2313 list->obj_next = NULL;
2314 slab_free(class, list);
2315 ++num;
2316 if ((list = nlist) == NULL) {
2317 break;
2318 }
2319 }
2320 m_free_cnt(class) += num;
2321
2322 if ((w = mb_waiters) > 0) {
2323 mb_waiters = 0;
2324 }
2325 if (w) {
2326 mbwdog_logger("waking up all threads");
2327 }
2328 lck_mtx_unlock(mbuf_mlock);
2329
2330 if (w != 0) {
2331 wakeup(mb_waitchan);
2332 }
2333 }
2334
2335 /*
2336 * Common auditor for rudimentary objects called by the CPU cache layer
2337 * during an allocation or free request. For the former, this is called
2338 * after the objects are obtained from either the bucket or slab layer
2339 * and before they are returned to the caller. For the latter, this is
2340 * called immediately during free and before placing the objects into
2341 * the bucket or slab layer.
2342 */
2343 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)2344 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2345 {
2346 mbuf_class_t class = (mbuf_class_t)arg;
2347 mcache_audit_t *mca;
2348
2349 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2350
2351 while (list != NULL) {
2352 lck_mtx_lock(mbuf_mlock);
2353 mca = mcl_audit_buf2mca(class, list);
2354
2355 /* Do the sanity checks */
2356 if (class == MC_MBUF) {
2357 mcl_audit_mbuf(mca, list, FALSE, alloc);
2358 ASSERT(mca->mca_uflags & MB_SCVALID);
2359 } else {
2360 mcl_audit_cluster(mca, list, m_maxsize(class),
2361 alloc, TRUE);
2362 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2363 }
2364 /* Record this transaction */
2365 if (mcltrace) {
2366 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2367 }
2368
2369 if (alloc) {
2370 mca->mca_uflags |= MB_INUSE;
2371 } else {
2372 mca->mca_uflags &= ~MB_INUSE;
2373 }
2374 /* Unpair the object (unconditionally) */
2375 mca->mca_uptr = NULL;
2376 lck_mtx_unlock(mbuf_mlock);
2377
2378 list = list->obj_next;
2379 }
2380 }
2381
2382 /*
2383 * Common notify routine for all caches. It is called by mcache when
2384 * one or more objects get freed. We use this indication to trigger
2385 * the wakeup of any sleeping threads so that they can retry their
2386 * allocation requests.
2387 */
2388 static void
mbuf_slab_notify(void * arg,u_int32_t reason)2389 mbuf_slab_notify(void *arg, u_int32_t reason)
2390 {
2391 mbuf_class_t class = (mbuf_class_t)arg;
2392 int w;
2393
2394 ASSERT(MBUF_CLASS_VALID(class));
2395
2396 if (reason != MCN_RETRYALLOC) {
2397 return;
2398 }
2399
2400 lck_mtx_lock(mbuf_mlock);
2401 if ((w = mb_waiters) > 0) {
2402 m_notified(class)++;
2403 mb_waiters = 0;
2404 }
2405 if (w) {
2406 mbwdog_logger("waking up all threads");
2407 }
2408 lck_mtx_unlock(mbuf_mlock);
2409
2410 if (w != 0) {
2411 wakeup(mb_waitchan);
2412 }
2413 }
2414
2415 /*
2416 * Obtain object(s) from the composite class's freelist.
2417 */
2418 static unsigned int
cslab_alloc(mbuf_class_t class,mcache_obj_t *** plist,unsigned int num)2419 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2420 {
2421 unsigned int need = num;
2422 mcl_slab_t *sp, *clsp, *nsp;
2423 struct mbuf *m;
2424 mcache_obj_t **list = *plist;
2425 void *cl;
2426
2427 VERIFY(need > 0);
2428 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2429 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2430
2431 /* Get what we can from the freelist */
2432 while ((*list = m_cobjlist(class)) != NULL) {
2433 MRANGE(*list);
2434
2435 m = (struct mbuf *)*list;
2436 sp = slab_get(m);
2437 cl = m->m_ext.ext_buf;
2438 clsp = slab_get(cl);
2439 VERIFY(m->m_flags == M_EXT && cl != NULL);
2440 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2441
2442 if (class == MC_MBUF_CL) {
2443 VERIFY(clsp->sl_refcnt >= 1 &&
2444 clsp->sl_refcnt <= NCLPG);
2445 } else {
2446 VERIFY(clsp->sl_refcnt >= 1 &&
2447 clsp->sl_refcnt <= NBCLPG);
2448 }
2449
2450 if (class == MC_MBUF_16KCL) {
2451 int k;
2452 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2453 nsp = nsp->sl_next;
2454 /* Next slab must already be present */
2455 VERIFY(nsp != NULL);
2456 VERIFY(nsp->sl_refcnt == 1);
2457 }
2458 }
2459
2460 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2461 !MBUF_IN_MAP(m_cobjlist(class))) {
2462 slab_nextptr_panic(sp, m_cobjlist(class));
2463 /* NOTREACHED */
2464 }
2465 (*list)->obj_next = NULL;
2466 list = *plist = &(*list)->obj_next;
2467
2468 if (--need == 0) {
2469 break;
2470 }
2471 }
2472 m_infree(class) -= (num - need);
2473
2474 return num - need;
2475 }
2476
/*
 * Place object(s) back into a composite class's freelist.
 *
 * Walks the chain of composite (mbuf + cluster) objects, sanity-checking
 * each mbuf and the slab of its attached cluster.  What happens next
 * depends on "purged":
 *
 *  - purged == 0: the whole chain is put at the head of the class's
 *    composite freelist, still in constructed form.
 *  - purged != 0: each object is taken apart; the mbuf and the cluster
 *    are returned to their rudimentary slabs via slab_free(), and the
 *    ext_ref structures are collected and freed to ref_cache.
 *
 * Parameters:
 *	class	MC_MBUF_CL, MC_MBUF_BIGCL or MC_MBUF_16KCL.
 *	list	chain of objects linked through obj_next.
 *	purged	non-zero to destroy the objects rather than cache them.
 *
 * Returns the number of objects processed.  The caller must hold
 * mbuf_mlock.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;
	mbuf_class_t cl_class;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Map the composite class to the class of its cluster */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}

	o = tail = list;

	/*
	 * "m" is the actual mbuf; "ms" is the mbuf whose fields are
	 * inspected, which is "m" itself unless auditing is enabled, in
	 * which case it is the saved copy obtained from the audit structure.
	 */
	while ((m = ms = (struct mbuf *)o) != NULL) {
		/* Remember the successor now; a purge clears o->obj_next */
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			if (mclverify) {
				mcache_audit_free_verify(mca, m, 0,
				    m_maxsize(MC_MBUF));
			}
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclverify) {
			size_t size = m_maxsize(cl_class);
			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (cl_class == MC_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}
		/* A 16KB cluster spans NSLABSP16KB slabs; check the rest */
		if (cl_class == MC_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL) {
				mcl_audit_restore_mbuf(m, mca, TRUE);
			}

			/* Reset the external-reference bookkeeping */
			MEXT_MINREF(m) = 0;
			MEXT_REF(m) = 0;
			MEXT_PREF(m) = 0;
			MEXT_FLAGS(m) = 0;
			MEXT_PRIV(m) = 0;
			MEXT_PMBUF(m) = NULL;
			MEXT_TOKEN(m) = 0;

			/*
			 * Detach the ext_ref from the mbuf and chain it onto
			 * ref_list; the list is freed in one go after the loop.
			 */
			rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
			m_set_ext(m, NULL, NULL, NULL);
			rfa->obj_next = ref_list;
			ref_list = rfa;

			/* Return the mbuf to a plain, free state */
			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL) {
				mcl_audit_mbuf(mca, o, FALSE, FALSE);
			}

			/* One fewer composite object exists in this class */
			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL) {
				slab_free(MC_CL, cl);
			} else if (class == MC_MBUF_BIGCL) {
				slab_free(MC_BIGCL, cl);
			} else {
				slab_free(MC_16KCL, cl);
			}
		}

		++num;
		tail = o;
		o = nexto;
	}

	if (!purged) {
		/* Splice the whole chain onto the head of the freelist */
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		/* Release the ext_ref structures collected above */
		mcache_free_ext(ref_cache, ref_list);
	}

	return num;
}
2615
2616 /*
2617 * Common allocator for composite objects called by the CPU cache layer
2618 * during an allocation request whenever there is no available element in
2619 * the bucket layer. It returns one or more composite elements from the
2620 * appropriate global freelist. If the freelist is empty, it will attempt
2621 * to obtain the rudimentary objects from their caches and construct them
2622 * into composite mbuf + cluster objects.
2623 */
2624 static unsigned int
mbuf_cslab_alloc(void * arg,mcache_obj_t *** plist,unsigned int needed,int wait)2625 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2626 int wait)
2627 {
2628 mbuf_class_t class = (mbuf_class_t)arg;
2629 mbuf_class_t cl_class = 0;
2630 unsigned int num = 0, cnum = 0, want = needed;
2631 mcache_obj_t *ref_list = NULL;
2632 mcache_obj_t *mp_list = NULL;
2633 mcache_obj_t *clp_list = NULL;
2634 mcache_obj_t **list;
2635 struct ext_ref *rfa;
2636 struct mbuf *m;
2637 void *cl;
2638
2639 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2640 ASSERT(needed > 0);
2641
2642 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2643
2644 /* There should not be any slab for this class */
2645 VERIFY(m_slab_cnt(class) == 0 &&
2646 m_slablist(class).tqh_first == NULL &&
2647 m_slablist(class).tqh_last == NULL);
2648
2649 lck_mtx_lock(mbuf_mlock);
2650
2651 /* Try using the freelist first */
2652 num = cslab_alloc(class, plist, needed);
2653 list = *plist;
2654 if (num == needed) {
2655 m_alloc_cnt(class) += num;
2656 lck_mtx_unlock(mbuf_mlock);
2657 return needed;
2658 }
2659
2660 lck_mtx_unlock(mbuf_mlock);
2661
2662 /*
2663 * We could not satisfy the request using the freelist alone;
2664 * allocate from the appropriate rudimentary caches and use
2665 * whatever we can get to construct the composite objects.
2666 */
2667 needed -= num;
2668
2669 /*
2670 * Mark these allocation requests as coming from a composite cache.
2671 * Also, if the caller is willing to be blocked, mark the request
2672 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2673 * slab layer waiting for the individual object when one or more
2674 * of the already-constructed composite objects are available.
2675 */
2676 wait |= MCR_COMP;
2677 if (!(wait & MCR_NOSLEEP)) {
2678 wait |= MCR_FAILOK;
2679 }
2680
2681 /* allocate mbufs */
2682 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2683 if (needed == 0) {
2684 ASSERT(mp_list == NULL);
2685 goto fail;
2686 }
2687
2688 /* allocate clusters */
2689 if (class == MC_MBUF_CL) {
2690 cl_class = MC_CL;
2691 } else if (class == MC_MBUF_BIGCL) {
2692 cl_class = MC_BIGCL;
2693 } else {
2694 VERIFY(class == MC_MBUF_16KCL);
2695 cl_class = MC_16KCL;
2696 }
2697 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2698 if (needed == 0) {
2699 ASSERT(clp_list == NULL);
2700 goto fail;
2701 }
2702
2703 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2704 if (needed == 0) {
2705 ASSERT(ref_list == NULL);
2706 goto fail;
2707 }
2708
2709 /*
2710 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
2711 * overs will get freed accordingly before we return to caller.
2712 */
2713 for (cnum = 0; cnum < needed; cnum++) {
2714 struct mbuf *ms;
2715
2716 m = ms = (struct mbuf *)mp_list;
2717 mp_list = mp_list->obj_next;
2718
2719 cl = clp_list;
2720 clp_list = clp_list->obj_next;
2721 ((mcache_obj_t *)cl)->obj_next = NULL;
2722
2723 rfa = (struct ext_ref *)ref_list;
2724 ref_list = ref_list->obj_next;
2725 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2726
2727 /*
2728 * If auditing is enabled, construct the shadow mbuf
2729 * in the audit structure instead of in the actual one.
2730 * mbuf_cslab_audit() will take care of restoring the
2731 * contents after the integrity check.
2732 */
2733 if (mclaudit != NULL) {
2734 mcache_audit_t *mca, *cl_mca;
2735
2736 lck_mtx_lock(mbuf_mlock);
2737 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2738 ms = MCA_SAVED_MBUF_PTR(mca);
2739 cl_mca = mcl_audit_buf2mca(cl_class,
2740 (mcache_obj_t *)cl);
2741
2742 /*
2743 * Pair them up. Note that this is done at the time
2744 * the mbuf+cluster objects are constructed. This
2745 * information should be treated as "best effort"
2746 * debugging hint since more than one mbufs can refer
2747 * to a cluster. In that case, the cluster might not
2748 * be freed along with the mbuf it was paired with.
2749 */
2750 mca->mca_uptr = cl_mca;
2751 cl_mca->mca_uptr = mca;
2752
2753 ASSERT(mca->mca_uflags & MB_SCVALID);
2754 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2755 lck_mtx_unlock(mbuf_mlock);
2756
2757 /* Technically, they are in the freelist */
2758 if (mclverify) {
2759 size_t size;
2760
2761 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2762 m_maxsize(MC_MBUF));
2763
2764 if (class == MC_MBUF_CL) {
2765 size = m_maxsize(MC_CL);
2766 } else if (class == MC_MBUF_BIGCL) {
2767 size = m_maxsize(MC_BIGCL);
2768 } else {
2769 size = m_maxsize(MC_16KCL);
2770 }
2771
2772 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2773 size);
2774 }
2775 }
2776
2777 MBUF_INIT(ms, 0, MT_FREE);
2778 if (class == MC_MBUF_16KCL) {
2779 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2780 } else if (class == MC_MBUF_BIGCL) {
2781 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2782 } else {
2783 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2784 }
2785 VERIFY(ms->m_flags == M_EXT);
2786 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2787
2788 *list = (mcache_obj_t *)m;
2789 (*list)->obj_next = NULL;
2790 list = *plist = &(*list)->obj_next;
2791 }
2792
2793 fail:
2794 /*
2795 * Free up what's left of the above.
2796 */
2797 if (mp_list != NULL) {
2798 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2799 }
2800 if (clp_list != NULL) {
2801 mcache_free_ext(m_cache(cl_class), clp_list);
2802 }
2803 if (ref_list != NULL) {
2804 mcache_free_ext(ref_cache, ref_list);
2805 }
2806
2807 lck_mtx_lock(mbuf_mlock);
2808 if (num > 0 || cnum > 0) {
2809 m_total(class) += cnum;
2810 VERIFY(m_total(class) <= m_maxlimit(class));
2811 m_alloc_cnt(class) += num + cnum;
2812 }
2813 if ((num + cnum) < want) {
2814 m_fail_cnt(class) += (want - (num + cnum));
2815 }
2816 lck_mtx_unlock(mbuf_mlock);
2817
2818 return num + cnum;
2819 }
2820
2821 /*
2822 * Common de-allocator for composite objects called by the CPU cache
2823 * layer when one or more elements need to be returned to the appropriate
2824 * global freelist.
2825 */
2826 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)2827 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2828 {
2829 mbuf_class_t class = (mbuf_class_t)arg;
2830 unsigned int num;
2831 int w;
2832
2833 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2834
2835 lck_mtx_lock(mbuf_mlock);
2836
2837 num = cslab_free(class, list, purged);
2838 m_free_cnt(class) += num;
2839
2840 if ((w = mb_waiters) > 0) {
2841 mb_waiters = 0;
2842 }
2843 if (w) {
2844 mbwdog_logger("waking up all threads");
2845 }
2846
2847 lck_mtx_unlock(mbuf_mlock);
2848
2849 if (w != 0) {
2850 wakeup(mb_waitchan);
2851 }
2852 }
2853
2854 /*
2855 * Common auditor for composite objects called by the CPU cache layer
2856 * during an allocation or free request. For the former, this is called
2857 * after the objects are obtained from either the bucket or slab layer
2858 * and before they are returned to the caller. For the latter, this is
2859 * called immediately during free and before placing the objects into
2860 * the bucket or slab layer.
2861 */
2862 static void
mbuf_cslab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)2863 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2864 {
2865 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2866 mcache_audit_t *mca;
2867 struct mbuf *m, *ms;
2868 mcl_slab_t *clsp, *nsp;
2869 size_t cl_size;
2870 void *cl;
2871
2872 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2873 if (class == MC_MBUF_CL) {
2874 cl_class = MC_CL;
2875 } else if (class == MC_MBUF_BIGCL) {
2876 cl_class = MC_BIGCL;
2877 } else {
2878 cl_class = MC_16KCL;
2879 }
2880 cl_size = m_maxsize(cl_class);
2881
2882 while ((m = ms = (struct mbuf *)list) != NULL) {
2883 lck_mtx_lock(mbuf_mlock);
2884 /* Do the mbuf sanity checks and record its transaction */
2885 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2886 mcl_audit_mbuf(mca, m, TRUE, alloc);
2887 if (mcltrace) {
2888 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2889 }
2890
2891 if (alloc) {
2892 mca->mca_uflags |= MB_COMP_INUSE;
2893 } else {
2894 mca->mca_uflags &= ~MB_COMP_INUSE;
2895 }
2896
2897 /*
2898 * Use the shadow mbuf in the audit structure if we are
2899 * freeing, since the contents of the actual mbuf has been
2900 * pattern-filled by the above call to mcl_audit_mbuf().
2901 */
2902 if (!alloc && mclverify) {
2903 ms = MCA_SAVED_MBUF_PTR(mca);
2904 }
2905
2906 /* Do the cluster sanity checks and record its transaction */
2907 cl = ms->m_ext.ext_buf;
2908 clsp = slab_get(cl);
2909 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2910 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2911 if (class == MC_MBUF_CL) {
2912 VERIFY(clsp->sl_refcnt >= 1 &&
2913 clsp->sl_refcnt <= NCLPG);
2914 } else {
2915 VERIFY(clsp->sl_refcnt >= 1 &&
2916 clsp->sl_refcnt <= NBCLPG);
2917 }
2918
2919 if (class == MC_MBUF_16KCL) {
2920 int k;
2921 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2922 nsp = nsp->sl_next;
2923 /* Next slab must already be present */
2924 VERIFY(nsp != NULL);
2925 VERIFY(nsp->sl_refcnt == 1);
2926 }
2927 }
2928
2929
2930 mca = mcl_audit_buf2mca(cl_class, cl);
2931 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2932 if (mcltrace) {
2933 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2934 }
2935
2936 if (alloc) {
2937 mca->mca_uflags |= MB_COMP_INUSE;
2938 } else {
2939 mca->mca_uflags &= ~MB_COMP_INUSE;
2940 }
2941 lck_mtx_unlock(mbuf_mlock);
2942
2943 list = list->obj_next;
2944 }
2945 }
2946
2947 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)2948 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2949 uint64_t alloc_size, kern_return_t error)
2950 {
2951 *cnt = *cnt + 1;
2952 *ts = net_uptime();
2953 if (size) {
2954 *size = alloc_size;
2955 }
2956 switch (error) {
2957 case KERN_SUCCESS:
2958 break;
2959 case KERN_INVALID_ARGUMENT:
2960 mb_kmem_stats[0]++;
2961 break;
2962 case KERN_INVALID_ADDRESS:
2963 mb_kmem_stats[1]++;
2964 break;
2965 case KERN_RESOURCE_SHORTAGE:
2966 mb_kmem_stats[2]++;
2967 break;
2968 case KERN_NO_SPACE:
2969 mb_kmem_stats[3]++;
2970 break;
2971 case KERN_FAILURE:
2972 mb_kmem_stats[4]++;
2973 break;
2974 default:
2975 mb_kmem_stats[5]++;
2976 break;
2977 }
2978 }
2979
2980 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)2981 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
2982 {
2983 vm_offset_t addr = 0;
2984 kern_return_t kr = KERN_SUCCESS;
2985
2986 if (!physContig) {
2987 kr = kmem_alloc(mbmap, &addr, size,
2988 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2989 } else {
2990 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
2991 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2992 }
2993
2994 if (kr != KERN_SUCCESS) {
2995 addr = 0;
2996 }
2997 if (err) {
2998 *err = kr;
2999 }
3000
3001 return addr;
3002 }
3003
3004 /*
3005 * Allocate some number of mbuf clusters and place on cluster freelist.
3006 */
3007 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)3008 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3009 {
3010 int i, count = 0;
3011 vm_size_t size = 0;
3012 int numpages = 0, large_buffer;
3013 vm_offset_t page = 0;
3014 mcache_audit_t *mca_list = NULL;
3015 mcache_obj_t *con_list = NULL;
3016 mcl_slab_t *sp;
3017 mbuf_class_t class;
3018 kern_return_t error;
3019
3020 /* Set if a buffer allocation needs allocation of multiple pages */
3021 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3022 PAGE_SIZE < M16KCLBYTES);
3023 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3024 bufsize == m_maxsize(MC_16KCL));
3025
3026 VERIFY((bufsize == PAGE_SIZE) ||
3027 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3028
3029 if (bufsize == m_size(MC_BIGCL)) {
3030 class = MC_BIGCL;
3031 } else {
3032 class = MC_16KCL;
3033 }
3034
3035 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3036
3037 /*
3038 * Multiple threads may attempt to populate the cluster map one
3039 * after another. Since we drop the lock below prior to acquiring
3040 * the physical page(s), our view of the cluster map may no longer
3041 * be accurate, and we could end up over-committing the pages beyond
3042 * the maximum allowed for each class. To prevent it, this entire
3043 * operation (including the page mapping) is serialized.
3044 */
3045 while (mb_clalloc_busy) {
3046 mb_clalloc_waiters++;
3047 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3048 (PZERO - 1), "m_clalloc", NULL);
3049 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3050 }
3051
3052 /* We are busy now; tell everyone else to go away */
3053 mb_clalloc_busy = TRUE;
3054
3055 /*
3056 * Honor the caller's wish to block or not block. We have a way
3057 * to grow the pool asynchronously using the mbuf worker thread.
3058 */
3059 i = m_howmany(num, bufsize);
3060 if (i <= 0 || (wait & M_DONTWAIT)) {
3061 goto out;
3062 }
3063
3064 lck_mtx_unlock(mbuf_mlock);
3065
3066 size = round_page(i * bufsize);
3067 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3068
3069 /*
3070 * If we did ask for "n" 16KB physically contiguous chunks
3071 * and didn't get them, then please try again without this
3072 * restriction.
3073 */
3074 net_update_uptime();
3075 if (large_buffer && page == 0) {
3076 m_vm_error_stats(&mb_kmem_contig_failed,
3077 &mb_kmem_contig_failed_ts,
3078 &mb_kmem_contig_failed_size,
3079 size, error);
3080 page = kmem_mb_alloc(mb_map, size, 0, &error);
3081 }
3082
3083 if (page == 0) {
3084 m_vm_error_stats(&mb_kmem_failed,
3085 &mb_kmem_failed_ts,
3086 &mb_kmem_failed_size,
3087 size, error);
3088 #if PAGE_SIZE == 4096
3089 if (bufsize == m_maxsize(MC_BIGCL)) {
3090 #else
3091 if (bufsize >= m_maxsize(MC_BIGCL)) {
3092 #endif
3093 /* Try for 1 page if failed */
3094 size = PAGE_SIZE;
3095 page = kmem_mb_alloc(mb_map, size, 0, &error);
3096 if (page == 0) {
3097 m_vm_error_stats(&mb_kmem_one_failed,
3098 &mb_kmem_one_failed_ts,
3099 NULL, size, error);
3100 }
3101 }
3102
3103 if (page == 0) {
3104 lck_mtx_lock(mbuf_mlock);
3105 goto out;
3106 }
3107 }
3108
3109 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3110 numpages = size / PAGE_SIZE;
3111
3112 /* If auditing is enabled, allocate the audit structures now */
3113 if (mclaudit != NULL) {
3114 int needed;
3115
3116 /*
3117 * Yes, I realize this is a waste of memory for clusters
3118 * that never get transformed into mbufs, as we may end
3119 * up with NMBPG-1 unused audit structures per cluster.
3120 * But doing so tremendously simplifies the allocation
3121 * strategy, since at this point we are not holding the
3122 * mbuf lock and the caller is okay to be blocked.
3123 */
3124 if (bufsize == PAGE_SIZE) {
3125 needed = numpages * NMBPG;
3126
3127 i = mcache_alloc_ext(mcl_audit_con_cache,
3128 &con_list, needed, MCR_SLEEP);
3129
3130 VERIFY(con_list != NULL && i == needed);
3131 } else {
3132 /*
3133 * if multiple 4K pages are being used for a
3134 * 16K cluster
3135 */
3136 needed = numpages / NSLABSP16KB;
3137 }
3138
3139 i = mcache_alloc_ext(mcache_audit_cache,
3140 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3141
3142 VERIFY(mca_list != NULL && i == needed);
3143 }
3144
3145 lck_mtx_lock(mbuf_mlock);
3146
3147 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3148 ppnum_t offset =
3149 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3150 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3151
3152 /*
3153 * If there is a mapper the appropriate I/O page is
3154 * returned; zero out the page to discard its past
3155 * contents to prevent exposing leftover kernel memory.
3156 */
3157 VERIFY(offset < mcl_pages);
3158 if (mcl_paddr_base != 0) {
3159 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3160 new_page = IOMapperInsertPage(mcl_paddr_base,
3161 offset, new_page);
3162 }
3163 mcl_paddr[offset] = new_page;
3164
3165 /* Pattern-fill this fresh page */
3166 if (mclverify) {
3167 mcache_set_pattern(MCACHE_FREE_PATTERN,
3168 (caddr_t)page, PAGE_SIZE);
3169 }
3170 if (bufsize == PAGE_SIZE) {
3171 mcache_obj_t *buf;
3172 /* One for the entire page */
3173 sp = slab_get((void *)page);
3174 if (mclaudit != NULL) {
3175 mcl_audit_init((void *)page,
3176 &mca_list, &con_list,
3177 AUDIT_CONTENTS_SIZE, NMBPG);
3178 }
3179 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3180 slab_init(sp, class, SLF_MAPPED, (void *)page,
3181 (void *)page, PAGE_SIZE, 0, 1);
3182 buf = (mcache_obj_t *)page;
3183 buf->obj_next = NULL;
3184
3185 /* Insert this slab */
3186 slab_insert(sp, class);
3187
3188 /* Update stats now since slab_get drops the lock */
3189 ++m_infree(class);
3190 ++m_total(class);
3191 VERIFY(m_total(class) <= m_maxlimit(class));
3192 if (class == MC_BIGCL) {
3193 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3194 m_infree(MC_MBUF_BIGCL);
3195 mbstat.m_bigclusters = m_total(MC_BIGCL);
3196 }
3197 ++count;
3198 } else if ((bufsize > PAGE_SIZE) &&
3199 (i % NSLABSP16KB) == 0) {
3200 union m16kcluster *m16kcl = (union m16kcluster *)page;
3201 mcl_slab_t *nsp;
3202 int k;
3203
3204 /* One for the entire 16KB */
3205 sp = slab_get(m16kcl);
3206 if (mclaudit != NULL) {
3207 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3208 }
3209
3210 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3211 slab_init(sp, MC_16KCL, SLF_MAPPED,
3212 m16kcl, m16kcl, bufsize, 0, 1);
3213 m16kcl->m16kcl_next = NULL;
3214
3215 /*
3216 * 2nd-Nth page's slab is part of the first one,
3217 * where N is NSLABSP16KB.
3218 */
3219 for (k = 1; k < NSLABSP16KB; k++) {
3220 nsp = slab_get(((union mbigcluster *)page) + k);
3221 VERIFY(nsp->sl_refcnt == 0 &&
3222 nsp->sl_flags == 0);
3223 slab_init(nsp, MC_16KCL,
3224 SLF_MAPPED | SLF_PARTIAL,
3225 m16kcl, NULL, 0, 0, 0);
3226 }
3227 /* Insert this slab */
3228 slab_insert(sp, MC_16KCL);
3229
3230 /* Update stats now since slab_get drops the lock */
3231 ++m_infree(MC_16KCL);
3232 ++m_total(MC_16KCL);
3233 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3234 ++count;
3235 }
3236 }
3237 VERIFY(mca_list == NULL && con_list == NULL);
3238
3239 if (!mb_peak_newreport && mbuf_report_usage(class)) {
3240 mb_peak_newreport = TRUE;
3241 }
3242
3243 /* We're done; let others enter */
3244 mb_clalloc_busy = FALSE;
3245 if (mb_clalloc_waiters > 0) {
3246 mb_clalloc_waiters = 0;
3247 wakeup(mb_clalloc_waitchan);
3248 }
3249
3250 return count;
3251 out:
3252 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3253
3254 mtracelarge_register(size);
3255
3256 /* We're done; let others enter */
3257 mb_clalloc_busy = FALSE;
3258 if (mb_clalloc_waiters > 0) {
3259 mb_clalloc_waiters = 0;
3260 wakeup(mb_clalloc_waitchan);
3261 }
3262
3263 /*
3264 * When non-blocking we kick a thread if we have to grow the
3265 * pool or if the number of free clusters is less than requested.
3266 */
3267 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3268 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3269 m_cname(class), i);
3270 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3271 mbuf_worker_needs_wakeup = FALSE;
3272 }
3273 if (class == MC_BIGCL) {
3274 if (i > 0) {
3275 /*
3276 * Remember total number of 4KB clusters needed
3277 * at this time.
3278 */
3279 i += m_total(MC_BIGCL);
3280 if (i > m_region_expand(MC_BIGCL)) {
3281 m_region_expand(MC_BIGCL) = i;
3282 }
3283 }
3284 if (m_infree(MC_BIGCL) >= num) {
3285 return 1;
3286 }
3287 } else {
3288 if (i > 0) {
3289 /*
3290 * Remember total number of 16KB clusters needed
3291 * at this time.
3292 */
3293 i += m_total(MC_16KCL);
3294 if (i > m_region_expand(MC_16KCL)) {
3295 m_region_expand(MC_16KCL) = i;
3296 }
3297 }
3298 if (m_infree(MC_16KCL) >= num) {
3299 return 1;
3300 }
3301 }
3302 return 0;
3303 }
3304
/*
 * Populate the global freelist of the corresponding buffer class.
 *
 *   class  MC_MBUF, MC_CL, MC_BIGCL or MC_16KCL
 *   num    number of objects of `class' that are wanted
 *   wait   passed through to m_clalloc() and slab_alloc()
 *
 * Must be called with mbuf_mlock held.
 *
 * Classes whose objects are at least one page large are handed straight
 * to m_clalloc().  For smaller classes, whole pages are taken from the
 * page-sized "super class" and each page is re-initialized as a slab of
 * `class' objects.
 *
 * Returns non-zero if m_clalloc() returned non-zero (page-sized classes)
 * or if the conversion loop ended with a non-zero page count (sub-page
 * classes); zero otherwise.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
	mcache_obj_t *o = NULL;
	int i, numpages = 0, count;
	mbuf_class_t super_class;

	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
	    class == MC_16KCL);

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
	    PAGE_SIZE == m_maxsize(MC_16KCL));

	if (m_maxsize(class) >= PAGE_SIZE) {
		return m_clalloc(num, wait, m_maxsize(class)) != 0;
	}

	/*
	 * The rest of the function will allocate pages and will slice
	 * them up into the right size
	 */

	/* Round up to the number of whole pages needed for `num' objects */
	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;

	/* Currently assume that pages are 4K or 16K */
	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
		super_class = MC_BIGCL;
	} else {
		super_class = MC_16KCL;
	}

	/*
	 * Grow the super class first.  The result stored in `i' is not
	 * examined; `i' is reassigned before its next use below.
	 */
	i = m_clalloc(numpages, wait, m_maxsize(super_class));

	/* how many objects will we cut the page into? */
	int numobj = PAGE_SIZE / m_maxsize(class);

	for (count = 0; count < numpages; count++) {
		/* respect totals, minlimit, maxlimit */
		if (m_total(super_class) <= m_minlimit(super_class) ||
		    m_total(class) >= m_maxlimit(class)) {
			break;
		}

		if ((o = slab_alloc(super_class, wait)) == NULL) {
			break;
		}

		/* Three typed views of the same page, one per target class */
		struct mbuf *m = (struct mbuf *)o;
		union mcluster *c = (union mcluster *)o;
		union mbigcluster *mbc = (union mbigcluster *)o;
		mcl_slab_t *sp = slab_get(o);
		mcache_audit_t *mca = NULL;

		/*
		 * since one full page will be converted to MC_MBUF or
		 * MC_CL, verify that the reference count will match that
		 * assumption
		 */
		VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
		VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
		/*
		 * Make sure that the cluster is unmolested
		 * while in freelist
		 */
		if (mclverify) {
			mca = mcl_audit_buf2mca(super_class,
			    (mcache_obj_t *)o);
			mcache_audit_free_verify(mca,
			    (mcache_obj_t *)o, 0, m_maxsize(super_class));
		}

		/* Reinitialize it as an mbuf or 2K or 4K slab */
		slab_init(sp, class, sp->sl_flags,
		    sp->sl_base, NULL, PAGE_SIZE, 0, numobj);

		VERIFY(sp->sl_head == NULL);

		/* The page leaves the super class ... */
		VERIFY(m_total(super_class) >= 1);
		m_total(super_class)--;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
		}

		/* ... and shows up as `numobj' free objects of `class' */
		m_total(class) += numobj;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_infree(class) += numobj;

		if (!mb_peak_newreport && mbuf_report_usage(class)) {
			mb_peak_newreport = TRUE;
		}

		/* Thread every object of the page onto the slab's free list */
		i = numobj;
		if (class == MC_MBUF) {
			mbstat.m_mbufs = m_total(MC_MBUF);
			mtype_stat_add(MT_FREE, NMBPG);
			while (i--) {
				/*
				 * If auditing is enabled, construct the
				 * shadow mbuf in the audit structure
				 * instead of the actual one.
				 * mbuf_slab_audit() will take care of
				 * restoring the contents after the
				 * integrity check.
				 */
				if (mclaudit != NULL) {
					struct mbuf *ms;
					mca = mcl_audit_buf2mca(MC_MBUF,
					    (mcache_obj_t *)m);
					ms = MCA_SAVED_MBUF_PTR(mca);
					ms->m_type = MT_FREE;
				} else {
					m->m_type = MT_FREE;
				}
				m->m_next = sp->sl_head;
				sp->sl_head = (void *)m++;
			}
		} else if (class == MC_CL) { /* MC_CL */
			mbstat.m_clfree =
			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
			mbstat.m_clusters = m_total(MC_CL);
			while (i--) {
				c->mcl_next = sp->sl_head;
				sp->sl_head = (void *)c++;
			}
		} else {
			VERIFY(class == MC_BIGCL);
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
			while (i--) {
				mbc->mbc_next = sp->sl_head;
				sp->sl_head = (void *)mbc++;
			}
		}

		/* Insert into the mbuf or 2k or 4k slab list */
		slab_insert(sp, class);

		/* New objects are available; wake anybody waiting for them */
		if ((i = mb_waiters) > 0) {
			mb_waiters = 0;
		}
		if (i != 0) {
			mbwdog_logger("waking up all threads");
			wakeup(mb_waitchan);
		}
	}
	return count != 0;
}
3459
3460 /*
3461 * For each class, initialize the freelist to hold m_minlimit() objects.
3462 */
3463 static void
3464 freelist_init(mbuf_class_t class)
3465 {
3466 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3467
3468 VERIFY(class == MC_CL || class == MC_BIGCL);
3469 VERIFY(m_total(class) == 0);
3470 VERIFY(m_minlimit(class) > 0);
3471
3472 while (m_total(class) < m_minlimit(class)) {
3473 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3474 }
3475
3476 VERIFY(m_total(class) >= m_minlimit(class));
3477 }
3478
3479 /*
3480 * (Inaccurately) check if it might be worth a trip back to the
3481 * mcache layer due the availability of objects there. We'll
3482 * end up back here if there's nothing up there.
3483 */
3484 static boolean_t
3485 mbuf_cached_above(mbuf_class_t class, int wait)
3486 {
3487 switch (class) {
3488 case MC_MBUF:
3489 if (wait & MCR_COMP) {
3490 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3491 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3492 }
3493 break;
3494
3495 case MC_CL:
3496 if (wait & MCR_COMP) {
3497 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
3498 }
3499 break;
3500
3501 case MC_BIGCL:
3502 if (wait & MCR_COMP) {
3503 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3504 }
3505 break;
3506
3507 case MC_16KCL:
3508 if (wait & MCR_COMP) {
3509 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
3510 }
3511 break;
3512
3513 case MC_MBUF_CL:
3514 case MC_MBUF_BIGCL:
3515 case MC_MBUF_16KCL:
3516 break;
3517
3518 default:
3519 VERIFY(0);
3520 /* NOTREACHED */
3521 }
3522
3523 return !mcache_bkt_isempty(m_cache(class));
3524 }
3525
3526 /*
3527 * If possible, convert constructed objects to raw ones.
3528 */
3529 static boolean_t
3530 mbuf_steal(mbuf_class_t class, unsigned int num)
3531 {
3532 mcache_obj_t *top = NULL;
3533 mcache_obj_t **list = ⊤
3534 unsigned int tot = 0;
3535
3536 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3537
3538 switch (class) {
3539 case MC_MBUF:
3540 case MC_CL:
3541 case MC_BIGCL:
3542 case MC_16KCL:
3543 return FALSE;
3544
3545 case MC_MBUF_CL:
3546 case MC_MBUF_BIGCL:
3547 case MC_MBUF_16KCL:
3548 /* Get the required number of constructed objects if possible */
3549 if (m_infree(class) > m_minlimit(class)) {
3550 tot = cslab_alloc(class, &list,
3551 MIN(num, m_infree(class)));
3552 }
3553
3554 /* And destroy them to get back the raw objects */
3555 if (top != NULL) {
3556 (void) cslab_free(class, top, 1);
3557 }
3558 break;
3559
3560 default:
3561 VERIFY(0);
3562 /* NOTREACHED */
3563 }
3564
3565 return tot == num;
3566 }
3567
/*
 * Try to make objects available to `class' by reclaiming them from
 * related classes.
 *
 *   class  raw class that is short of objects (MC_MBUF, MC_CL, MC_BIGCL
 *          or MC_16KCL; anything else trips VERIFY)
 *   num    number of objects passed to mbuf_steal() for each victim
 *   comp   when TRUE, the composite class built on top of `class' is
 *          not marked as a victim (does not apply to MC_MBUF, whose
 *          victims are marked unconditionally)
 *
 * Must be entered with mbuf_mlock held.  The lock is dropped while
 * domains are drained and caches are purged or reaped, and is held
 * again on return.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				/* One bit per class index that must be purged */
				bmap |= (1 << m);
			}
		}
	}

	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < NELEM(mbuf_table); m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				/* Statistics are updated under the lock */
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	/* Return to the caller with the lock held, as on entry */
	lck_mtx_lock(mbuf_mlock);
}
3667
3668 static inline struct mbuf *
3669 m_get_common(int wait, short type, int hdr)
3670 {
3671 struct mbuf *m;
3672 int mcflags = MSLEEPF(wait);
3673
3674 /* Is this due to a non-blocking retry? If so, then try harder */
3675 if (mcflags & MCR_NOSLEEP) {
3676 mcflags |= MCR_TRYHARD;
3677 }
3678
3679 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3680 if (m != NULL) {
3681 MBUF_INIT(m, hdr, type);
3682 mtype_stat_inc(type);
3683 mtype_stat_dec(MT_FREE);
3684 }
3685 return m;
3686 }
3687
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 *
 * Every macro below expands to a call to m_get_common(); the variants
 * differ only in the value passed as its `hdr' argument (0 or 1).
 */
/* m_get_common() with hdr == 0 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
/* m_get_common() with hdr == 1 */
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
/* The retry variants are plain aliases of the two macros above */
#define _M_RETRY(wait, type)    _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
/* Statement-style forms that assign the result to `m' */
#define _MGET(m, how, type)     ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
3698
/*
 * Allocate one mbuf of the given type; function form of _M_GET(), i.e.
 * m_get_common() with a `hdr' argument of 0.  Returns whatever
 * m_get_common() returns (NULL when no mbuf could be obtained).
 */
struct mbuf *
m_get(int wait, int type)
{
	return _M_GET(wait, type);
}
3704
/*
 * Allocate one mbuf of the given type; function form of _M_GETHDR(),
 * i.e. m_get_common() with a `hdr' argument of 1.  Returns whatever
 * m_get_common() returns (NULL when no mbuf could be obtained).
 */
struct mbuf *
m_gethdr(int wait, int type)
{
	return _M_GETHDR(wait, type);
}
3710
/*
 * Function form of _M_RETRY(), which is defined as an alias of _M_GET();
 * this therefore performs the same allocation as m_get().
 */
struct mbuf *
m_retry(int wait, int type)
{
	return _M_RETRY(wait, type);
}
3716
/*
 * Function form of _M_RETRYHDR(), which is defined as an alias of
 * _M_GETHDR(); this therefore performs the same allocation as m_gethdr().
 */
struct mbuf *
m_retryhdr(int wait, int type)
{
	return _M_RETRYHDR(wait, type);
}
3722
3723 struct mbuf *
3724 m_getclr(int wait, int type)
3725 {
3726 struct mbuf *m;
3727
3728 _MGET(m, wait, type);
3729 if (m != NULL) {
3730 bzero(MTOD(m, caddr_t), MLEN);
3731 }
3732 return m;
3733 }
3734
/*
 * Drop one paired reference on an mbuf whose external buffer carries
 * EXTF_PAIRED.
 *
 * If `m' is still the paired mbuf (MEXT_PMBUF(m) == m), the paired
 * reference count is decremented atomically and the result decides:
 *
 *   > 1   nothing else to do; returns 1
 *   == 1  the external free function is invoked on the buffer; returns 1
 *   == 0  the pairing is dissolved: minref is reset to 0, MEXT_PMBUF is
 *         cleared and the free function is replaced according to the
 *         buffer size; returns 0
 *   < 0   ignored (see the comment in the body); returns 0
 *
 * A return value of 1 makes m_free() return without releasing the mbuf;
 * 0 lets it proceed with the regular free path.
 */
static int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	membar_sync();
	if (MEXT_PMBUF(m) == m) {
		volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
		int16_t oprefcnt, prefcnt;

		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		/* Compare-and-swap loop: retry until the decrement sticks */
		do {
			oprefcnt = *addr;
			prefcnt = oprefcnt - 1;
		} while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));

		if (prefcnt > 1) {
			return 1;
		} else if (prefcnt == 1) {
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable.  atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			atomic_set_ptr(&MEXT_PMBUF(m), NULL);

			/*
			 * Install the free function matching the buffer
			 * size; a NULL free function is what m_free()
			 * treats as an MC_CL cluster.
			 */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
3808
/*
 * Free a single mbuf and return the mbuf that followed it in the chain
 * (the value of m->m_next on entry).
 *
 * Panics if `m' is already marked MT_FREE.  For a packet-header mbuf the
 * red zone is verified, tags are deleted and the transmit completion
 * callback is run first.  If an external buffer is attached, one
 * reference on it is dropped and, when that reaches the minimum
 * reference count:
 *   - a non-composite buffer is returned to the cache selected by its
 *     free function (or handed to a custom free function), its reference
 *     structure is freed, and the mbuf itself then goes back to the
 *     MC_MBUF cache;
 *   - a composite mbuf+cluster is reset and returned as one unit to the
 *     matching composite cache.
 * A paired mbuf for which m_free_paired() returns non-zero is not
 * released at all.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	/* Captured up front; m->m_next is cleared before `m' is released */
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m, NULL);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		uint16_t refcnt;
		uint32_t composite;
		m_ext_free_func_t m_free_func;

		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}

		refcnt = m_decref(m);
		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
		m_free_func = m_get_ext_free(m);

		if (refcnt == MEXT_MINREF(m) && !composite) {
			/* The free function identifies the owning cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == MEXT_MINREF(m) && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Keep M_EXT set: the cluster stays attached */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			return n;
		}
	}

	/* Plain mbuf, or one whose external buffer has been dealt with */
	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return n;
}
3895
/*
 * Attach the external buffer `extbuf' to an mbuf.
 *
 *   m        mbuf to attach to, or NULL to have one allocated through
 *            _M_GETHDR(wait, type)
 *   type     mbuf type used when an mbuf has to be allocated
 *   extbuf   external buffer
 *   extfree  free routine stored for the external buffer
 *   extsize  size of the external buffer
 *   extarg   argument stored for extfree (not used when `pair' is set;
 *            the mbuf itself is stored instead)
 *   wait     allocation flags for the mbuf and the reference structure
 *   pair     non-zero to initialize the buffer with EXTF_PAIRED and `m'
 *            as the paired mbuf
 *
 * If `m' already has an external buffer, one reference on it is dropped
 * and the buffer is released the same way m_free() would release it.
 * In the composite case the whole mbuf goes back to its cache and a new
 * mbuf is allocated in its place.
 *
 * Returns the mbuf with the buffer attached, or NULL if `m' is already
 * paired or an allocation failed.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster.  Otherwise,
	 * allocate a new one or free any existing below.
	 *
	 * NOTE(review): the check below does not consult `pair'; an already
	 * paired mbuf is rejected either way -- confirm this is intended.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		u_int16_t refcnt;
		u_int32_t composite;
		m_ext_free_func_t m_free_func;

		refcnt = m_decref(m);
		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		m_free_func = m_get_ext_free(m);
		if (refcnt == MEXT_MINREF(m) && !composite) {
			/* The free function identifies the owning cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == MEXT_MINREF(m) && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/*
	 * A fresh reference structure is needed unless one was recycled
	 * above.
	 *
	 * NOTE(review): if `m' arrived with M_EXT set and its buffer is
	 * still referenced elsewhere (neither branch above ran), the
	 * reference was already dropped by m_decref() and m_free() on this
	 * failure path appears to drop it a second time -- confirm.
	 */
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
3985
3986 /*
3987 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3988 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3989 */
3990 struct mbuf *
3991 m_getcl(int wait, int type, int flags)
3992 {
3993 struct mbuf *m;
3994 int mcflags = MSLEEPF(wait);
3995 int hdr = (flags & M_PKTHDR);
3996
3997 /* Is this due to a non-blocking retry? If so, then try harder */
3998 if (mcflags & MCR_NOSLEEP) {
3999 mcflags |= MCR_TRYHARD;
4000 }
4001
4002 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4003 if (m != NULL) {
4004 u_int16_t flag;
4005 struct ext_ref *rfa;
4006 void *cl;
4007
4008 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4009 cl = m->m_ext.ext_buf;
4010 rfa = m_get_rfa(m);
4011
4012 ASSERT(cl != NULL && rfa != NULL);
4013 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4014
4015 flag = MEXT_FLAGS(m);
4016
4017 MBUF_INIT(m, hdr, type);
4018 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4019
4020 mtype_stat_inc(type);
4021 mtype_stat_dec(MT_FREE);
4022 }
4023 return m;
4024 }
4025
4026 /* m_mclget() add an mbuf cluster to a normal mbuf */
4027 struct mbuf *
4028 m_mclget(struct mbuf *m, int wait)
4029 {
4030 struct ext_ref *rfa;
4031
4032 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4033 return m;
4034 }
4035
4036 m->m_ext.ext_buf = m_mclalloc(wait);
4037 if (m->m_ext.ext_buf != NULL) {
4038 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4039 } else {
4040 mcache_free(ref_cache, rfa);
4041 }
4042 return m;
4043 }
4044
4045 /* Allocate an mbuf cluster */
4046 caddr_t
4047 m_mclalloc(int wait)
4048 {
4049 int mcflags = MSLEEPF(wait);
4050
4051 /* Is this due to a non-blocking retry? If so, then try harder */
4052 if (mcflags & MCR_NOSLEEP) {
4053 mcflags |= MCR_TRYHARD;
4054 }
4055
4056 return mcache_alloc(m_cache(MC_CL), mcflags);
4057 }
4058
4059 /* Free an mbuf cluster */
4060 void
4061 m_mclfree(caddr_t p)
4062 {
4063 mcache_free(m_cache(MC_CL), p);
4064 }
4065
4066 /*
4067 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
4068 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4069 */
4070 int
4071 m_mclhasreference(struct mbuf *m)
4072 {
4073 if (!(m->m_flags & M_EXT)) {
4074 return 0;
4075 }
4076
4077 ASSERT(m_get_rfa(m) != NULL);
4078
4079 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4080 }
4081
4082 __private_extern__ caddr_t
4083 m_bigalloc(int wait)
4084 {
4085 int mcflags = MSLEEPF(wait);
4086
4087 /* Is this due to a non-blocking retry? If so, then try harder */
4088 if (mcflags & MCR_NOSLEEP) {
4089 mcflags |= MCR_TRYHARD;
4090 }
4091
4092 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4093 }
4094
4095 __private_extern__ void
4096 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4097 {
4098 mcache_free(m_cache(MC_BIGCL), p);
4099 }
4100
4101 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
4102 __private_extern__ struct mbuf *
4103 m_mbigget(struct mbuf *m, int wait)
4104 {
4105 struct ext_ref *rfa;
4106
4107 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4108 return m;
4109 }
4110
4111 m->m_ext.ext_buf = m_bigalloc(wait);
4112 if (m->m_ext.ext_buf != NULL) {
4113 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4114 } else {
4115 mcache_free(ref_cache, rfa);
4116 }
4117 return m;
4118 }
4119
4120 __private_extern__ caddr_t
4121 m_16kalloc(int wait)
4122 {
4123 int mcflags = MSLEEPF(wait);
4124
4125 /* Is this due to a non-blocking retry? If so, then try harder */
4126 if (mcflags & MCR_NOSLEEP) {
4127 mcflags |= MCR_TRYHARD;
4128 }
4129
4130 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4131 }
4132
4133 __private_extern__ void
4134 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4135 {
4136 mcache_free(m_cache(MC_16KCL), p);
4137 }
4138
4139 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
4140 __private_extern__ struct mbuf *
4141 m_m16kget(struct mbuf *m, int wait)
4142 {
4143 struct ext_ref *rfa;
4144
4145 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4146 return m;
4147 }
4148
4149 m->m_ext.ext_buf = m_16kalloc(wait);
4150 if (m->m_ext.ext_buf != NULL) {
4151 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4152 } else {
4153 mcache_free(ref_cache, rfa);
4154 }
4155 return m;
4156 }
4157
4158 /*
4159 * "Move" mbuf pkthdr from "from" to "to".
4160 * "from" must have M_PKTHDR set, and "to" must be empty.
4161 */
4162 void
4163 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4164 {
4165 VERIFY(from->m_flags & M_PKTHDR);
4166
4167 /* Check for scratch area overflow */
4168 m_redzone_verify(from);
4169
4170 if (to->m_flags & M_PKTHDR) {
4171 /* Check for scratch area overflow */
4172 m_redzone_verify(to);
4173 /* We will be taking over the tags of 'to' */
4174 m_tag_delete_chain(to, NULL);
4175 }
4176 to->m_pkthdr = from->m_pkthdr; /* especially tags */
4177 m_classifier_init(from, 0); /* purge classifier info */
4178 m_tag_init(from, 1); /* purge all tags from src */
4179 m_scratch_init(from); /* clear src scratch area */
4180 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4181 if ((to->m_flags & M_EXT) == 0) {
4182 to->m_data = to->m_pktdat;
4183 }
4184 m_redzone_init(to); /* setup red zone on dst */
4185 }
4186
4187 /*
4188 * Duplicate "from"'s mbuf pkthdr in "to".
4189 * "from" must have M_PKTHDR set, and "to" must be empty.
4190 * In particular, this does a deep copy of the packet tags.
4191 */
4192 int
4193 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4194 {
4195 VERIFY(from->m_flags & M_PKTHDR);
4196
4197 /* Check for scratch area overflow */
4198 m_redzone_verify(from);
4199
4200 if (to->m_flags & M_PKTHDR) {
4201 /* Check for scratch area overflow */
4202 m_redzone_verify(to);
4203 /* We will be taking over the tags of 'to' */
4204 m_tag_delete_chain(to, NULL);
4205 }
4206 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4207 if ((to->m_flags & M_EXT) == 0) {
4208 to->m_data = to->m_pktdat;
4209 }
4210 to->m_pkthdr = from->m_pkthdr;
4211 /* clear TX completion flag so the callback is not called in the copy */
4212 to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
4213 m_redzone_init(to); /* setup red zone on dst */
4214 m_tag_init(to, 0); /* preserve dst static tags */
4215 return m_tag_copy_chain(to, from, how);
4216 }
4217
4218 void
4219 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4220 {
4221 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4222 #if PF_ECN
4223 m_pftag(to)->pftag_hdr = NULL;
4224 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
4225 #endif /* PF_ECN */
4226 }
4227
4228 void
4229 m_copy_necptag(struct mbuf *to, struct mbuf *from)
4230 {
4231 memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
4232 }
4233
4234 void
4235 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4236 {
4237 VERIFY(m->m_flags & M_PKTHDR);
4238
4239 m->m_pkthdr.pkt_proto = 0;
4240 m->m_pkthdr.pkt_flowsrc = 0;
4241 m->m_pkthdr.pkt_flowid = 0;
4242 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
4243 /* preserve service class and interface info for loopback packets */
4244 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
4245 (void) m_set_service_class(m, MBUF_SC_BE);
4246 }
4247 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4248 m->m_pkthdr.pkt_ifainfo = 0;
4249 }
4250 /*
4251 * Preserve timestamp if requested
4252 */
4253 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
4254 m->m_pkthdr.pkt_timestamp = 0;
4255 }
4256 }
4257
4258 void
4259 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4260 {
4261 VERIFY(to->m_flags & M_PKTHDR);
4262 VERIFY(from->m_flags & M_PKTHDR);
4263
4264 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4265 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4266 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4267 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4268 to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
4269 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4270 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
4271 }
4272
4273 /*
4274 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4275 * if wantall is not set, return whatever number were available. Set up the
4276 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4277 * are chained on the m_nextpkt field. Any packets requested beyond this
4278 * are chained onto the last packet header's m_next field. The size of
4279 * the cluster is controlled by the parameter bufsize.
4280 */
4281 __private_extern__ struct mbuf *
4282 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4283 int wait, int wantall, size_t bufsize)
4284 {
4285 struct mbuf *m;
4286 struct mbuf **np, *top;
4287 unsigned int pnum, needed = *num_needed;
4288 mcache_obj_t *mp_list = NULL;
4289 int mcflags = MSLEEPF(wait);
4290 u_int16_t flag;
4291 struct ext_ref *rfa;
4292 mcache_t *cp;
4293 void *cl;
4294
4295 ASSERT(bufsize == m_maxsize(MC_CL) ||
4296 bufsize == m_maxsize(MC_BIGCL) ||
4297 bufsize == m_maxsize(MC_16KCL));
4298
4299 /*
4300 * Caller must first check for njcl because this
4301 * routine is internal and not exposed/used via KPI.
4302 */
4303 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4304
4305 top = NULL;
4306 np = ⊤
4307 pnum = 0;
4308
4309 /*
4310 * The caller doesn't want all the requested buffers; only some.
4311 * Try hard to get what we can, but don't block. This effectively
4312 * overrides MCR_SLEEP, since this thread will not go to sleep
4313 * if we can't get all the buffers.
4314 */
4315 if (!wantall || (mcflags & MCR_NOSLEEP)) {
4316 mcflags |= MCR_TRYHARD;
4317 }
4318
4319 /* Allocate the composite mbuf + cluster elements from the cache */
4320 if (bufsize == m_maxsize(MC_CL)) {
4321 cp = m_cache(MC_MBUF_CL);
4322 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4323 cp = m_cache(MC_MBUF_BIGCL);
4324 } else {
4325 cp = m_cache(MC_MBUF_16KCL);
4326 }
4327 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4328
4329 for (pnum = 0; pnum < needed; pnum++) {
4330 m = (struct mbuf *)mp_list;
4331 mp_list = mp_list->obj_next;
4332
4333 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4334 cl = m->m_ext.ext_buf;
4335 rfa = m_get_rfa(m);
4336
4337 ASSERT(cl != NULL && rfa != NULL);
4338 VERIFY(MBUF_IS_COMPOSITE(m));
4339
4340 flag = MEXT_FLAGS(m);
4341
4342 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4343 if (bufsize == m_maxsize(MC_16KCL)) {
4344 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4345 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4346 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4347 } else {
4348 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4349 }
4350
4351 if (num_with_pkthdrs > 0) {
4352 --num_with_pkthdrs;
4353 }
4354
4355 *np = m;
4356 if (num_with_pkthdrs > 0) {
4357 np = &m->m_nextpkt;
4358 } else {
4359 np = &m->m_next;
4360 }
4361 }
4362 ASSERT(pnum != *num_needed || mp_list == NULL);
4363 if (mp_list != NULL) {
4364 mcache_free_ext(cp, mp_list);
4365 }
4366
4367 if (pnum > 0) {
4368 mtype_stat_add(MT_DATA, pnum);
4369 mtype_stat_sub(MT_FREE, pnum);
4370 }
4371
4372 if (wantall && (pnum != *num_needed)) {
4373 if (top != NULL) {
4374 m_freem_list(top);
4375 }
4376 return NULL;
4377 }
4378
4379 if (pnum > *num_needed) {
4380 printf("%s: File a radar related to <rdar://10146739>. \
4381 needed = %u, pnum = %u, num_needed = %u \n",
4382 __func__, needed, pnum, *num_needed);
4383 }
4384
4385 *num_needed = pnum;
4386 return top;
4387 }
4388
4389 /*
4390 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
4391 * wantall is not set, return whatever number were available. The size of
4392 * each mbuf in the list is controlled by the parameter packetlen. Each
4393 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
4394 * in the chain is called a segment. If maxsegments is not null and the
4395 * value pointed to is not null, this specify the maximum number of segments
4396 * for a chain of mbufs. If maxsegments is zero or the value pointed to
4397 * is zero the caller does not have any restriction on the number of segments.
4398 * The actual number of segments of a mbuf chain is return in the value
4399 * pointed to by maxsegments.
4400 */
4401 __private_extern__ struct mbuf *
4402 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4403 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4404 {
4405 struct mbuf **np, *top, *first = NULL;
4406 size_t bufsize, r_bufsize;
4407 unsigned int num = 0;
4408 unsigned int nsegs = 0;
4409 unsigned int needed, resid;
4410 int mcflags = MSLEEPF(wait);
4411 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4412 mcache_t *cp = NULL, *rcp = NULL;
4413
4414 if (*numlist == 0) {
4415 return NULL;
4416 }
4417
4418 top = NULL;
4419 np = ⊤
4420
4421 if (wantsize == 0) {
4422 if (packetlen <= MINCLSIZE) {
4423 bufsize = packetlen;
4424 } else if (packetlen > m_maxsize(MC_CL)) {
4425 /* Use 4KB if jumbo cluster pool isn't available */
4426 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
4427 bufsize = m_maxsize(MC_BIGCL);
4428 } else {
4429 bufsize = m_maxsize(MC_16KCL);
4430 }
4431 } else {
4432 bufsize = m_maxsize(MC_CL);
4433 }
4434 } else if (wantsize == m_maxsize(MC_CL) ||
4435 wantsize == m_maxsize(MC_BIGCL) ||
4436 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4437 bufsize = wantsize;
4438 } else {
4439 *numlist = 0;
4440 return NULL;
4441 }
4442
4443 if (bufsize <= MHLEN) {
4444 nsegs = 1;
4445 } else if (bufsize <= MINCLSIZE) {
4446 if (maxsegments != NULL && *maxsegments == 1) {
4447 bufsize = m_maxsize(MC_CL);
4448 nsegs = 1;
4449 } else {
4450 nsegs = 2;
4451 }
4452 } else if (bufsize == m_maxsize(MC_16KCL)) {
4453 VERIFY(njcl > 0);
4454 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4455 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4456 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4457 } else {
4458 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4459 }
4460 if (maxsegments != NULL) {
4461 if (*maxsegments && nsegs > *maxsegments) {
4462 *maxsegments = nsegs;
4463 *numlist = 0;
4464 return NULL;
4465 }
4466 *maxsegments = nsegs;
4467 }
4468
4469 /*
4470 * The caller doesn't want all the requested buffers; only some.
4471 * Try hard to get what we can, but don't block. This effectively
4472 * overrides MCR_SLEEP, since this thread will not go to sleep
4473 * if we can't get all the buffers.
4474 */
4475 if (!wantall || (mcflags & MCR_NOSLEEP)) {
4476 mcflags |= MCR_TRYHARD;
4477 }
4478
4479 /*
4480 * Simple case where all elements in the lists/chains are mbufs.
4481 * Unless bufsize is greater than MHLEN, each segment chain is made
4482 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
4483 * of 2 mbufs; the second one is used for the residual data, i.e.
4484 * the remaining data that cannot fit into the first mbuf.
4485 */
4486 if (bufsize <= MINCLSIZE) {
4487 /* Allocate the elements in one shot from the mbuf cache */
4488 ASSERT(bufsize <= MHLEN || nsegs == 2);
4489 cp = m_cache(MC_MBUF);
4490 needed = mcache_alloc_ext(cp, &mp_list,
4491 (*numlist) * nsegs, mcflags);
4492
4493 /*
4494 * The number of elements must be even if we are to use an
4495 * mbuf (instead of a cluster) to store the residual data.
4496 * If we couldn't allocate the requested number of mbufs,
4497 * trim the number down (if it's odd) in order to avoid
4498 * creating a partial segment chain.
4499 */
4500 if (bufsize > MHLEN && (needed & 0x1)) {
4501 needed--;
4502 }
4503
4504 while (num < needed) {
4505 struct mbuf *m;
4506
4507 m = (struct mbuf *)mp_list;
4508 mp_list = mp_list->obj_next;
4509 ASSERT(m != NULL);
4510
4511 MBUF_INIT(m, 1, MT_DATA);
4512 num++;
4513 if (bufsize > MHLEN) {
4514 /* A second mbuf for this segment chain */
4515 m->m_next = (struct mbuf *)mp_list;
4516 mp_list = mp_list->obj_next;
4517 ASSERT(m->m_next != NULL);
4518
4519 MBUF_INIT(m->m_next, 0, MT_DATA);
4520 num++;
4521 }
4522 *np = m;
4523 np = &m->m_nextpkt;
4524 }
4525 ASSERT(num != *numlist || mp_list == NULL);
4526
4527 if (num > 0) {
4528 mtype_stat_add(MT_DATA, num);
4529 mtype_stat_sub(MT_FREE, num);
4530 }
4531 num /= nsegs;
4532
4533 /* We've got them all; return to caller */
4534 if (num == *numlist) {
4535 return top;
4536 }
4537
4538 goto fail;
4539 }
4540
4541 /*
4542 * Complex cases where elements are made up of one or more composite
4543 * mbufs + cluster, depending on packetlen. Each N-segment chain can
4544 * be illustrated as follows:
4545 *
4546 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4547 *
4548 * Every composite mbuf + cluster element comes from the intermediate
4549 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4550 * the last composite element will come from the MC_MBUF_CL cache,
4551 * unless the residual data is larger than 2KB where we use the
4552 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4553 * data is defined as extra data beyond the first element that cannot
4554 * fit into the previous element, i.e. there is no residual data if
4555 * the chain only has 1 segment.
4556 */
4557 r_bufsize = bufsize;
4558 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4559 if (resid > 0) {
4560 /* There is residual data; figure out the cluster size */
4561 if (wantsize == 0 && packetlen > MINCLSIZE) {
4562 /*
4563 * Caller didn't request that all of the segments
4564 * in the chain use the same cluster size; use the
4565 * smaller of the cluster sizes.
4566 */
4567 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
4568 r_bufsize = m_maxsize(MC_16KCL);
4569 } else if (resid > m_maxsize(MC_CL)) {
4570 r_bufsize = m_maxsize(MC_BIGCL);
4571 } else {
4572 r_bufsize = m_maxsize(MC_CL);
4573 }
4574 } else {
4575 /* Use the same cluster size as the other segments */
4576 resid = 0;
4577 }
4578 }
4579
4580 needed = *numlist;
4581 if (resid > 0) {
4582 /*
4583 * Attempt to allocate composite mbuf + cluster elements for
4584 * the residual data in each chain; record the number of such
4585 * elements that can be allocated so that we know how many
4586 * segment chains we can afford to create.
4587 */
4588 if (r_bufsize <= m_maxsize(MC_CL)) {
4589 rcp = m_cache(MC_MBUF_CL);
4590 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
4591 rcp = m_cache(MC_MBUF_BIGCL);
4592 } else {
4593 rcp = m_cache(MC_MBUF_16KCL);
4594 }
4595 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4596
4597 if (needed == 0) {
4598 goto fail;
4599 }
4600
4601 /* This is temporarily reduced for calculation */
4602 ASSERT(nsegs > 1);
4603 nsegs--;
4604 }
4605
4606 /*
4607 * Attempt to allocate the rest of the composite mbuf + cluster
4608 * elements for the number of segment chains that we need.
4609 */
4610 if (bufsize <= m_maxsize(MC_CL)) {
4611 cp = m_cache(MC_MBUF_CL);
4612 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
4613 cp = m_cache(MC_MBUF_BIGCL);
4614 } else {
4615 cp = m_cache(MC_MBUF_16KCL);
4616 }
4617 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4618
4619 /* Round it down to avoid creating a partial segment chain */
4620 needed = (needed / nsegs) * nsegs;
4621 if (needed == 0) {
4622 goto fail;
4623 }
4624
4625 if (resid > 0) {
4626 /*
4627 * We're about to construct the chain(s); take into account
4628 * the number of segments we have created above to hold the
4629 * residual data for each chain, as well as restore the
4630 * original count of segments per chain.
4631 */
4632 ASSERT(nsegs > 0);
4633 needed += needed / nsegs;
4634 nsegs++;
4635 }
4636
4637 for (;;) {
4638 struct mbuf *m;
4639 u_int16_t flag;
4640 struct ext_ref *rfa;
4641 void *cl;
4642 int pkthdr;
4643 m_ext_free_func_t m_free_func;
4644
4645 ++num;
4646 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4647 m = (struct mbuf *)mp_list;
4648 mp_list = mp_list->obj_next;
4649 } else {
4650 m = (struct mbuf *)rmp_list;
4651 rmp_list = rmp_list->obj_next;
4652 }
4653 m_free_func = m_get_ext_free(m);
4654 ASSERT(m != NULL);
4655 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4656 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4657 m_free_func == m_16kfree);
4658
4659 cl = m->m_ext.ext_buf;
4660 rfa = m_get_rfa(m);
4661
4662 ASSERT(cl != NULL && rfa != NULL);
4663 VERIFY(MBUF_IS_COMPOSITE(m));
4664
4665 flag = MEXT_FLAGS(m);
4666
4667 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4668 if (pkthdr) {
4669 first = m;
4670 }
4671 MBUF_INIT(m, pkthdr, MT_DATA);
4672 if (m_free_func == m_16kfree) {
4673 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4674 } else if (m_free_func == m_bigfree) {
4675 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4676 } else {
4677 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4678 }
4679
4680 *np = m;
4681 if ((num % nsegs) == 0) {
4682 np = &first->m_nextpkt;
4683 } else {
4684 np = &m->m_next;
4685 }
4686
4687 if (num == needed) {
4688 break;
4689 }
4690 }
4691
4692 if (num > 0) {
4693 mtype_stat_add(MT_DATA, num);
4694 mtype_stat_sub(MT_FREE, num);
4695 }
4696
4697 num /= nsegs;
4698
4699 /* We've got them all; return to caller */
4700 if (num == *numlist) {
4701 ASSERT(mp_list == NULL && rmp_list == NULL);
4702 return top;
4703 }
4704
4705 fail:
4706 /* Free up what's left of the above */
4707 if (mp_list != NULL) {
4708 mcache_free_ext(cp, mp_list);
4709 }
4710 if (rmp_list != NULL) {
4711 mcache_free_ext(rcp, rmp_list);
4712 }
4713 if (wantall && top != NULL) {
4714 m_freem_list(top);
4715 *numlist = 0;
4716 return NULL;
4717 }
4718 *numlist = num;
4719 return top;
4720 }
4721
4722 /*
4723 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4724 * packets on receive ring.
4725 */
4726 __private_extern__ struct mbuf *
4727 m_getpacket_how(int wait)
4728 {
4729 unsigned int num_needed = 1;
4730
4731 return m_getpackets_internal(&num_needed, 1, wait, 1,
4732 m_maxsize(MC_CL));
4733 }
4734
4735 /*
4736 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4737 * packets on receive ring.
4738 */
4739 struct mbuf *
4740 m_getpacket(void)
4741 {
4742 unsigned int num_needed = 1;
4743
4744 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4745 m_maxsize(MC_CL));
4746 }
4747
4748 /*
4749 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4750 * if this can't be met, return whatever number were available. Set up the
4751 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4752 * are chained on the m_nextpkt field. Any packets requested beyond this are
4753 * chained onto the last packet header's m_next field.
4754 */
4755 struct mbuf *
4756 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4757 {
4758 unsigned int n = num_needed;
4759
4760 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4761 m_maxsize(MC_CL));
4762 }
4763
4764 /*
4765 * Return a list of mbuf hdrs set up as packet hdrs chained together
4766 * on the m_nextpkt field
4767 */
4768 struct mbuf *
4769 m_getpackethdrs(int num_needed, int how)
4770 {
4771 struct mbuf *m;
4772 struct mbuf **np, *top;
4773
4774 top = NULL;
4775 np = ⊤
4776
4777 while (num_needed--) {
4778 m = _M_RETRYHDR(how, MT_DATA);
4779 if (m == NULL) {
4780 break;
4781 }
4782
4783 *np = m;
4784 np = &m->m_nextpkt;
4785 }
4786
4787 return top;
4788 }
4789
4790 /*
4791 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4792 * for mbufs packets freed. Used by the drivers.
4793 */
4794 int
4795 m_freem_list(struct mbuf *m)
4796 {
4797 struct mbuf *nextpkt;
4798 mcache_obj_t *mp_list = NULL;
4799 mcache_obj_t *mcl_list = NULL;
4800 mcache_obj_t *mbc_list = NULL;
4801 mcache_obj_t *m16k_list = NULL;
4802 mcache_obj_t *m_mcl_list = NULL;
4803 mcache_obj_t *m_mbc_list = NULL;
4804 mcache_obj_t *m_m16k_list = NULL;
4805 mcache_obj_t *ref_list = NULL;
4806 int pktcount = 0;
4807 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4808
4809 while (m != NULL) {
4810 pktcount++;
4811
4812 nextpkt = m->m_nextpkt;
4813 m->m_nextpkt = NULL;
4814
4815 while (m != NULL) {
4816 struct mbuf *next = m->m_next;
4817 mcache_obj_t *o, *rfa;
4818 u_int32_t composite;
4819 u_int16_t refcnt;
4820 m_ext_free_func_t m_free_func;
4821
4822 if (m->m_type == MT_FREE) {
4823 panic("m_free: freeing an already freed mbuf");
4824 }
4825
4826 if (m->m_flags & M_PKTHDR) {
4827 /* Check for scratch area overflow */
4828 m_redzone_verify(m);
4829 /* Free the aux data and tags if there is any */
4830 m_tag_delete_chain(m, NULL);
4831 m_do_tx_compl_callback(m, NULL);
4832 }
4833
4834 if (!(m->m_flags & M_EXT)) {
4835 mt_free++;
4836 goto simple_free;
4837 }
4838
4839 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4840 m = next;
4841 continue;
4842 }
4843
4844 mt_free++;
4845
4846 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4847 refcnt = m_decref(m);
4848 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4849 m_free_func = m_get_ext_free(m);
4850 if (refcnt == MEXT_MINREF(m) && !composite) {
4851 if (m_free_func == NULL) {
4852 o->obj_next = mcl_list;
4853 mcl_list = o;
4854 } else if (m_free_func == m_bigfree) {
4855 o->obj_next = mbc_list;
4856 mbc_list = o;
4857 } else if (m_free_func == m_16kfree) {
4858 o->obj_next = m16k_list;
4859 m16k_list = o;
4860 } else {
4861 (*(m_free_func))((caddr_t)o,
4862 m->m_ext.ext_size,
4863 m_get_ext_arg(m));
4864 }
4865 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4866 rfa->obj_next = ref_list;
4867 ref_list = rfa;
4868 m_set_ext(m, NULL, NULL, NULL);
4869 } else if (refcnt == MEXT_MINREF(m) && composite) {
4870 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4871 VERIFY(m->m_type != MT_FREE);
4872 /*
4873 * Amortize the costs of atomic operations
4874 * by doing them at the end, if possible.
4875 */
4876 if (m->m_type == MT_DATA) {
4877 mt_data++;
4878 } else if (m->m_type == MT_HEADER) {
4879 mt_header++;
4880 } else if (m->m_type == MT_SONAME) {
4881 mt_soname++;
4882 } else if (m->m_type == MT_TAG) {
4883 mt_tag++;
4884 } else {
4885 mtype_stat_dec(m->m_type);
4886 }
4887
4888 m->m_type = MT_FREE;
4889 m->m_flags = M_EXT;
4890 m->m_len = 0;
4891 m->m_next = m->m_nextpkt = NULL;
4892
4893 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4894
4895 /* "Free" into the intermediate cache */
4896 o = (mcache_obj_t *)m;
4897 if (m_free_func == NULL) {
4898 o->obj_next = m_mcl_list;
4899 m_mcl_list = o;
4900 } else if (m_free_func == m_bigfree) {
4901 o->obj_next = m_mbc_list;
4902 m_mbc_list = o;
4903 } else {
4904 VERIFY(m_free_func == m_16kfree);
4905 o->obj_next = m_m16k_list;
4906 m_m16k_list = o;
4907 }
4908 m = next;
4909 continue;
4910 }
4911 simple_free:
4912 /*
4913 * Amortize the costs of atomic operations
4914 * by doing them at the end, if possible.
4915 */
4916 if (m->m_type == MT_DATA) {
4917 mt_data++;
4918 } else if (m->m_type == MT_HEADER) {
4919 mt_header++;
4920 } else if (m->m_type == MT_SONAME) {
4921 mt_soname++;
4922 } else if (m->m_type == MT_TAG) {
4923 mt_tag++;
4924 } else if (m->m_type != MT_FREE) {
4925 mtype_stat_dec(m->m_type);
4926 }
4927
4928 m->m_type = MT_FREE;
4929 m->m_flags = m->m_len = 0;
4930 m->m_next = m->m_nextpkt = NULL;
4931
4932 ((mcache_obj_t *)m)->obj_next = mp_list;
4933 mp_list = (mcache_obj_t *)m;
4934
4935 m = next;
4936 }
4937
4938 m = nextpkt;
4939 }
4940
4941 if (mt_free > 0) {
4942 mtype_stat_add(MT_FREE, mt_free);
4943 }
4944 if (mt_data > 0) {
4945 mtype_stat_sub(MT_DATA, mt_data);
4946 }
4947 if (mt_header > 0) {
4948 mtype_stat_sub(MT_HEADER, mt_header);
4949 }
4950 if (mt_soname > 0) {
4951 mtype_stat_sub(MT_SONAME, mt_soname);
4952 }
4953 if (mt_tag > 0) {
4954 mtype_stat_sub(MT_TAG, mt_tag);
4955 }
4956
4957 if (mp_list != NULL) {
4958 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4959 }
4960 if (mcl_list != NULL) {
4961 mcache_free_ext(m_cache(MC_CL), mcl_list);
4962 }
4963 if (mbc_list != NULL) {
4964 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4965 }
4966 if (m16k_list != NULL) {
4967 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4968 }
4969 if (m_mcl_list != NULL) {
4970 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4971 }
4972 if (m_mbc_list != NULL) {
4973 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4974 }
4975 if (m_m16k_list != NULL) {
4976 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4977 }
4978 if (ref_list != NULL) {
4979 mcache_free_ext(ref_cache, ref_list);
4980 }
4981
4982 return pktcount;
4983 }
4984
/*
 * Free an mbuf chain: repeatedly call m_free(), continuing with the mbuf
 * it returns, until it returns NULL.  A NULL argument is a no-op.
 */
void
m_freem(struct mbuf *m)
{
	struct mbuf *cur;

	for (cur = m; cur != NULL; cur = m_free(cur)) {
		continue;
	}
}
4992
4993 /*
4994 * Mbuffer utility routines.
4995 */
4996 /*
4997 * Set the m_data pointer of a newly allocated mbuf to place an object of the
4998 * specified size at the end of the mbuf, longword aligned.
4999 *
5000 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
5001 * separate macros, each asserting that it was called at the proper moment.
5002 * This required callers to themselves test the storage type and call the
5003 * right one. Rather than require callers to be aware of those layout
5004 * decisions, we centralize here.
5005 */
5006 void
5007 m_align(struct mbuf *m, int len)
5008 {
5009 int adjust = 0;
5010
5011 /* At this point data must point to start */
5012 VERIFY(m->m_data == M_START(m));
5013 VERIFY(len >= 0);
5014 VERIFY(len <= M_SIZE(m));
5015 adjust = M_SIZE(m) - len;
5016 m->m_data += adjust & ~(sizeof(long) - 1);
5017 }
5018
5019 /*
5020 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
5021 * copy junk along. Does not adjust packet header length.
5022 */
5023 struct mbuf *
5024 m_prepend(struct mbuf *m, int len, int how)
5025 {
5026 struct mbuf *mn;
5027
5028 _MGET(mn, how, m->m_type);
5029 if (mn == NULL) {
5030 m_freem(m);
5031 return NULL;
5032 }
5033 if (m->m_flags & M_PKTHDR) {
5034 M_COPY_PKTHDR(mn, m);
5035 m->m_flags &= ~M_PKTHDR;
5036 }
5037 mn->m_next = m;
5038 m = mn;
5039 if (m->m_flags & M_PKTHDR) {
5040 VERIFY(len <= MHLEN);
5041 MH_ALIGN(m, len);
5042 } else {
5043 VERIFY(len <= MLEN);
5044 M_ALIGN(m, len);
5045 }
5046 m->m_len = len;
5047 return m;
5048 }
5049
5050 /*
5051 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
5052 * chain, copy junk along, and adjust length.
5053 */
5054 struct mbuf *
5055 m_prepend_2(struct mbuf *m, int len, int how, int align)
5056 {
5057 if (M_LEADINGSPACE(m) >= len &&
5058 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
5059 m->m_data -= len;
5060 m->m_len += len;
5061 } else {
5062 m = m_prepend(m, len, how);
5063 }
5064 if ((m) && (m->m_flags & M_PKTHDR)) {
5065 m->m_pkthdr.len += len;
5066 }
5067 return m;
5068 }
5069
5070 /*
5071 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
5072 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
5073 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
5074 */
5075 int MCFail;
5076
5077 struct mbuf *
5078 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
5079 {
5080 struct mbuf *n, *mhdr = NULL, **np;
5081 int off = off0;
5082 struct mbuf *top;
5083 int copyhdr = 0;
5084
5085 if (off < 0 || len < 0) {
5086 panic("m_copym: invalid offset %d or len %d", off, len);
5087 }
5088
5089 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
5090 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
5091
5092 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
5093 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
5094 mhdr = m;
5095 copyhdr = 1;
5096 }
5097
5098 while (off >= m->m_len) {
5099 if (m->m_next == NULL) {
5100 panic("m_copym: invalid mbuf chain");
5101 }
5102 off -= m->m_len;
5103 m = m->m_next;
5104 }
5105 np = ⊤
5106 top = NULL;
5107
5108 while (len > 0) {
5109 if (m == NULL) {
5110 if (len != M_COPYALL) {
5111 panic("m_copym: len != M_COPYALL");
5112 }
5113 break;
5114 }
5115
5116 if (copyhdr) {
5117 n = _M_RETRYHDR(wait, m->m_type);
5118 } else {
5119 n = _M_RETRY(wait, m->m_type);
5120 }
5121 *np = n;
5122
5123 if (n == NULL) {
5124 goto nospace;
5125 }
5126
5127 if (copyhdr != 0) {
5128 if ((mode == M_COPYM_MOVE_HDR) ||
5129 (mode == M_COPYM_MUST_MOVE_HDR)) {
5130 M_COPY_PKTHDR(n, mhdr);
5131 } else if ((mode == M_COPYM_COPY_HDR) ||
5132 (mode == M_COPYM_MUST_COPY_HDR)) {
5133 if (m_dup_pkthdr(n, mhdr, wait) == 0) {
5134 goto nospace;
5135 }
5136 }
5137 if (len == M_COPYALL) {
5138 n->m_pkthdr.len -= off0;
5139 } else {
5140 n->m_pkthdr.len = len;
5141 }
5142 copyhdr = 0;
5143 /*
5144 * There is data to copy from the packet header mbuf
5145 * if it is empty or it is before the starting offset
5146 */
5147 if (mhdr != m) {
5148 np = &n->m_next;
5149 continue;
5150 }
5151 }
5152 n->m_len = MIN(len, (m->m_len - off));
5153 if (m->m_flags & M_EXT) {
5154 n->m_ext = m->m_ext;
5155 m_incref(m);
5156 n->m_data = m->m_data + off;
5157 n->m_flags |= M_EXT;
5158 } else {
5159 /*
5160 * Limit to the capacity of the destination
5161 */
5162 if (n->m_flags & M_PKTHDR) {
5163 n->m_len = MIN(n->m_len, MHLEN);
5164 } else {
5165 n->m_len = MIN(n->m_len, MLEN);
5166 }
5167
5168 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
5169 panic("%s n %p copy overflow",
5170 __func__, n);
5171 }
5172
5173 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
5174 (unsigned)n->m_len);
5175 }
5176 if (len != M_COPYALL) {
5177 len -= n->m_len;
5178 }
5179 off = 0;
5180 m = m->m_next;
5181 np = &n->m_next;
5182 }
5183
5184 if (top == NULL) {
5185 MCFail++;
5186 }
5187
5188 return top;
5189 nospace:
5190
5191 m_freem(top);
5192 MCFail++;
5193 return NULL;
5194 }
5195
5196
5197 struct mbuf *
5198 m_copym(struct mbuf *m, int off0, int len, int wait)
5199 {
5200 return m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR);
5201 }
5202
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine in a single batch.  In addition, the last mbuf and
 * offset accessed are passed out through m_lastm/m_off and can be passed
 * back in on the next call to avoid having to rescan the entire mbuf list
 * (normally hung off of the socket).
 *
 * m0      source chain
 * off0    byte offset into the source at which copying starts
 * len0    number of bytes to copy
 * wait    M_WAIT/M_DONTWAIT style flag, translated with MSLEEPF()
 * m_lastm in: if non-NULL and *m_lastm is non-NULL, resume from that mbuf
 *         (and from *m_off) instead of walking from m0;
 *         out: mbuf at which the next copy would start
 * m_off   in/out offset companion of m_lastm
 * mode    packet header handling, as in m_copym_mode()
 *
 * Returns the new chain, or NULL (after incrementing MCFail) when the
 * mbufs cannot be allocated or duplicating the packet header fails.
 *
 * NOTE(review): when *m_lastm is non-NULL, m_off is dereferenced without
 * a NULL check; callers presumably always pass both -- confirm.
 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
	int mcflags = MSLEEPF(wait);
	int copyhdr = 0;
	int type = 0;
	mcache_obj_t *list = NULL;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	if (m_lastm != NULL && *m_lastm != NULL) {
		/* Resume from the position saved by a previous call */
		m = *m_lastm;
		off = *m_off;
	} else {
		/* Walk forward until "off" lands inside the current mbuf */
		while (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		}
	}

	/*
	 * First pass: count how many source mbufs the requested range
	 * spans.  Only the first one is entered at a non-zero offset.
	 */
	n = m;
	while (len > 0) {
		needed++;
		ASSERT(n != NULL);
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	/* One extra mbuf: the leading one that becomes "top" below */
	needed++;
	len = len0;

	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	/* All-or-nothing batch allocation of the mbufs counted above */
	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}

	/* From here on "needed" counts the data mbufs actually consumed */
	needed = 0;
	while (len > 0) {
		/* Pop the next preallocated mbuf off the list */
		n = (struct mbuf *)list;
		list = list->obj_next;
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);

		if (top == NULL) {
			/*
			 * The first mbuf taken becomes the head of the new
			 * chain; no data is placed in it on this iteration.
			 */
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the external storage rather than copying it */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Done: report where the next copy would start */
			if (m_lastm != NULL && m_off != NULL) {
				if ((off + n->m_len) == m->m_len) {
					/* Source mbuf fully consumed */
					*m_lastm = m->m_next;
					*m_off = 0;
				} else {
					*m_lastm = m;
					*m_off = off + n->m_len;
				}
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	/* Account for one MT_HEADER mbuf plus "needed" mbufs of "type" */
	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(list == NULL);
	return top;

nospace:
	/* Return any mbufs still on the preallocated list to the cache */
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
	if (top != NULL) {
		m_freem(top);
	}
	MCFail++;
	return NULL;
}
5343
5344 /*
5345 * Copy data from an mbuf chain starting "off" bytes from the beginning,
5346 * continuing for "len" bytes, into the indicated buffer.
5347 */
5348 void
5349 m_copydata(struct mbuf *m, int off, int len, void *vp)
5350 {
5351 int off0 = off, len0 = len;
5352 struct mbuf *m0 = m;
5353 unsigned count;
5354 char *cp = vp;
5355
5356 if (__improbable(off < 0 || len < 0)) {
5357 panic("%s: invalid offset %d or len %d", __func__, off, len);
5358 /* NOTREACHED */
5359 }
5360
5361 while (off > 0) {
5362 if (__improbable(m == NULL)) {
5363 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5364 __func__, m0, off0, len0);
5365 /* NOTREACHED */
5366 }
5367 if (off < m->m_len) {
5368 break;
5369 }
5370 off -= m->m_len;
5371 m = m->m_next;
5372 }
5373 while (len > 0) {
5374 if (__improbable(m == NULL)) {
5375 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5376 __func__, m0, off0, len0);
5377 /* NOTREACHED */
5378 }
5379 count = MIN(m->m_len - off, len);
5380 bcopy(MTOD(m, caddr_t) + off, cp, count);
5381 len -= count;
5382 cp += count;
5383 off = 0;
5384 m = m->m_next;
5385 }
5386 }
5387
5388 /*
5389 * Concatenate mbuf chain n to m. Both chains must be of the same type
5390 * (e.g. MT_DATA). Any m_pkthdr is not updated.
5391 */
5392 void
5393 m_cat(struct mbuf *m, struct mbuf *n)
5394 {
5395 while (m->m_next) {
5396 m = m->m_next;
5397 }
5398 while (n) {
5399 if ((m->m_flags & M_EXT) ||
5400 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5401 /* just join the two chains */
5402 m->m_next = n;
5403 return;
5404 }
5405 /* splat the data from one into the other */
5406 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5407 (u_int)n->m_len);
5408 m->m_len += n->m_len;
5409 n = m_free(n);
5410 }
5411 }
5412
5413 void
5414 m_adj(struct mbuf *mp, int req_len)
5415 {
5416 int len = req_len;
5417 struct mbuf *m;
5418 int count;
5419
5420 if ((m = mp) == NULL) {
5421 return;
5422 }
5423 if (len >= 0) {
5424 /*
5425 * Trim from head.
5426 */
5427 while (m != NULL && len > 0) {
5428 if (m->m_len <= len) {
5429 len -= m->m_len;
5430 m->m_len = 0;
5431 m = m->m_next;
5432 } else {
5433 m->m_len -= len;
5434 m->m_data += len;
5435 len = 0;
5436 }
5437 }
5438 m = mp;
5439 if (m->m_flags & M_PKTHDR) {
5440 m->m_pkthdr.len -= (req_len - len);
5441 }
5442 } else {
5443 /*
5444 * Trim from tail. Scan the mbuf chain,
5445 * calculating its length and finding the last mbuf.
5446 * If the adjustment only affects this mbuf, then just
5447 * adjust and return. Otherwise, rescan and truncate
5448 * after the remaining size.
5449 */
5450 len = -len;
5451 count = 0;
5452 for (;;) {
5453 count += m->m_len;
5454 if (m->m_next == (struct mbuf *)0) {
5455 break;
5456 }
5457 m = m->m_next;
5458 }
5459 if (m->m_len >= len) {
5460 m->m_len -= len;
5461 m = mp;
5462 if (m->m_flags & M_PKTHDR) {
5463 m->m_pkthdr.len -= len;
5464 }
5465 return;
5466 }
5467 count -= len;
5468 if (count < 0) {
5469 count = 0;
5470 }
5471 /*
5472 * Correct length for chain is "count".
5473 * Find the mbuf with last data, adjust its length,
5474 * and toss data from remaining mbufs on chain.
5475 */
5476 m = mp;
5477 if (m->m_flags & M_PKTHDR) {
5478 m->m_pkthdr.len = count;
5479 }
5480 for (; m; m = m->m_next) {
5481 if (m->m_len >= count) {
5482 m->m_len = count;
5483 break;
5484 }
5485 count -= m->m_len;
5486 }
5487 while ((m = m->m_next)) {
5488 m->m_len = 0;
5489 }
5490 }
5491 }
5492
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 *
 * Failure cases visible here: negative len, len larger than MLEN, a
 * first mbuf whose m_data lies at or beyond the end of its internal
 * buffer, len larger than MHLEN when a new mbuf has to be allocated,
 * allocation failure, and a chain holding fewer than len bytes.  Every
 * failure increments MPFail.  A NULL chain panics.
 */
int MPFail;

struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/* check invalid arguments */
	if (n == NULL) {
		panic("%s: n == NULL", __func__);
	}
	if (len < 0) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
		    __func__, len);
		goto bad;
	}
	if (len > MLEN) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
		    __func__, len);
		goto bad;
	}
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data >= &n->m_dat[MLEN]) {
		os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
		    __func__);
		goto bad;
	}

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
		/* Already contiguous: nothing to do */
		if (n->m_len >= len) {
			return n;
		}
		/* Pull into the first mbuf; "len" becomes what is missing */
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN) {
			goto bad;
		}
		_MGET(m, M_DONTWAIT, n->m_type);
		if (m == 0) {
			goto bad;
		}
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			/* The packet header moves to the new head mbuf */
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	/* Bytes still free behind the data currently held by "m" */
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		/*
		 * Take at least "len" bytes, opportunistically up to
		 * max_protohdr, bounded by the free space in "m" and by
		 * what the current source mbuf holds.
		 */
		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len != 0) {
			n->m_data += count;
		} else {
			/* Source mbuf drained: free it and move on */
			n = m_free(n);
		}
	} while (len > 0 && n != NULL);
	if (len > 0) {
		/* Chain was too short; "n" is NULL here, so only "m" is left */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	MPFail++;
	return 0;
}
5584
5585 /*
5586 * Like m_pullup(), except a new mbuf is always allocated, and we allow
5587 * the amount of empty space before the data in the new mbuf to be specified
5588 * (in the event that the caller expects to prepend later).
5589 */
5590 __private_extern__ int MSFail = 0;
5591
5592 __private_extern__ struct mbuf *
5593 m_copyup(struct mbuf *n, int len, int dstoff)
5594 {
5595 struct mbuf *m;
5596 int count, space;
5597
5598 VERIFY(len >= 0 && dstoff >= 0);
5599
5600 if (len > (MHLEN - dstoff)) {
5601 goto bad;
5602 }
5603 MGET(m, M_DONTWAIT, n->m_type);
5604 if (m == NULL) {
5605 goto bad;
5606 }
5607 m->m_len = 0;
5608 if (n->m_flags & M_PKTHDR) {
5609 m_copy_pkthdr(m, n);
5610 n->m_flags &= ~M_PKTHDR;
5611 }
5612 m->m_data += dstoff;
5613 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5614 do {
5615 count = min(min(max(len, max_protohdr), space), n->m_len);
5616 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5617 (unsigned)count);
5618 len -= count;
5619 m->m_len += count;
5620 n->m_len -= count;
5621 space -= count;
5622 if (n->m_len) {
5623 n->m_data += count;
5624 } else {
5625 n = m_free(n);
5626 }
5627 } while (len > 0 && n);
5628 if (len > 0) {
5629 (void) m_free(m);
5630 goto bad;
5631 }
5632 m->m_next = n;
5633 return m;
5634 bad:
5635 m_freem(n);
5636 MSFail++;
5637 return NULL;
5638 }
5639
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * This is m_split0() with packet header handling enabled.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	const int copyhdr = 1;

	return m_split0(m0, len0, wait, copyhdr);
}
5650
/*
 * Worker for m_split(): cut the chain m0 after its first len0 bytes and
 * return the tail, or NULL when len0 lies beyond the chain or an mbuf
 * cannot be allocated.
 *
 * When "copyhdr" is non-zero and m0 carries a packet header, the tail
 * gets a packet header mbuf of its own (rcvif copied from m0, length set
 * to the remainder) and m0's packet length is reduced to len0.  When
 * "copyhdr" is zero no packet header is created or adjusted.
 *
 * If the mbuf being cut has external storage, the tail shares it: m_ext
 * is copied, m_incref() is called and m_data points at the split offset.
 * Otherwise the trailing bytes are copied into the new mbuf.
 */
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		/* The split falls on an mbuf boundary: n is a header-only mbuf */
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			/*
			 * The tail does not fit in a header mbuf: keep n as
			 * the header and split m itself recursively.
			 * NOTE(review): m0->m_pkthdr.len was already set to
			 * len0 above and is not restored on this failure path.
			 */
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* No header work and a boundary split: just unlink the tail */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the external storage; n starts at the split offset */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
5754
5755 /*
5756 * Routine to copy from device local memory into mbufs.
5757 */
5758 struct mbuf *
5759 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5760 void (*copy)(const void *, void *, size_t))
5761 {
5762 struct mbuf *m;
5763 struct mbuf *top = NULL, **mp = ⊤
5764 int off = off0, len;
5765 char *cp;
5766 char *epkt;
5767
5768 cp = buf;
5769 epkt = cp + totlen;
5770 if (off) {
5771 /*
5772 * If 'off' is non-zero, packet is trailer-encapsulated,
5773 * so we have to skip the type and length fields.
5774 */
5775 cp += off + 2 * sizeof(u_int16_t);
5776 totlen -= 2 * sizeof(u_int16_t);
5777 }
5778 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5779 if (m == NULL) {
5780 return NULL;
5781 }
5782 m->m_pkthdr.rcvif = ifp;
5783 m->m_pkthdr.len = totlen;
5784 m->m_len = MHLEN;
5785
5786 while (totlen > 0) {
5787 if (top != NULL) {
5788 _MGET(m, M_DONTWAIT, MT_DATA);
5789 if (m == NULL) {
5790 m_freem(top);
5791 return NULL;
5792 }
5793 m->m_len = MLEN;
5794 }
5795 len = MIN(totlen, epkt - cp);
5796 if (len >= MINCLSIZE) {
5797 MCLGET(m, M_DONTWAIT);
5798 if (m->m_flags & M_EXT) {
5799 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5800 } else {
5801 /* give up when it's out of cluster mbufs */
5802 if (top != NULL) {
5803 m_freem(top);
5804 }
5805 m_freem(m);
5806 return NULL;
5807 }
5808 } else {
5809 /*
5810 * Place initial small packet/header at end of mbuf.
5811 */
5812 if (len < m->m_len) {
5813 if (top == NULL &&
5814 len + max_linkhdr <= m->m_len) {
5815 m->m_data += max_linkhdr;
5816 }
5817 m->m_len = len;
5818 } else {
5819 len = m->m_len;
5820 }
5821 }
5822 if (copy) {
5823 copy(cp, MTOD(m, caddr_t), (unsigned)len);
5824 } else {
5825 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5826 }
5827 cp += len;
5828 *mp = m;
5829 mp = &m->m_next;
5830 totlen -= len;
5831 if (cp == epkt) {
5832 cp = buf;
5833 }
5834 }
5835 return top;
5836 }
5837
/*
 * Percentage of nclusters below which the pool is grown conservatively
 * (MB_GROWTH_NORMAL); at or above it growth is aggressive.  May be
 * overridden at build time.
 */
#ifndef MBUF_GROWTH_NORMAL_THRESH
#define MBUF_GROWTH_NORMAL_THRESH 25
#endif

/*
 * Cluster freelist allocation check.
 *
 * Decides by how much the pool for buffers of size "bufsize" should be
 * grown so that "num" of them can be handed out.  "bufsize" must be the
 * maximum size of either the MC_BIGCL or the MC_16KCL class, and
 * mbuf_mlock must be held by the caller.
 *
 * Returns 0 when no growth is needed or the pool has hit its ceiling,
 * otherwise a growth count.
 * NOTE(review): the count is presumably in units of bufsize-sized
 * buffers -- confirm against the callers.
 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/* Share of the pool in use, and share of the map consumed */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	return i;
}
5966 /*
5967 * Return the number of bytes in the mbuf chain, m.
5968 */
5969 unsigned int
5970 m_length(struct mbuf *m)
5971 {
5972 struct mbuf *m0;
5973 unsigned int pktlen;
5974
5975 if (m->m_flags & M_PKTHDR) {
5976 return m->m_pkthdr.len;
5977 }
5978
5979 pktlen = 0;
5980 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
5981 pktlen += m0->m_len;
5982 }
5983 return pktlen;
5984 }
5985
5986 /*
5987 * Copy data from a buffer back into the indicated mbuf chain,
5988 * starting "off" bytes from the beginning, extending the mbuf
5989 * chain if necessary.
5990 */
5991 void
5992 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5993 {
5994 #if DEBUG
5995 struct mbuf *origm = m0;
5996 int error;
5997 #endif /* DEBUG */
5998
5999 if (m0 == NULL) {
6000 return;
6001 }
6002
6003 #if DEBUG
6004 error =
6005 #endif /* DEBUG */
6006 m_copyback0(&m0, off, len, cp,
6007 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
6008
6009 #if DEBUG
6010 if (error != 0 || (m0 != NULL && origm != m0)) {
6011 panic("m_copyback");
6012 }
6013 #endif /* DEBUG */
6014 }
6015
6016 struct mbuf *
6017 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
6018 {
6019 int error;
6020
6021 /* don't support chain expansion */
6022 VERIFY(off + len <= m_length(m0));
6023
6024 error = m_copyback0(&m0, off, len, cp,
6025 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
6026 if (error) {
6027 /*
6028 * no way to recover from partial success.
6029 * just free the chain.
6030 */
6031 m_freem(m0);
6032 return NULL;
6033 }
6034 return m0;
6035 }
6036
6037 /*
6038 * m_makewritable: ensure the specified range writable.
6039 */
6040 int
6041 m_makewritable(struct mbuf **mp, int off, int len, int how)
6042 {
6043 int error;
6044 #if DEBUG
6045 struct mbuf *n;
6046 int origlen, reslen;
6047
6048 origlen = m_length(*mp);
6049 #endif /* DEBUG */
6050
6051 #if 0 /* M_COPYALL is large enough */
6052 if (len == M_COPYALL) {
6053 len = m_length(*mp) - off; /* XXX */
6054 }
6055 #endif
6056
6057 error = m_copyback0(mp, off, len, NULL,
6058 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
6059
6060 #if DEBUG
6061 reslen = 0;
6062 for (n = *mp; n; n = n->m_next) {
6063 reslen += n->m_len;
6064 }
6065 if (origlen != reslen) {
6066 panic("m_makewritable: length changed");
6067 }
6068 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
6069 panic("m_makewritable: inconsist");
6070 }
6071 #endif /* DEBUG */
6072
6073 return error;
6074 }
6075
/*
 * Common engine behind m_copyback(), m_copyback_cow() and
 * m_makewritable().  Operates on the range [off, off + len) of the
 * chain *mp0.
 *
 * flags:
 *   M_COPYBACK0_COPYBACK  copy "len" bytes from vp into the range
 *                         (vp must be non-NULL)
 *   M_COPYBACK0_PRESERVE  keep the existing contents when an mbuf has to
 *                         be replaced (vp must be NULL)
 *   M_COPYBACK0_COW       mbufs for which m_mclhasreference() is true
 *                         may be replaced by freshly allocated ones
 *   M_COPYBACK0_EXTEND    grow the chain when the range runs past its
 *                         end; must not be combined with COW
 *
 * *mp0 may be updated when the head of the chain is replaced.
 * Returns 0, or ENOBUFS when an mbuf needed for the copy-on-write path
 * cannot be allocated.
 * NOTE(review): an allocation failure while extending the chain jumps to
 * "out" and therefore returns 0, not ENOBUFS.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* mp tracks the link that points at m, so m can be replaced */
	mp = mp0;
	m = *mp;
	/* Phase 1: advance to the mbuf that contains offset "off" */
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
			/* Also entered from phase 2 when the chain runs out */
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* Zero the gap between old end and "off" */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				/* Undo the accounting and retry this mbuf */
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	/* Phase 2: process the range one mbuf at a time */
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW)) {
				panic("m_copyback0: read-only");
			}
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf.  copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			/* n->m_len is n's capacity so far; clamp it to len */
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			/* eatlen: bytes of old data that n takes over */
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				/* Keep the first "off" bytes in m, insert n after it */
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* Consume following mbufs of the same kind into n */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			/* Give back whatever part of n could not be filled */
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	/* The packet header length can only grow, and only with EXTEND */
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
6283
6284 uint64_t
6285 mcl_to_paddr(char *addr)
6286 {
6287 vm_offset_t base_phys;
6288
6289 if (!MBUF_IN_MAP(addr)) {
6290 return 0;
6291 }
6292 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
6293
6294 if (base_phys == 0) {
6295 return 0;
6296 }
6297 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
6298 }
6299
6300 /*
6301 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
6302 * And really copy the thing. That way, we don't "precompute" checksums
6303 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
6304 * small packets, don't dup into a cluster. That way received packets
6305 * don't take up too much room in the sockbuf (cf. sbspace()).
6306 */
6307 int MDFail;
6308
6309 struct mbuf *
6310 m_dup(struct mbuf *m, int how)
6311 {
6312 struct mbuf *n, **np;
6313 struct mbuf *top;
6314 int copyhdr = 0;
6315
6316 np = ⊤
6317 top = NULL;
6318 if (m->m_flags & M_PKTHDR) {
6319 copyhdr = 1;
6320 }
6321
6322 /*
6323 * Quick check: if we have one mbuf and its data fits in an
6324 * mbuf with packet header, just copy and go.
6325 */
6326 if (m->m_next == NULL) {
6327 /* Then just move the data into an mbuf and be done... */
6328 if (copyhdr) {
6329 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
6330 if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
6331 return NULL;
6332 }
6333 n->m_len = m->m_len;
6334 m_dup_pkthdr(n, m, how);
6335 bcopy(m->m_data, n->m_data, m->m_len);
6336 return n;
6337 }
6338 } else if (m->m_len <= MLEN) {
6339 if ((n = _M_GET(how, m->m_type)) == NULL) {
6340 return NULL;
6341 }
6342 bcopy(m->m_data, n->m_data, m->m_len);
6343 n->m_len = m->m_len;
6344 return n;
6345 }
6346 }
6347 while (m != NULL) {
6348 #if BLUE_DEBUG
6349 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6350 m->m_data);
6351 #endif
6352 if (copyhdr) {
6353 n = _M_GETHDR(how, m->m_type);
6354 } else {
6355 n = _M_GET(how, m->m_type);
6356 }
6357 if (n == NULL) {
6358 goto nospace;
6359 }
6360 if (m->m_flags & M_EXT) {
6361 if (m->m_len <= m_maxsize(MC_CL)) {
6362 MCLGET(n, how);
6363 } else if (m->m_len <= m_maxsize(MC_BIGCL)) {
6364 n = m_mbigget(n, how);
6365 } else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
6366 n = m_m16kget(n, how);
6367 }
6368 if (!(n->m_flags & M_EXT)) {
6369 (void) m_free(n);
6370 goto nospace;
6371 }
6372 } else {
6373 VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
6374 (copyhdr == 0 && m->m_len <= MLEN));
6375 }
6376 *np = n;
6377 if (copyhdr) {
6378 /* Don't use M_COPY_PKTHDR: preserve m_data */
6379 m_dup_pkthdr(n, m, how);
6380 copyhdr = 0;
6381 if (!(n->m_flags & M_EXT)) {
6382 n->m_data = n->m_pktdat;
6383 }
6384 }
6385 n->m_len = m->m_len;
6386 /*
6387 * Get the dup on the same bdry as the original
6388 * Assume that the two mbufs have the same offset to data area
6389 * (up to word boundaries)
6390 */
6391 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6392 m = m->m_next;
6393 np = &n->m_next;
6394 #if BLUE_DEBUG
6395 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6396 n->m_data);
6397 #endif
6398 }
6399
6400 if (top == NULL) {
6401 MDFail++;
6402 }
6403 return top;
6404
6405 nospace:
6406 m_freem(top);
6407 MDFail++;
6408 return NULL;
6409 }
6410
6411 #define MBUF_MULTIPAGES(m) \
6412 (((m)->m_flags & M_EXT) && \
6413 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
6414 && (m)->m_len > PAGE_SIZE) || \
6415 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
6416 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6417
6418 static struct mbuf *
6419 m_expand(struct mbuf *m, struct mbuf **last)
6420 {
6421 struct mbuf *top = NULL;
6422 struct mbuf **nm = ⊤
6423 uintptr_t data0, data;
6424 unsigned int len0, len;
6425
6426 VERIFY(MBUF_MULTIPAGES(m));
6427 VERIFY(m->m_next == NULL);
6428 data0 = (uintptr_t)m->m_data;
6429 len0 = m->m_len;
6430 *last = top;
6431
6432 for (;;) {
6433 struct mbuf *n;
6434
6435 data = data0;
6436 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
6437 len = PAGE_SIZE;
6438 } else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6439 P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
6440 len = P2ROUNDUP(data, PAGE_SIZE) - data;
6441 } else {
6442 len = len0;
6443 }
6444
6445 VERIFY(len > 0);
6446 VERIFY(m->m_flags & M_EXT);
6447 m->m_data = (void *)data;
6448 m->m_len = len;
6449
6450 *nm = *last = m;
6451 nm = &m->m_next;
6452 m->m_next = NULL;
6453
6454 data0 += len;
6455 len0 -= len;
6456 if (len0 == 0) {
6457 break;
6458 }
6459
6460 n = _M_RETRY(M_DONTWAIT, MT_DATA);
6461 if (n == NULL) {
6462 m_freem(top);
6463 top = *last = NULL;
6464 break;
6465 }
6466
6467 n->m_ext = m->m_ext;
6468 m_incref(m);
6469 n->m_flags |= M_EXT;
6470 m = n;
6471 }
6472 return top;
6473 }
6474
6475 struct mbuf *
6476 m_normalize(struct mbuf *m)
6477 {
6478 struct mbuf *top = NULL;
6479 struct mbuf **nm = ⊤
6480 boolean_t expanded = FALSE;
6481
6482 while (m != NULL) {
6483 struct mbuf *n;
6484
6485 n = m->m_next;
6486 m->m_next = NULL;
6487
6488 /* Does the data cross one or more page boundaries? */
6489 if (MBUF_MULTIPAGES(m)) {
6490 struct mbuf *last;
6491 if ((m = m_expand(m, &last)) == NULL) {
6492 m_freem(n);
6493 m_freem(top);
6494 top = NULL;
6495 break;
6496 }
6497 *nm = m;
6498 nm = &last->m_next;
6499 expanded = TRUE;
6500 } else {
6501 *nm = m;
6502 nm = &m->m_next;
6503 }
6504 m = n;
6505 }
6506 if (expanded) {
6507 atomic_add_32(&mb_normalized, 1);
6508 }
6509 return top;
6510 }
6511
6512 /*
6513 * Append the specified data to the indicated mbuf chain,
6514 * Extend the mbuf chain if the new data does not fit in
6515 * existing space.
6516 *
6517 * Return 1 if able to complete the job; otherwise 0.
6518 */
6519 int
6520 m_append(struct mbuf *m0, int len, caddr_t cp)
6521 {
6522 struct mbuf *m, *n;
6523 int remainder, space;
6524
6525 for (m = m0; m->m_next != NULL; m = m->m_next) {
6526 ;
6527 }
6528 remainder = len;
6529 space = M_TRAILINGSPACE(m);
6530 if (space > 0) {
6531 /*
6532 * Copy into available space.
6533 */
6534 if (space > remainder) {
6535 space = remainder;
6536 }
6537 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6538 m->m_len += space;
6539 cp += space;
6540 remainder -= space;
6541 }
6542 while (remainder > 0) {
6543 /*
6544 * Allocate a new mbuf; could check space
6545 * and allocate a cluster instead.
6546 */
6547 n = m_get(M_WAITOK, m->m_type);
6548 if (n == NULL) {
6549 break;
6550 }
6551 n->m_len = min(MLEN, remainder);
6552 bcopy(cp, mtod(n, caddr_t), n->m_len);
6553 cp += n->m_len;
6554 remainder -= n->m_len;
6555 m->m_next = n;
6556 m = n;
6557 }
6558 if (m0->m_flags & M_PKTHDR) {
6559 m0->m_pkthdr.len += len - remainder;
6560 }
6561 return remainder == 0;
6562 }
6563
6564 struct mbuf *
6565 m_last(struct mbuf *m)
6566 {
6567 while (m->m_next != NULL) {
6568 m = m->m_next;
6569 }
6570 return m;
6571 }
6572
6573 unsigned int
6574 m_fixhdr(struct mbuf *m0)
6575 {
6576 u_int len;
6577
6578 VERIFY(m0->m_flags & M_PKTHDR);
6579
6580 len = m_length2(m0, NULL);
6581 m0->m_pkthdr.len = len;
6582 return len;
6583 }
6584
6585 unsigned int
6586 m_length2(struct mbuf *m0, struct mbuf **last)
6587 {
6588 struct mbuf *m;
6589 u_int len;
6590
6591 len = 0;
6592 for (m = m0; m != NULL; m = m->m_next) {
6593 len += m->m_len;
6594 if (m->m_next == NULL) {
6595 break;
6596 }
6597 }
6598 if (last != NULL) {
6599 *last = m;
6600 }
6601 return len;
6602 }
6603
6604 /*
6605 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6606 * and clusters. If allocation fails and this cannot be completed, NULL will
6607 * be returned, but the passed in chain will be unchanged. Upon success,
6608 * the original chain will be freed, and the new chain will be returned.
6609 *
6610 * If a non-packet header is passed in, the original mbuf (chain?) will
6611 * be returned unharmed.
6612 *
6613 * If offset is specfied, the first mbuf in the chain will have a leading
6614 * space of the amount stated by the "off" parameter.
6615 *
6616 * This routine requires that the m_pkthdr.header field of the original
6617 * mbuf chain is cleared by the caller.
6618 */
6619 struct mbuf *
6620 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6621 {
6622 struct mbuf *m_new = NULL, *m_final = NULL;
6623 int progress = 0, length, pktlen;
6624
6625 if (!(m0->m_flags & M_PKTHDR)) {
6626 return m0;
6627 }
6628
6629 VERIFY(off < MHLEN);
6630 m_fixhdr(m0); /* Needed sanity check */
6631
6632 pktlen = m0->m_pkthdr.len + off;
6633 if (pktlen > MHLEN) {
6634 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6635 } else {
6636 m_final = m_gethdr(how, MT_DATA);
6637 }
6638
6639 if (m_final == NULL) {
6640 goto nospace;
6641 }
6642
6643 if (off > 0) {
6644 pktlen -= off;
6645 m_final->m_data += off;
6646 }
6647
6648 /*
6649 * Caller must have handled the contents pointed to by this
6650 * pointer before coming here, as otherwise it will point to
6651 * the original mbuf which will get freed upon success.
6652 */
6653 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6654
6655 if (m_dup_pkthdr(m_final, m0, how) == 0) {
6656 goto nospace;
6657 }
6658
6659 m_new = m_final;
6660
6661 while (progress < pktlen) {
6662 length = pktlen - progress;
6663 if (length > MCLBYTES) {
6664 length = MCLBYTES;
6665 }
6666 length -= ((m_new == m_final) ? off : 0);
6667 if (length < 0) {
6668 goto nospace;
6669 }
6670
6671 if (m_new == NULL) {
6672 if (length > MLEN) {
6673 m_new = m_getcl(how, MT_DATA, 0);
6674 } else {
6675 m_new = m_get(how, MT_DATA);
6676 }
6677 if (m_new == NULL) {
6678 goto nospace;
6679 }
6680 }
6681
6682 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6683 progress += length;
6684 m_new->m_len = length;
6685 if (m_new != m_final) {
6686 m_cat(m_final, m_new);
6687 }
6688 m_new = NULL;
6689 }
6690 m_freem(m0);
6691 m0 = m_final;
6692 return m0;
6693 nospace:
6694 if (m_final) {
6695 m_freem(m_final);
6696 }
6697 return NULL;
6698 }
6699
/*
 * Defragment the chain m0 without reserving any leading space; this is
 * m_defrag_offset() with an offset of 0.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
6705
/*
 * Change the type of mbuf m to t, adjusting the per-type statistics:
 * the count for the new type is incremented and the count for the
 * mbuf's current type is decremented before the type is overwritten.
 */
void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	(m)->m_type = t;
}
6713
/*
 * Function form of the MTOD() macro: returns MTOD(m, void *).
 */
void *
m_mtod(struct mbuf *m)
{
	return MTOD(m, void *);
}
6719
6720 struct mbuf *
6721 m_dtom(void *x)
6722 {
6723 return (struct mbuf *)((uintptr_t)(x) & ~(MSIZE - 1));
6724 }
6725
/*
 * Function form of the _MCHECK() macro: applies _MCHECK() to m.
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
6731
6732 /*
6733 * Return a pointer to mbuf/offset of location in mbuf chain.
6734 */
6735 struct mbuf *
6736 m_getptr(struct mbuf *m, int loc, int *off)
6737 {
6738 while (loc >= 0) {
6739 /* Normal end of search. */
6740 if (m->m_len > loc) {
6741 *off = loc;
6742 return m;
6743 } else {
6744 loc -= m->m_len;
6745 if (m->m_next == NULL) {
6746 if (loc == 0) {
6747 /* Point at the end of valid data. */
6748 *off = m->m_len;
6749 return m;
6750 }
6751 return NULL;
6752 }
6753 m = m->m_next;
6754 }
6755 }
6756 return NULL;
6757 }
6758
6759 /*
6760 * Inform the corresponding mcache(s) that there's a waiter below.
6761 */
6762 static void
6763 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6764 {
6765 mcache_waiter_inc(m_cache(class));
6766 if (comp) {
6767 if (class == MC_CL) {
6768 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6769 } else if (class == MC_BIGCL) {
6770 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6771 } else if (class == MC_16KCL) {
6772 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6773 } else {
6774 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6775 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6776 }
6777 }
6778 }
6779
6780 /*
6781 * Inform the corresponding mcache(s) that there's no more waiter below.
6782 */
6783 static void
6784 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6785 {
6786 mcache_waiter_dec(m_cache(class));
6787 if (comp) {
6788 if (class == MC_CL) {
6789 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6790 } else if (class == MC_BIGCL) {
6791 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6792 } else if (class == MC_16KCL) {
6793 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6794 } else {
6795 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6796 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6797 }
6798 }
6799 }
6800
/*
 * Set by mbuf_watchdog() when it schedules the socket-defunct thread call
 * and cleared by mbuf_watchdog_defunct() when that call finishes.  While
 * it is set, mbuf_watchdog() returns early instead of panicking.
 */
static bool mbuf_watchdog_defunct_active = false;
6802
6803 static uint32_t
6804 mbuf_watchdog_socket_space(struct socket *so)
6805 {
6806 uint32_t space = 0;
6807
6808 if (so == NULL) {
6809 return 0;
6810 }
6811
6812 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
6813
6814 #if INET
6815 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6816 SOCK_PROTO(so) == IPPROTO_TCP) {
6817 space += tcp_reass_qlen_space(so);
6818 }
6819 #endif /* INET */
6820
6821 return space;
6822 }
6823
/*
 * State carried through the proc_iterate() callbacks of
 * mbuf_watchdog_defunct_iterate().
 */
struct mbuf_watchdog_defunct_args {
	/* Process with the largest socket mbuf usage seen so far, if any */
	struct proc *top_app;
	/* Usage that was computed for top_app */
	uint32_t top_app_space_used;
	/* Try-lock each process' fd table and skip it when that fails */
	bool non_blocking;
};
6829
/*
 * Attempt to take the fd lock of process p without blocking; returns the
 * result of lck_mtx_try_lock() on p->p_fd.fd_lock.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
6835
6836 static int
6837 mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
6838 {
6839 struct fileproc *fp = NULL;
6840 struct mbuf_watchdog_defunct_args *args =
6841 (struct mbuf_watchdog_defunct_args *)arg;
6842 uint32_t space_used = 0;
6843
6844 /*
6845 * Non-blocking is only used when dumping the mbuf usage from the watchdog
6846 */
6847 if (args->non_blocking) {
6848 if (!proc_fd_trylock(p)) {
6849 return PROC_RETURNED;
6850 }
6851 } else {
6852 proc_fdlock(p);
6853 }
6854 fdt_foreach(fp, p) {
6855 struct fileglob *fg = fp->fp_glob;
6856 struct socket *so = NULL;
6857
6858 if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
6859 continue;
6860 }
6861 so = fg_get_data(fg);
6862 /*
6863 * We calculate the space without the socket
6864 * lock because we don't want to be blocked
6865 * by another process that called send() and
6866 * is stuck waiting for mbufs.
6867 *
6868 * These variables are 32-bit so we don't have
6869 * to worry about incomplete reads.
6870 */
6871 space_used += mbuf_watchdog_socket_space(so);
6872 }
6873 proc_fdunlock(p);
6874 if (space_used > args->top_app_space_used) {
6875 if (args->top_app != NULL) {
6876 proc_rele(args->top_app);
6877 }
6878 args->top_app = p;
6879 args->top_app_space_used = space_used;
6880
6881 return PROC_CLAIMED;
6882 } else {
6883 return PROC_RETURNED;
6884 }
6885 }
6886
6887 extern char *proc_name_address(void *p);
6888
/*
 * Thread call handler scheduled by mbuf_watchdog().  Iterates over all
 * processes to find the one whose sockets account for the most mbuf
 * space, then attempts to defunct every socket of that process.  Both
 * arguments are unused.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	/* Blocking mode: wait for each process' fd lock */
	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets whose lock cannot be taken right away */
			if (!socket_try_lock(so)) {
				continue;
			}
			/* Only defunct when marking the socket defunct succeeded */
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference retained by the iterator (PROC_CLAIMED) */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
	}
	/* Let mbuf_watchdog() resume its checks */
	mbuf_watchdog_defunct_active = false;
}
6938
/*
 * Called during slab (blocking and non-blocking) allocation.  If there
 * is at least one waiter, and the time since the first waiter is blocked
 * is greater than the watchdog timeout, panic the system.
 *
 * Once half of the timeout (MB_WDT_MAXTIME / 2) has elapsed, a thread
 * call running mbuf_watchdog_defunct() is scheduled first; no panic is
 * raised while that call is marked active.
 *
 * Does nothing when there are no waiters or the watchdog is disabled.
 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	/* Allocated lazily on first use and then reused */
	static thread_call_t defunct_tcall = NULL;

	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Whole seconds elapsed since the watchdog was armed */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
6999
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 *
 * class - mbuf class being allocated
 * num   - number of objects wanted
 * wait  - MCR_* flags of the allocation request
 *
 * Must be called with mbuf_mlock held; the lock is handed to msleep()
 * while the caller sleeps.  FALSE is returned both when objects became
 * available on the class freelist and when nothing was obtained.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* MCR_TRYHARD callers retry at the cache layer, not sleep */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer.  Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Record how much the worker thread should grow this class by */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
7078
7079 __attribute__((noreturn))
7080 static void
7081 mbuf_worker_thread(void)
7082 {
7083 int mbuf_expand;
7084
7085 while (1) {
7086 lck_mtx_lock(mbuf_mlock);
7087 mbwdog_logger("worker thread running");
7088 mbuf_worker_run_cnt++;
7089 mbuf_expand = 0;
7090 /*
7091 * Allocations are based on page size, so if we have depleted
7092 * the reserved spaces, try to free mbufs from the major classes.
7093 */
7094 #if PAGE_SIZE == 4096
7095 uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
7096 uint32_t m_clusters = m_total(MC_CL);
7097 uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
7098 uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
7099 if (sumclusters >= nclusters) {
7100 mbwdog_logger("reclaiming bigcl");
7101 mbuf_drain_locked(TRUE);
7102 m_reclaim(MC_BIGCL, 4, FALSE);
7103 }
7104 #else
7105 uint32_t m_16kclusters = m_total(MC_16KCL);
7106 if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
7107 mbwdog_logger("reclaiming 16kcl");
7108 mbuf_drain_locked(TRUE);
7109 m_reclaim(MC_16KCL, 4, FALSE);
7110 }
7111 #endif
7112 if (m_region_expand(MC_CL) > 0) {
7113 int n;
7114 mb_expand_cl_cnt++;
7115 /* Adjust to current number of cluster in use */
7116 n = m_region_expand(MC_CL) -
7117 (m_total(MC_CL) - m_infree(MC_CL));
7118 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
7119 n = m_maxlimit(MC_CL) - m_total(MC_CL);
7120 }
7121 if (n > 0) {
7122 mb_expand_cl_total += n;
7123 }
7124 m_region_expand(MC_CL) = 0;
7125
7126 if (n > 0) {
7127 mbwdog_logger("expanding MC_CL by %d", n);
7128 freelist_populate(MC_CL, n, M_WAIT);
7129 }
7130 }
7131 if (m_region_expand(MC_BIGCL) > 0) {
7132 int n;
7133 mb_expand_bigcl_cnt++;
7134 /* Adjust to current number of 4 KB cluster in use */
7135 n = m_region_expand(MC_BIGCL) -
7136 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
7137 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
7138 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
7139 }
7140 if (n > 0) {
7141 mb_expand_bigcl_total += n;
7142 }
7143 m_region_expand(MC_BIGCL) = 0;
7144
7145 if (n > 0) {
7146 mbwdog_logger("expanding MC_BIGCL by %d", n);
7147 freelist_populate(MC_BIGCL, n, M_WAIT);
7148 }
7149 }
7150 if (m_region_expand(MC_16KCL) > 0) {
7151 int n;
7152 mb_expand_16kcl_cnt++;
7153 /* Adjust to current number of 16 KB cluster in use */
7154 n = m_region_expand(MC_16KCL) -
7155 (m_total(MC_16KCL) - m_infree(MC_16KCL));
7156 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
7157 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
7158 }
7159 if (n > 0) {
7160 mb_expand_16kcl_total += n;
7161 }
7162 m_region_expand(MC_16KCL) = 0;
7163
7164 if (n > 0) {
7165 mbwdog_logger("expanding MC_16KCL by %d", n);
7166 (void) freelist_populate(MC_16KCL, n, M_WAIT);
7167 }
7168 }
7169
7170 /*
7171 * Because we can run out of memory before filling the mbuf
7172 * map, we should not allocate more clusters than they are
7173 * mbufs -- otherwise we could have a large number of useless
7174 * clusters allocated.
7175 */
7176 mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
7177 m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
7178 m_total(MC_16KCL));
7179 uint32_t total_mbufs = m_total(MC_MBUF);
7180 uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
7181 m_total(MC_16KCL);
7182 if (total_mbufs < total_clusters) {
7183 mbwdog_logger("expanding MC_MBUF by %d",
7184 total_clusters - total_mbufs);
7185 }
7186 while (total_mbufs < total_clusters) {
7187 mb_expand_cnt++;
7188 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
7189 break;
7190 }
7191 total_mbufs = m_total(MC_MBUF);
7192 total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
7193 m_total(MC_16KCL);
7194 }
7195
7196 mbuf_worker_needs_wakeup = TRUE;
7197 /*
7198 * If there's a deadlock and we're not sending / receiving
7199 * packets, net_uptime() won't be updated. Update it here
7200 * so we are sure it's correct.
7201 */
7202 net_update_uptime();
7203 mbuf_worker_last_runtime = net_uptime();
7204 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
7205 THREAD_UNINT);
7206 mbwdog_logger("worker thread sleeping");
7207 lck_mtx_unlock(mbuf_mlock);
7208 (void) thread_block((thread_continue_t)mbuf_worker_thread);
7209 }
7210 }
7211
/*
 * Entry point of the mbuf worker thread: marks the worker as ready by
 * bumping mbuf_worker_ready, then enters mbuf_worker_thread(), which
 * never returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
7219
/*
 * Return the slab descriptor covering buffer address buf, creating the
 * slab group that contains it if it does not exist yet.
 *
 * Must be called with mbuf_mlock held.  When a new slab group has to be
 * created the lock is dropped around the allocation and retaken; this
 * path requires mb_clalloc_busy to be set.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the slab group that covers this address */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of the slab within its group */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
7283
7284 static void
7285 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
7286 void *base, void *head, unsigned int len, int refcnt, int chunks)
7287 {
7288 sp->sl_class = class;
7289 sp->sl_flags = flags;
7290 sp->sl_base = base;
7291 sp->sl_head = head;
7292 sp->sl_len = len;
7293 sp->sl_refcnt = refcnt;
7294 sp->sl_chunks = chunks;
7295 slab_detach(sp);
7296 }
7297
7298 static void
7299 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
7300 {
7301 VERIFY(slab_is_detached(sp));
7302 m_slab_cnt(class)++;
7303 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
7304 sp->sl_flags &= ~SLF_DETACHED;
7305
7306 /*
7307 * If a buffer spans multiple contiguous pages then mark them as
7308 * detached too
7309 */
7310 if (class == MC_16KCL) {
7311 int k;
7312 for (k = 1; k < NSLABSP16KB; k++) {
7313 sp = sp->sl_next;
7314 /* Next slab must already be present */
7315 VERIFY(sp != NULL && slab_is_detached(sp));
7316 sp->sl_flags &= ~SLF_DETACHED;
7317 }
7318 }
7319 }
7320
7321 static void
7322 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
7323 {
7324 int k;
7325 VERIFY(!slab_is_detached(sp));
7326 VERIFY(m_slab_cnt(class) > 0);
7327 m_slab_cnt(class)--;
7328 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
7329 slab_detach(sp);
7330 if (class == MC_16KCL) {
7331 for (k = 1; k < NSLABSP16KB; k++) {
7332 sp = sp->sl_next;
7333 /* Next slab must already be present */
7334 VERIFY(sp != NULL);
7335 VERIFY(!slab_is_detached(sp));
7336 slab_detach(sp);
7337 }
7338 }
7339 }
7340
7341 static boolean_t
7342 slab_inrange(mcl_slab_t *sp, void *buf)
7343 {
7344 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
7345 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
7346 }
7347
7348 #undef panic
7349
7350 static void
7351 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
7352 {
7353 int i;
7354 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
7355 uintptr_t buf = (uintptr_t)sp->sl_base;
7356
7357 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
7358 void *next = ((mcache_obj_t *)buf)->obj_next;
7359 if (next != addr) {
7360 continue;
7361 }
7362 if (!mclverify) {
7363 if (next != NULL && !MBUF_IN_MAP(next)) {
7364 mcache_t *cp = m_cache(sp->sl_class);
7365 panic("%s: %s buffer %p in slab %p modified "
7366 "after free at offset 0: %p out of range "
7367 "[%p-%p)\n", __func__, cp->mc_name,
7368 (void *)buf, sp, next, mbutl, embutl);
7369 /* NOTREACHED */
7370 }
7371 } else {
7372 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
7373 (mcache_obj_t *)buf);
7374 mcl_audit_verify_nextptr(next, mca);
7375 }
7376 }
7377 }
7378
7379 static void
7380 slab_detach(mcl_slab_t *sp)
7381 {
7382 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
7383 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
7384 sp->sl_flags |= SLF_DETACHED;
7385 }
7386
7387 static boolean_t
7388 slab_is_detached(mcl_slab_t *sp)
7389 {
7390 return (intptr_t)sp->sl_link.tqe_next == -1 &&
7391 (intptr_t)sp->sl_link.tqe_prev == -1 &&
7392 (sp->sl_flags & SLF_DETACHED);
7393 }
7394
/*
 * Attach audit structures to the page that contains buf.
 *
 * Takes the first num structures from the list at *mca_list, zeroes each
 * one (keeping its mca_next link) and records them in the cl_audit[]
 * slots of the page's mclaudit entry.  When con_list is not NULL, each
 * structure is also given a zeroed contents buffer of con_size bytes
 * taken from the list at *con_list.
 *
 * On return *mca_list (and *con_list, when given) point at the first
 * unused element, and the structures consumed form a NULL-terminated
 * list of their own.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Zero the structure but keep it linked to its successor */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			/* Advance before the buffer (and its link) is zeroed */
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Hand back the unused remainder and terminate the consumed part */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
7453
7454 static void
7455 mcl_audit_free(void *buf, unsigned int num)
7456 {
7457 unsigned int i, ix;
7458 mcache_audit_t *mca, *mca_list;
7459
7460 ix = MTOPG(buf);
7461 VERIFY(ix < maxclaudit);
7462
7463 if (mclaudit[ix].cl_audit[0] != NULL) {
7464 mca_list = mclaudit[ix].cl_audit[0];
7465 for (i = 0; i < num; i++) {
7466 mca = mclaudit[ix].cl_audit[i];
7467 mclaudit[ix].cl_audit[i] = NULL;
7468 if (mca->mca_contents) {
7469 mcache_free(mcl_audit_con_cache,
7470 mca->mca_contents);
7471 }
7472 }
7473 mcache_free_ext(mcache_audit_cache,
7474 (mcache_obj_t *)mca_list);
7475 }
7476 }
7477
7478 /*
7479 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
7480 * the corresponding audit structure for that buffer.
7481 */
7482 static mcache_audit_t *
7483 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
7484 {
7485 mcache_audit_t *mca = NULL;
7486 int ix = MTOPG(mobj), m_idx = 0;
7487 unsigned char *page_addr;
7488
7489 VERIFY(ix < maxclaudit);
7490 VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
7491
7492 page_addr = PGTOM(ix);
7493
7494 switch (class) {
7495 case MC_MBUF:
7496 /*
7497 * For the mbuf case, find the index of the page
7498 * used by the mbuf and use that index to locate the
7499 * base address of the page. Then find out the
7500 * mbuf index relative to the page base and use
7501 * it to locate the audit structure.
7502 */
7503 m_idx = MBPAGEIDX(page_addr, mobj);
7504 VERIFY(m_idx < (int)NMBPG);
7505 mca = mclaudit[ix].cl_audit[m_idx];
7506 break;
7507
7508 case MC_CL:
7509 /*
7510 * Same thing as above, but for 2KB clusters in a page.
7511 */
7512 m_idx = CLPAGEIDX(page_addr, mobj);
7513 VERIFY(m_idx < (int)NCLPG);
7514 mca = mclaudit[ix].cl_audit[m_idx];
7515 break;
7516
7517 case MC_BIGCL:
7518 m_idx = BCLPAGEIDX(page_addr, mobj);
7519 VERIFY(m_idx < (int)NBCLPG);
7520 mca = mclaudit[ix].cl_audit[m_idx];
7521 break;
7522 case MC_16KCL:
7523 /*
7524 * Same as above, but only return the first element.
7525 */
7526 mca = mclaudit[ix].cl_audit[0];
7527 break;
7528
7529 default:
7530 VERIFY(0);
7531 /* NOTREACHED */
7532 }
7533
7534 return mca;
7535 }
7536
7537 static void
7538 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
7539 boolean_t alloc)
7540 {
7541 struct mbuf *m = addr;
7542 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
7543
7544 VERIFY(mca->mca_contents != NULL &&
7545 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7546
7547 if (mclverify) {
7548 mcl_audit_verify_nextptr(next, mca);
7549 }
7550
7551 if (!alloc) {
7552 /* Save constructed mbuf fields */
7553 mcl_audit_save_mbuf(m, mca);
7554 if (mclverify) {
7555 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
7556 m_maxsize(MC_MBUF));
7557 }
7558 ((mcache_obj_t *)m)->obj_next = next;
7559 return;
7560 }
7561
7562 /* Check if the buffer has been corrupted while in freelist */
7563 if (mclverify) {
7564 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
7565 }
7566 /* Restore constructed mbuf fields */
7567 mcl_audit_restore_mbuf(m, mca, composite);
7568 }
7569
7570 static void
7571 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
7572 {
7573 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
7574
7575 if (composite) {
7576 struct mbuf *next = m->m_next;
7577 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
7578 MBUF_IS_COMPOSITE(ms));
7579 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7580 /*
7581 * We could have hand-picked the mbuf fields and restore
7582 * them individually, but that will be a maintenance
7583 * headache. Instead, restore everything that was saved;
7584 * the mbuf layer will recheck and reinitialize anyway.
7585 */
7586 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
7587 m->m_next = next;
7588 } else {
7589 /*
7590 * For a regular mbuf (no cluster attached) there's nothing
7591 * to restore other than the type field, which is expected
7592 * to be MT_FREE.
7593 */
7594 m->m_type = ms->m_type;
7595 }
7596 _MCHECK(m);
7597 }
7598
/*
 * Save the first MCA_SAVED_MBUF_SIZE bytes of mbuf m in the contents
 * area of its audit structure, after applying _MCHECK() to it.
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
7606
7607 static void
7608 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7609 boolean_t save_next)
7610 {
7611 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7612
7613 if (!alloc) {
7614 if (mclverify) {
7615 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7616 }
7617 if (save_next) {
7618 mcl_audit_verify_nextptr(next, mca);
7619 ((mcache_obj_t *)addr)->obj_next = next;
7620 }
7621 } else if (mclverify) {
7622 /* Check if the buffer has been corrupted while in freelist */
7623 mcl_audit_verify_nextptr(next, mca);
7624 mcache_audit_free_verify_set(mca, addr, 0, size);
7625 }
7626 }
7627
/*
 * Record the current thread, backtrace and timestamp in the scratch
 * audit area of mca.  The values recorded by the previous call are
 * moved to the corresponding "previous" fields first.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Shift the current thread and stack into the "previous" fields */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/*
	 * One extra frame is captured and the first one dropped
	 * (presumably this function's own frame -- confirm).
	 */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to mb_start */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
7654
7655 __abortlike
7656 static void
7657 mcl_audit_mcheck_panic(struct mbuf *m)
7658 {
7659 char buf[DUMP_MCA_BUF_SIZE];
7660 mcache_audit_t *mca;
7661
7662 MRANGE(m);
7663 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7664
7665 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
7666 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
7667 /* NOTREACHED */
7668 }
7669
7670 __abortlike
7671 static void
7672 mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
7673 {
7674 char buf[DUMP_MCA_BUF_SIZE];
7675 panic("mcl_audit: buffer %p modified after free at offset 0: "
7676 "%p out of range [%p-%p)\n%s\n",
7677 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
7678 /* NOTREACHED */
7679 }
7680
7681 static void
7682 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7683 {
7684 if (next != NULL && !MBUF_IN_MAP(next) &&
7685 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7686 mcl_audit_verify_nextptr_panic(next, mca);
7687 }
7688 }
7689
/*
 * Scramble the bits of `x' with a fixed sequence of shift, add and xor
 * steps.  A 64-bit sequence is used when __LP64__ is defined, a 32-bit
 * sequence otherwise.
 */
static uintptr_t
hash_mix(uintptr_t x)
{
#ifdef __LP64__
	x = x + ~(x << 32);
	x = x ^ (x >> 22);
	x = x + ~(x << 13);
	x = x ^ (x >> 8);
	x = x + (x << 3);
	x = x ^ (x >> 15);
	x = x + ~(x << 27);
	x = x ^ (x >> 31);
#else /* !__LP64__ */
	x = x + ~(x << 15);
	x = x ^ (x >> 10);
	x = x + (x << 3);
	x = x ^ (x >> 6);
	x = x + ~(x << 11);
	x = x ^ (x >> 16);
#endif /* !__LP64__ */
	return x;
}
7712
/*
 * Hash a backtrace of `depth' entries to a bucket index: the entries are
 * summed, mixed with hash_mix() and masked with (max_size - 1).
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t sum = 0;
	uintptr_t bucket;
	uint32_t remaining;

	for (remaining = depth; remaining > 0; remaining--) {
		sum += bt[remaining - 1];
	}

	bucket = hash_mix(sum) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
7729
/*
 * Hash an address to a bucket index: the value is mixed with hash_mix()
 * and masked with (max_size - 1).
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t bucket;

	bucket = hash_mix(pt) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
7742
7743 /* This function turns on mbuf leak detection */
7744 static void
7745 mleak_activate(void)
7746 {
7747 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7748 PE_parse_boot_argn("mleak_sample_factor",
7749 &mleak_table.mleak_sample_factor,
7750 sizeof(mleak_table.mleak_sample_factor));
7751
7752 if (mleak_table.mleak_sample_factor == 0) {
7753 mclfindleak = 0;
7754 }
7755
7756 if (mclfindleak == 0) {
7757 return;
7758 }
7759
7760 vm_size_t alloc_size =
7761 mleak_alloc_buckets * sizeof(struct mallocation);
7762 vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);
7763
7764 mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
7765 mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
7766 mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7767 ZALIGN(mleak_stat_t));
7768
7769 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7770 #ifdef __LP64__
7771 mleak_stat->ml_isaddr64 = 1;
7772 #endif /* __LP64__ */
7773 }
7774
7775 static void
7776 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7777 {
7778 int temp;
7779
7780 if (mclfindleak == 0) {
7781 return;
7782 }
7783
7784 if (!alloc) {
7785 return mleak_free(addr);
7786 }
7787
7788 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7789
7790 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7791 uintptr_t bt[MLEAK_STACK_DEPTH];
7792 unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
7793 mleak_log(bt, addr, logged, num);
7794 }
7795 }
7796
7797 /*
7798 * This function records the allocation in the mleak_allocations table
7799 * and the backtrace in the mleak_traces table; if allocation slot is in use,
7800 * replace old allocation with new one if the trace slot is in use, return
7801 * (or increment refcount if same trace).
7802 */
7803 static boolean_t
7804 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7805 {
7806 struct mallocation *allocation;
7807 struct mtrace *trace;
7808 uint32_t trace_index;
7809
7810 /* Quit if someone else modifying the tables */
7811 if (!lck_mtx_try_lock_spin(mleak_lock)) {
7812 mleak_table.total_conflicts++;
7813 return FALSE;
7814 }
7815
7816 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7817 mleak_alloc_buckets)];
7818 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7819 trace = &mleak_traces[trace_index];
7820
7821 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7822 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7823
7824 allocation->hitcount++;
7825 trace->hitcount++;
7826
7827 /*
7828 * If the allocation bucket we want is occupied
7829 * and the occupier has the same trace, just bail.
7830 */
7831 if (allocation->element != NULL &&
7832 trace_index == allocation->trace_index) {
7833 mleak_table.alloc_collisions++;
7834 lck_mtx_unlock(mleak_lock);
7835 return TRUE;
7836 }
7837
7838 /*
7839 * Store the backtrace in the traces array;
7840 * Size of zero = trace bucket is free.
7841 */
7842 if (trace->allocs > 0 &&
7843 bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
7844 /* Different, unique trace, but the same hash! Bail out. */
7845 trace->collisions++;
7846 mleak_table.trace_collisions++;
7847 lck_mtx_unlock(mleak_lock);
7848 return TRUE;
7849 } else if (trace->allocs > 0) {
7850 /* Same trace, already added, so increment refcount */
7851 trace->allocs++;
7852 } else {
7853 /* Found an unused trace bucket, so record the trace here */
7854 if (trace->depth != 0) {
7855 /* this slot previously used but not currently in use */
7856 mleak_table.trace_overwrites++;
7857 }
7858 mleak_table.trace_recorded++;
7859 trace->allocs = 1;
7860 memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
7861 trace->depth = depth;
7862 trace->collisions = 0;
7863 }
7864
7865 /* Step 2: Store the allocation record in the allocations array */
7866 if (allocation->element != NULL) {
7867 /*
7868 * Replace an existing allocation. No need to preserve
7869 * because only a subset of the allocations are being
7870 * recorded anyway.
7871 */
7872 mleak_table.alloc_collisions++;
7873 } else if (allocation->trace_index != 0) {
7874 mleak_table.alloc_overwrites++;
7875 }
7876 allocation->element = addr;
7877 allocation->trace_index = trace_index;
7878 allocation->count = num;
7879 mleak_table.alloc_recorded++;
7880 mleak_table.outstanding_allocs++;
7881
7882 lck_mtx_unlock(mleak_lock);
7883 return TRUE;
7884 }
7885
7886 static void
7887 mleak_free(mcache_obj_t *addr)
7888 {
7889 while (addr != NULL) {
7890 struct mallocation *allocation = &mleak_allocations
7891 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7892
7893 if (allocation->element == addr &&
7894 allocation->trace_index < mleak_trace_buckets) {
7895 lck_mtx_lock_spin(mleak_lock);
7896 if (allocation->element == addr &&
7897 allocation->trace_index < mleak_trace_buckets) {
7898 struct mtrace *trace;
7899 trace = &mleak_traces[allocation->trace_index];
7900 /* allocs = 0 means trace bucket is unused */
7901 if (trace->allocs > 0) {
7902 trace->allocs--;
7903 }
7904 if (trace->allocs == 0) {
7905 trace->depth = 0;
7906 }
7907 /* NULL element means alloc bucket is unused */
7908 allocation->element = NULL;
7909 mleak_table.outstanding_allocs--;
7910 }
7911 lck_mtx_unlock(mleak_lock);
7912 }
7913 addr = addr->obj_next;
7914 }
7915 }
7916
7917 static void
7918 mleak_sort_traces()
7919 {
7920 int i, j, k;
7921 struct mtrace *swap;
7922
7923 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7924 mleak_top_trace[i] = NULL;
7925 }
7926
7927 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
7928 if (mleak_traces[i].allocs <= 0) {
7929 continue;
7930 }
7931
7932 mleak_top_trace[j] = &mleak_traces[i];
7933 for (k = j; k > 0; k--) {
7934 if (mleak_top_trace[k]->allocs <=
7935 mleak_top_trace[k - 1]->allocs) {
7936 break;
7937 }
7938
7939 swap = mleak_top_trace[k - 1];
7940 mleak_top_trace[k - 1] = mleak_top_trace[k];
7941 mleak_top_trace[k] = swap;
7942 }
7943 j++;
7944 }
7945
7946 j--;
7947 for (; i < mleak_trace_buckets; i++) {
7948 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
7949 continue;
7950 }
7951
7952 mleak_top_trace[j] = &mleak_traces[i];
7953
7954 for (k = j; k > 0; k--) {
7955 if (mleak_top_trace[k]->allocs <=
7956 mleak_top_trace[k - 1]->allocs) {
7957 break;
7958 }
7959
7960 swap = mleak_top_trace[k - 1];
7961 mleak_top_trace[k - 1] = mleak_top_trace[k];
7962 mleak_top_trace[k] = swap;
7963 }
7964 }
7965 }
7966
7967 static void
7968 mleak_update_stats()
7969 {
7970 mleak_trace_stat_t *mltr;
7971 int i;
7972
7973 VERIFY(mleak_stat != NULL);
7974 #ifdef __LP64__
7975 VERIFY(mleak_stat->ml_isaddr64);
7976 #else
7977 VERIFY(!mleak_stat->ml_isaddr64);
7978 #endif /* !__LP64__ */
7979 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7980
7981 mleak_sort_traces();
7982
7983 mltr = &mleak_stat->ml_trace[0];
7984 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
7985 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7986 int j;
7987
7988 if (mleak_top_trace[i] == NULL ||
7989 mleak_top_trace[i]->allocs == 0) {
7990 continue;
7991 }
7992
7993 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
7994 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
7995 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
7996 mltr->mltr_depth = mleak_top_trace[i]->depth;
7997
7998 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7999 for (j = 0; j < mltr->mltr_depth; j++) {
8000 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
8001 }
8002
8003 mltr++;
8004 }
8005 }
8006
8007 static struct mbtypes {
8008 int mt_type;
8009 const char *mt_name;
8010 } mbtypes[] = {
8011 { MT_DATA, "data" },
8012 { MT_OOBDATA, "oob data" },
8013 { MT_CONTROL, "ancillary data" },
8014 { MT_HEADER, "packet headers" },
8015 { MT_SOCKET, "socket structures" },
8016 { MT_PCB, "protocol control blocks" },
8017 { MT_RTABLE, "routing table entries" },
8018 { MT_HTABLE, "IMP host table entries" },
8019 { MT_ATABLE, "address resolution tables" },
8020 { MT_FTABLE, "fragment reassembly queue headers" },
8021 { MT_SONAME, "socket names and addresses" },
8022 { MT_SOOPTS, "socket options" },
8023 { MT_RIGHTS, "access rights" },
8024 { MT_IFADDR, "interface addresses" },
8025 { MT_TAG, "packet tags" },
8026 { 0, NULL }
8027 };
8028
8029 #define MBUF_DUMP_BUF_CHK() { \
8030 clen -= k; \
8031 if (clen < 1) \
8032 goto done; \
8033 c += k; \
8034 }
8035
8036 static char *
8037 mbuf_dump(void)
8038 {
8039 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
8040 totreturned = 0;
8041 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
8042 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
8043 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
8044 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
8045 uint8_t seen[256];
8046 struct mbtypes *mp;
8047 mb_class_stat_t *sp;
8048 mleak_trace_stat_t *mltr;
8049 char *c = mbuf_dump_buf;
8050 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
8051 struct mbuf_watchdog_defunct_args args = {};
8052
8053 mbuf_dump_buf[0] = '\0';
8054
8055 /* synchronize all statistics in the mbuf table */
8056 mbuf_stat_sync();
8057 mbuf_mtypes_sync(TRUE);
8058
8059 sp = &mb_stat->mbs_class[0];
8060 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
8061 u_int32_t mem;
8062
8063 if (m_class(i) == MC_MBUF) {
8064 m_mbufs = sp->mbcl_active;
8065 } else if (m_class(i) == MC_CL) {
8066 m_clfree = sp->mbcl_total - sp->mbcl_active;
8067 } else if (m_class(i) == MC_BIGCL) {
8068 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
8069 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
8070 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
8071 m_16kclusters = sp->mbcl_total;
8072 } else if (m_class(i) == MC_MBUF_CL) {
8073 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
8074 } else if (m_class(i) == MC_MBUF_BIGCL) {
8075 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
8076 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
8077 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
8078 }
8079
8080 mem = sp->mbcl_ctotal * sp->mbcl_size;
8081 totmem += mem;
8082 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
8083 sp->mbcl_size;
8084 totreturned += sp->mbcl_release_cnt;
8085 }
8086
8087 /* adjust free counts to include composite caches */
8088 m_clfree += m_mbufclfree;
8089 m_bigclfree += m_mbufbigclfree;
8090 m_16kclfree += m_mbuf16kclfree;
8091
8092 totmbufs = 0;
8093 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8094 totmbufs += mbstat.m_mtypes[mp->mt_type];
8095 }
8096 if (totmbufs > m_mbufs) {
8097 totmbufs = m_mbufs;
8098 }
8099 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
8100 MBUF_DUMP_BUF_CHK();
8101
8102 bzero(&seen, sizeof(seen));
8103 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8104 if (mbstat.m_mtypes[mp->mt_type] != 0) {
8105 seen[mp->mt_type] = 1;
8106 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
8107 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
8108 MBUF_DUMP_BUF_CHK();
8109 }
8110 }
8111 seen[MT_FREE] = 1;
8112 for (i = 0; i < nmbtypes; i++) {
8113 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
8114 k = scnprintf(c, clen, "\t%u mbufs allocated to "
8115 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
8116 MBUF_DUMP_BUF_CHK();
8117 }
8118 }
8119 if ((m_mbufs - totmbufs) > 0) {
8120 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
8121 m_mbufs - totmbufs);
8122 MBUF_DUMP_BUF_CHK();
8123 }
8124 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
8125 "%u/%u mbuf 4KB clusters in use\n",
8126 (unsigned int)(mbstat.m_clusters - m_clfree),
8127 (unsigned int)mbstat.m_clusters,
8128 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
8129 (unsigned int)mbstat.m_bigclusters);
8130 MBUF_DUMP_BUF_CHK();
8131
8132 if (njcl > 0) {
8133 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
8134 m_16kclusters - m_16kclfree, m_16kclusters,
8135 njclbytes / 1024);
8136 MBUF_DUMP_BUF_CHK();
8137 }
8138 totused = totmem - totfree;
8139 if (totmem == 0) {
8140 totpct = 0;
8141 } else if (totused < (ULONG_MAX / 100)) {
8142 totpct = (totused * 100) / totmem;
8143 } else {
8144 u_long totmem1 = totmem / 100;
8145 u_long totused1 = totused / 100;
8146 totpct = (totused1 * 100) / totmem1;
8147 }
8148 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
8149 "in use)\n", totmem / 1024, totpct);
8150 MBUF_DUMP_BUF_CHK();
8151 k = scnprintf(c, clen, "%lu KB returned to the system\n",
8152 totreturned / 1024);
8153 MBUF_DUMP_BUF_CHK();
8154
8155 net_update_uptime();
8156
8157 k = scnprintf(c, clen,
8158 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
8159 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
8160 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
8161 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
8162 mb_expand_16kcl_total);
8163 MBUF_DUMP_BUF_CHK();
8164 if (mbuf_worker_last_runtime != 0) {
8165 k = scnprintf(c, clen, "worker thread last run time: "
8166 "%llu (%llu seconds ago)\n",
8167 mbuf_worker_last_runtime,
8168 net_uptime() - mbuf_worker_last_runtime);
8169 MBUF_DUMP_BUF_CHK();
8170 }
8171 if (mbuf_drain_last_runtime != 0) {
8172 k = scnprintf(c, clen, "drain routine last run time: "
8173 "%llu (%llu seconds ago)\n",
8174 mbuf_drain_last_runtime,
8175 net_uptime() - mbuf_drain_last_runtime);
8176 MBUF_DUMP_BUF_CHK();
8177 }
8178
8179 /*
8180 * Log where the most mbufs have accumulated:
8181 * - Process socket buffers
8182 * - TCP reassembly queue
8183 * - Interface AQM queue (output) and DLIL input queue
8184 */
8185 args.non_blocking = true;
8186 proc_iterate(PROC_ALLPROCLIST,
8187 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
8188 if (args.top_app != NULL) {
8189 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
8190 args.top_app_space_used,
8191 proc_name_address(args.top_app),
8192 proc_pid(args.top_app));
8193 proc_rele(args.top_app);
8194 }
8195 MBUF_DUMP_BUF_CHK();
8196
8197 #if INET
8198 k = dump_tcp_reass_qlen(c, clen);
8199 MBUF_DUMP_BUF_CHK();
8200 #endif /* INET */
8201
8202 #if MPTCP
8203 k = dump_mptcp_reass_qlen(c, clen);
8204 MBUF_DUMP_BUF_CHK();
8205 #endif /* MPTCP */
8206
8207 #if NETWORKING
8208 k = dlil_dump_top_if_qlen(c, clen);
8209 MBUF_DUMP_BUF_CHK();
8210 #endif /* NETWORKING */
8211
8212 /* mbuf leak detection statistics */
8213 mleak_update_stats();
8214
8215 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
8216 MBUF_DUMP_BUF_CHK();
8217 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
8218 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
8219 mleak_table.mleak_sample_factor);
8220 MBUF_DUMP_BUF_CHK();
8221 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
8222 mleak_table.outstanding_allocs);
8223 MBUF_DUMP_BUF_CHK();
8224 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
8225 mleak_table.alloc_recorded, mleak_table.trace_recorded);
8226 MBUF_DUMP_BUF_CHK();
8227 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
8228 mleak_table.alloc_collisions, mleak_table.trace_collisions);
8229 MBUF_DUMP_BUF_CHK();
8230 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
8231 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
8232 MBUF_DUMP_BUF_CHK();
8233 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
8234 mleak_table.total_conflicts);
8235 MBUF_DUMP_BUF_CHK();
8236
8237 k = scnprintf(c, clen, "top %d outstanding traces:\n",
8238 mleak_stat->ml_cnt);
8239 MBUF_DUMP_BUF_CHK();
8240 for (i = 0; i < mleak_stat->ml_cnt; i++) {
8241 mltr = &mleak_stat->ml_trace[i];
8242 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
8243 "%llu hit(s), %llu collision(s)\n", (i + 1),
8244 mltr->mltr_allocs, mltr->mltr_hitcount,
8245 mltr->mltr_collisions);
8246 MBUF_DUMP_BUF_CHK();
8247 }
8248
8249 if (mleak_stat->ml_isaddr64) {
8250 k = scnprintf(c, clen, MB_LEAK_HDR_64);
8251 } else {
8252 k = scnprintf(c, clen, MB_LEAK_HDR_32);
8253 }
8254 MBUF_DUMP_BUF_CHK();
8255
8256 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
8257 k = scnprintf(c, clen, "%2d: ", (i + 1));
8258 MBUF_DUMP_BUF_CHK();
8259 for (j = 0; j < mleak_stat->ml_cnt; j++) {
8260 mltr = &mleak_stat->ml_trace[j];
8261 if (i < mltr->mltr_depth) {
8262 if (mleak_stat->ml_isaddr64) {
8263 k = scnprintf(c, clen, "0x%0llx ",
8264 (uint64_t)VM_KERNEL_UNSLIDE(
8265 mltr->mltr_addr[i]));
8266 } else {
8267 k = scnprintf(c, clen,
8268 "0x%08x ",
8269 (uint32_t)VM_KERNEL_UNSLIDE(
8270 mltr->mltr_addr[i]));
8271 }
8272 } else {
8273 if (mleak_stat->ml_isaddr64) {
8274 k = scnprintf(c, clen,
8275 MB_LEAK_SPACING_64);
8276 } else {
8277 k = scnprintf(c, clen,
8278 MB_LEAK_SPACING_32);
8279 }
8280 }
8281 MBUF_DUMP_BUF_CHK();
8282 }
8283 k = scnprintf(c, clen, "\n");
8284 MBUF_DUMP_BUF_CHK();
8285 }
8286
8287 done:
8288 return mbuf_dump_buf;
8289 }
8290
8291 #undef MBUF_DUMP_BUF_CHK
8292
8293 /*
8294 * Convert between a regular and a packet header mbuf. Caller is responsible
8295 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
8296 */
8297 int
8298 m_reinit(struct mbuf *m, int hdr)
8299 {
8300 int ret = 0;
8301
8302 if (hdr) {
8303 VERIFY(!(m->m_flags & M_PKTHDR));
8304 if (!(m->m_flags & M_EXT) &&
8305 (m->m_data != m->m_dat || m->m_len > 0)) {
8306 /*
8307 * If there's no external cluster attached and the
8308 * mbuf appears to contain user data, we cannot
8309 * safely convert this to a packet header mbuf,
8310 * as the packet header structure might overlap
8311 * with the data.
8312 */
8313 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
8314 "m_data %llx (expected %llx), "
8315 "m_len %d (expected 0)\n",
8316 __func__,
8317 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
8318 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
8319 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
8320 ret = EBUSY;
8321 } else {
8322 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
8323 m->m_flags |= M_PKTHDR;
8324 MBUF_INIT_PKTHDR(m);
8325 }
8326 } else {
8327 /* Check for scratch area overflow */
8328 m_redzone_verify(m);
8329 /* Free the aux data and tags if there is any */
8330 m_tag_delete_chain(m, NULL);
8331 m_do_tx_compl_callback(m, NULL);
8332 m->m_flags &= ~M_PKTHDR;
8333 }
8334
8335 return ret;
8336 }
8337
8338 int
8339 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
8340 {
8341 ASSERT(m->m_flags & M_EXT);
8342 return atomic_test_set_32(&MEXT_PRIV(m), o, n);
8343 }
8344
8345 uint32_t
8346 m_ext_get_prop(struct mbuf *m)
8347 {
8348 ASSERT(m->m_flags & M_EXT);
8349 return MEXT_PRIV(m);
8350 }
8351
/*
 * Report whether a paired mbuf is active, i.e. its MEXT_PREF() count is
 * above MEXT_MINREF().  An mbuf that is not paired is always reported as
 * active (1).
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}

	return MEXT_PREF(m) > MEXT_MINREF(m);
}
8357
8358 void
8359 m_ext_paired_activate(struct mbuf *m)
8360 {
8361 struct ext_ref *rfa;
8362 int hdr, type;
8363 caddr_t extbuf;
8364 m_ext_free_func_t extfree;
8365 u_int extsize;
8366
8367 VERIFY(MBUF_IS_PAIRED(m));
8368 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
8369 VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
8370
8371 hdr = (m->m_flags & M_PKTHDR);
8372 type = m->m_type;
8373 extbuf = m->m_ext.ext_buf;
8374 extfree = m_get_ext_free(m);
8375 extsize = m->m_ext.ext_size;
8376 rfa = m_get_rfa(m);
8377
8378 VERIFY(extbuf != NULL && rfa != NULL);
8379
8380 /*
8381 * Safe to reinitialize packet header tags, since it's
8382 * already taken care of at m_free() time. Similar to
8383 * what's done in m_clattach() for the cluster. Bump
8384 * up MEXT_PREF to indicate activation.
8385 */
8386 MBUF_INIT(m, hdr, type);
8387 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
8388 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
8389 }
8390
8391 void
8392 m_scratch_init(struct mbuf *m)
8393 {
8394 struct pkthdr *pkt = &m->m_pkthdr;
8395
8396 VERIFY(m->m_flags & M_PKTHDR);
8397
8398 /* See comments in <rdar://problem/14040693> */
8399 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
8400 panic_plain("Invalid attempt to modify guarded module-private "
8401 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
8402 /* NOTREACHED */
8403 }
8404
8405 bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
8406 }
8407
8408 /*
8409 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
8410 * xnu that intend on utilizing the module-private area should directly
8411 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
8412 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
8413 * to handing it off to another module, respectively.
8414 */
8415 u_int32_t
8416 m_scratch_get(struct mbuf *m, u_int8_t **p)
8417 {
8418 struct pkthdr *pkt = &m->m_pkthdr;
8419
8420 VERIFY(m->m_flags & M_PKTHDR);
8421
8422 /* See comments in <rdar://problem/14040693> */
8423 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
8424 panic_plain("Invalid attempt to access guarded module-private "
8425 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
8426 /* NOTREACHED */
8427 }
8428
8429 if (mcltrace) {
8430 mcache_audit_t *mca;
8431
8432 lck_mtx_lock(mbuf_mlock);
8433 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
8434 if (mca->mca_uflags & MB_SCVALID) {
8435 mcl_audit_scratch(mca);
8436 }
8437 lck_mtx_unlock(mbuf_mlock);
8438 }
8439
8440 *p = (u_int8_t *)&pkt->pkt_mpriv;
8441 return sizeof(pkt->pkt_mpriv);
8442 }
8443
8444 void
8445 m_add_crumb(struct mbuf *m, uint16_t crumb)
8446 {
8447 VERIFY(m->m_flags & M_PKTHDR);
8448
8449 m->m_pkthdr.pkt_crumbs |= crumb;
8450 }
8451
8452 static void
8453 m_redzone_init(struct mbuf *m)
8454 {
8455 VERIFY(m->m_flags & M_PKTHDR);
8456 /*
8457 * Each mbuf has a unique red zone pattern, which is a XOR
8458 * of the red zone cookie and the address of the mbuf.
8459 */
8460 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
8461 }
8462
8463 static void
8464 m_redzone_verify(struct mbuf *m)
8465 {
8466 u_int32_t mb_redzone;
8467
8468 VERIFY(m->m_flags & M_PKTHDR);
8469
8470 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
8471 if (m->m_pkthdr.redzone != mb_redzone) {
8472 panic("mbuf %p redzone violation with value 0x%x "
8473 "(instead of 0x%x, using cookie 0x%x)\n",
8474 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
8475 /* NOTREACHED */
8476 }
8477 }
8478
8479 __private_extern__ inline void
8480 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
8481 caddr_t ext_arg)
8482 {
8483 VERIFY(m->m_flags & M_EXT);
8484 if (rfa != NULL) {
8485 m->m_ext.ext_refflags =
8486 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
8487 if (ext_free != NULL) {
8488 rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
8489 mb_obscure_extfree;
8490 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
8491 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
8492 if (ext_arg != NULL) {
8493 m->m_ext.ext_arg =
8494 (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
8495 } else {
8496 m->m_ext.ext_arg = NULL;
8497 }
8498 } else {
8499 rfa->ext_token = 0;
8500 m->m_ext.ext_free = NULL;
8501 m->m_ext.ext_arg = NULL;
8502 }
8503 } else {
8504 /*
8505 * If we are going to loose the cookie in ext_token by
8506 * resetting the rfa, we should use the global cookie
8507 * to obscure the ext_free and ext_arg pointers.
8508 */
8509 if (ext_free != NULL) {
8510 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
8511 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
8512 if (ext_arg != NULL) {
8513 m->m_ext.ext_arg =
8514 (caddr_t)((uintptr_t)ext_arg ^
8515 mb_obscure_extfree);
8516 } else {
8517 m->m_ext.ext_arg = NULL;
8518 }
8519 } else {
8520 m->m_ext.ext_free = NULL;
8521 m->m_ext.ext_arg = NULL;
8522 }
8523 m->m_ext.ext_refflags = NULL;
8524 }
8525 }
8526
8527 __private_extern__ inline struct ext_ref *
8528 m_get_rfa(struct mbuf *m)
8529 {
8530 if (m->m_ext.ext_refflags == NULL) {
8531 return NULL;
8532 } else {
8533 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
8534 }
8535 }
8536
8537 __private_extern__ inline m_ext_free_func_t
8538 m_get_ext_free(struct mbuf *m)
8539 {
8540 struct ext_ref *rfa;
8541 if (m->m_ext.ext_free == NULL) {
8542 return NULL;
8543 }
8544
8545 rfa = m_get_rfa(m);
8546 if (rfa == NULL) {
8547 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
8548 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
8549 } else {
8550 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
8551 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
8552 }
8553 }
8554
8555 __private_extern__ inline caddr_t
8556 m_get_ext_arg(struct mbuf *m)
8557 {
8558 struct ext_ref *rfa;
8559 if (m->m_ext.ext_arg == NULL) {
8560 return NULL;
8561 }
8562
8563 rfa = m_get_rfa(m);
8564 if (rfa == NULL) {
8565 return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
8566 } else {
8567 return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
8568 rfa->ext_token);
8569 }
8570 }
8571
8572 /*
8573 * Send a report of mbuf usage if the usage is at least 6% of max limit
8574 * or if there has been at least 3% increase since the last report.
8575 *
8576 * The values 6% and 3% are chosen so that we can do simple arithmetic
8577 * with shift operations.
8578 */
8579 static boolean_t
8580 mbuf_report_usage(mbuf_class_t cl)
8581 {
8582 /* if a report is already in progress, nothing to do */
8583 if (mb_peak_newreport) {
8584 return TRUE;
8585 }
8586
8587 if (m_total(cl) > m_peak(cl) &&
8588 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
8589 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) {
8590 return TRUE;
8591 }
8592 return FALSE;
8593 }
8594
8595 __private_extern__ void
8596 mbuf_report_peak_usage(void)
8597 {
8598 int i = 0;
8599 u_int64_t uptime;
8600 struct nstat_sysinfo_data ns_data;
8601 uint32_t memreleased = 0;
8602 static uint32_t prevmemreleased;
8603
8604 uptime = net_uptime();
8605 lck_mtx_lock(mbuf_mlock);
8606
8607 /* Generate an initial report after 1 week of uptime */
8608 if (!mb_peak_firstreport &&
8609 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
8610 mb_peak_newreport = TRUE;
8611 mb_peak_firstreport = TRUE;
8612 }
8613
8614 if (!mb_peak_newreport) {
8615 lck_mtx_unlock(mbuf_mlock);
8616 return;
8617 }
8618
8619 /*
8620 * Since a report is being generated before 1 week,
8621 * we do not need to force another one later
8622 */
8623 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
8624 mb_peak_firstreport = TRUE;
8625 }
8626
8627 for (i = 0; i < NELEM(mbuf_table); i++) {
8628 m_peak(m_class(i)) = m_total(m_class(i));
8629 memreleased += m_release_cnt(i);
8630 }
8631 memreleased = memreleased - prevmemreleased;
8632 prevmemreleased = memreleased;
8633 mb_peak_newreport = FALSE;
8634 lck_mtx_unlock(mbuf_mlock);
8635
8636 bzero(&ns_data, sizeof(ns_data));
8637 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
8638 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
8639 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
8640 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
8641 ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
8642 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
8643 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
8644 ns_data.u.mb_stats.draincnt = mbstat.m_drain;
8645 ns_data.u.mb_stats.memreleased = memreleased;
8646 ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
8647
8648 nstat_sysinfo_send_data(&ns_data);
8649
8650 /*
8651 * Reset the floor whenever we report a new
8652 * peak to track the trend (increase peek usage
8653 * is not a leak if mbufs get released
8654 * between reports and the floor stays low)
8655 */
8656 total_sbmb_cnt_floor = total_sbmb_cnt_peak;
8657 }
8658
8659 /*
8660 * Simple routine to avoid taking the lock when we can't run the
8661 * mbuf drain.
8662 */
8663 static int
8664 mbuf_drain_checks(boolean_t ignore_waiters)
8665 {
8666 if (mb_drain_maxint == 0) {
8667 return 0;
8668 }
8669 if (!ignore_waiters && mb_waiters != 0) {
8670 return 0;
8671 }
8672
8673 return 1;
8674 }
8675
8676 /*
8677 * Called by the VM when there's memory pressure or when we exhausted
8678 * the 4k/16k reserved space.
8679 */
8680 static void
8681 mbuf_drain_locked(boolean_t ignore_waiters)
8682 {
8683 mbuf_class_t mc;
8684 mcl_slab_t *sp, *sp_tmp, *nsp;
8685 unsigned int num, k, interval, released = 0;
8686 unsigned long total_mem = 0, use_mem = 0;
8687 boolean_t ret, purge_caches = FALSE;
8688 ppnum_t offset;
8689 mcache_obj_t *obj;
8690 unsigned long per;
8691 static unsigned char scratch[32];
8692 static ppnum_t scratch_pa = 0;
8693
8694 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8695 if (!mbuf_drain_checks(ignore_waiters)) {
8696 return;
8697 }
8698 if (scratch_pa == 0) {
8699 bzero(scratch, sizeof(scratch));
8700 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8701 VERIFY(scratch_pa);
8702 } else if (mclverify) {
8703 /*
8704 * Panic if a driver wrote to our scratch memory.
8705 */
8706 for (k = 0; k < sizeof(scratch); k++) {
8707 if (scratch[k]) {
8708 panic("suspect DMA to freed address");
8709 }
8710 }
8711 }
8712 /*
8713 * Don't free memory too often as that could cause excessive
8714 * waiting times for mbufs. Purge caches if we were asked to drain
8715 * in the last 5 minutes.
8716 */
8717 if (mbuf_drain_last_runtime != 0) {
8718 interval = net_uptime() - mbuf_drain_last_runtime;
8719 if (interval <= mb_drain_maxint) {
8720 return;
8721 }
8722 if (interval <= mb_drain_maxint * 5) {
8723 purge_caches = TRUE;
8724 }
8725 }
8726 mbuf_drain_last_runtime = net_uptime();
8727 /*
8728 * Don't free any memory if we're using 60% or more.
8729 */
8730 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8731 total_mem += m_total(mc) * m_maxsize(mc);
8732 use_mem += m_active(mc) * m_maxsize(mc);
8733 }
8734 per = (use_mem * 100) / total_mem;
8735 if (per >= 60) {
8736 return;
8737 }
8738 /*
8739 * Purge all the caches. This effectively disables
8740 * caching for a few seconds, but the mbuf worker thread will
8741 * re-enable them again.
8742 */
8743 if (purge_caches == TRUE) {
8744 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8745 if (m_total(mc) < m_avgtotal(mc)) {
8746 continue;
8747 }
8748 lck_mtx_unlock(mbuf_mlock);
8749 ret = mcache_purge_cache(m_cache(mc), FALSE);
8750 lck_mtx_lock(mbuf_mlock);
8751 if (ret == TRUE) {
8752 m_purge_cnt(mc)++;
8753 }
8754 }
8755 }
8756 /*
8757 * Move the objects from the composite class freelist to
8758 * the rudimentary slabs list, but keep at least 10% of the average
8759 * total in the freelist.
8760 */
8761 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8762 while (m_cobjlist(mc) &&
8763 m_total(mc) < m_avgtotal(mc) &&
8764 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8765 obj = m_cobjlist(mc);
8766 m_cobjlist(mc) = obj->obj_next;
8767 obj->obj_next = NULL;
8768 num = cslab_free(mc, obj, 1);
8769 VERIFY(num == 1);
8770 m_free_cnt(mc)++;
8771 m_infree(mc)--;
8772 /* cslab_free() handles m_total */
8773 }
8774 }
8775 /*
8776 * Free the buffers present in the slab list up to 10% of the total
8777 * average per class.
8778 *
8779 * We walk the list backwards in an attempt to reduce fragmentation.
8780 */
8781 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8782 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8783 /*
8784 * Process only unused slabs occupying memory.
8785 */
8786 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8787 sp->sl_base == NULL) {
8788 continue;
8789 }
8790 if (m_total(mc) < m_avgtotal(mc) ||
8791 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8792 break;
8793 }
8794 slab_remove(sp, mc);
8795 switch (mc) {
8796 case MC_MBUF:
8797 m_infree(mc) -= NMBPG;
8798 m_total(mc) -= NMBPG;
8799 if (mclaudit != NULL) {
8800 mcl_audit_free(sp->sl_base, NMBPG);
8801 }
8802 break;
8803 case MC_CL:
8804 m_infree(mc) -= NCLPG;
8805 m_total(mc) -= NCLPG;
8806 if (mclaudit != NULL) {
8807 mcl_audit_free(sp->sl_base, NMBPG);
8808 }
8809 break;
8810 case MC_BIGCL:
8811 {
8812 m_infree(mc) -= NBCLPG;
8813 m_total(mc) -= NBCLPG;
8814 if (mclaudit != NULL) {
8815 mcl_audit_free(sp->sl_base, NMBPG);
8816 }
8817 break;
8818 }
8819 case MC_16KCL:
8820 m_infree(mc)--;
8821 m_total(mc)--;
8822 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8823 nsp = nsp->sl_next;
8824 VERIFY(nsp->sl_refcnt == 0 &&
8825 nsp->sl_base != NULL &&
8826 nsp->sl_len == 0);
8827 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8828 0);
8829 nsp->sl_flags = 0;
8830 }
8831 if (mclaudit != NULL) {
8832 if (sp->sl_len == PAGE_SIZE) {
8833 mcl_audit_free(sp->sl_base,
8834 NMBPG);
8835 } else {
8836 mcl_audit_free(sp->sl_base, 1);
8837 }
8838 }
8839 break;
8840 default:
8841 /*
8842 * The composite classes have their own
8843 * freelist (m_cobjlist), so we only
8844 * process rudimentary classes here.
8845 */
8846 VERIFY(0);
8847 }
8848 m_release_cnt(mc) += m_size(mc);
8849 released += m_size(mc);
8850 VERIFY(sp->sl_base != NULL &&
8851 sp->sl_len >= PAGE_SIZE);
8852 offset = MTOPG(sp->sl_base);
8853 /*
8854 * Make sure the IOMapper points to a valid, but
8855 * bogus, address. This should prevent further DMA
8856 * accesses to freed memory.
8857 */
8858 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8859 mcl_paddr[offset] = 0;
8860 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8861 sp->sl_len);
8862 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8863 sp->sl_flags = 0;
8864 }
8865 }
8866 mbstat.m_drain++;
8867 mbstat.m_bigclusters = m_total(MC_BIGCL);
8868 mbstat.m_clusters = m_total(MC_CL);
8869 mbstat.m_mbufs = m_total(MC_MBUF);
8870 mbuf_stat_sync();
8871 mbuf_mtypes_sync(TRUE);
8872 }
8873
8874 __private_extern__ void
8875 mbuf_drain(boolean_t ignore_waiters)
8876 {
8877 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
8878 if (!mbuf_drain_checks(ignore_waiters)) {
8879 return;
8880 }
8881 lck_mtx_lock(mbuf_mlock);
8882 mbuf_drain_locked(ignore_waiters);
8883 lck_mtx_unlock(mbuf_mlock);
8884 }
8885
8886
8887 static int
8888 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8889 {
8890 #pragma unused(arg1, arg2)
8891 int val = 0, err;
8892
8893 err = sysctl_handle_int(oidp, &val, 0, req);
8894 if (err != 0 || req->newptr == USER_ADDR_NULL) {
8895 return err;
8896 }
8897 if (val) {
8898 mbuf_drain(TRUE);
8899 }
8900
8901 return err;
8902 }
8903
#if DEBUG || DEVELOPMENT
/*
 * Append one formatted entry to the mbuf watchdog log (mbwdog_logging).
 *
 * Each entry is prefixed with a newline, the current uptime as
 * seconds.microseconds, the calling pid, the permuted thread address,
 * and the func/line passed by the caller.  The caller must hold
 * mbuf_mlock.
 *
 * The log buffer is allocated on first use.  When a new entry would not
 * fit (including its NUL terminator), the older half of the log is
 * discarded before the entry is appended.
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Format the caller's message first, then wrap it with the header. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	/*
	 * Microseconds are zero-padded to six digits so that, e.g., 5us
	 * is printed as ".000005" rather than ".5".
	 */
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%06d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	/*
	 * strlcat() needs one spare byte for the terminating NUL, so the
	 * entry only fits when used + len is strictly below the buffer
	 * size.  Otherwise drop the older half of the log to make room.
	 */
	if (mbwdog_logging_used + len >= mbwdog_logging_size) {
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}

#endif // DEBUG || DEVELOPMENT
8947
8948 static void
8949 mtracelarge_register(size_t size)
8950 {
8951 int i;
8952 struct mtracelarge *trace;
8953 uintptr_t bt[MLEAK_STACK_DEPTH];
8954 unsigned int depth;
8955
8956 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
8957 /* Check if this entry is already on the list. */
8958 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8959 trace = &mtracelarge_table[i];
8960 if (trace->size == size && trace->depth == depth &&
8961 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8962 return;
8963 }
8964 }
8965 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8966 trace = &mtracelarge_table[i];
8967 if (size > trace->size) {
8968 trace->depth = depth;
8969 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8970 trace->size = size;
8971 break;
8972 }
8973 }
8974 }
8975
#if DEBUG || DEVELOPMENT

/*
 * Sysctl handler that returns the mbuf watchdog dump as a string.
 *
 * mbuf_dump() is called with the ifnet head lock held shared and
 * mbuf_mlock held; the two locks are released in the reverse order of
 * acquisition before the string is copied out.
 *
 * NOTE(review): the string is passed to sysctl_io_string() after both
 * locks have been dropped; whether the buffer returned by mbuf_dump()
 * can change in the meantime is not visible from here.
 */
static int
mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
{
	char *dump;

	ifnet_head_lock_shared();
	lck_mtx_lock(mbuf_mlock);
	dump = mbuf_dump();
	lck_mtx_unlock(mbuf_mlock);
	ifnet_head_done();

	return sysctl_io_string(req, dump, 0, 0, NULL);
}

#endif /* DEBUG || DEVELOPMENT */
8995
8996 SYSCTL_DECL(_kern_ipc);
8997 #if DEBUG || DEVELOPMENT
8998 #if SKYWALK
8999 SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
9000 CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
9001 MC_THRESHOLD_SCALE_DOWN_FACTOR,
9002 "scale down factor for mbuf cache thresholds");
9003 #endif /* SKYWALK */
9004 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
9005 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
9006 0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
9007 #endif
9008 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
9009 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
9010 0, 0, mbstat_sysctl, "S,mbstat", "");
9011 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
9012 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
9013 0, 0, mb_stat_sysctl, "S,mb_stat", "");
9014 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
9015 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
9016 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
9017 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
9018 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
9019 0, 0, mleak_table_sysctl, "S,mleak_table", "");
9020 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
9021 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
9022 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
9023 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
9024 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
9025 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
9026 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
9027 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
9028 m_drain_force_sysctl, "I",
9029 "Forces the mbuf garbage collection to run");
9030 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
9031 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
9032 "Minimum time interval between garbage collection");
9033 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
9034 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
9035 "Percentage of when we trigger memory-pressure for an mbuf-class");
9036