1 /*
2 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <dev/random/randomdev.h>
87
88 #include <kern/kern_types.h>
89 #include <kern/simple_lock.h>
90 #include <kern/queue.h>
91 #include <kern/sched_prim.h>
92 #include <kern/backtrace.h>
93 #include <kern/percpu.h>
94 #include <kern/zalloc.h>
95
96 #include <libkern/OSAtomic.h>
97 #include <libkern/OSDebug.h>
98 #include <libkern/libkern.h>
99
100 #include <os/log.h>
101 #include <os/ptrtools.h>
102
103 #include <IOKit/IOMapper.h>
104
105 #include <machine/limits.h>
106 #include <machine/machine_routines.h>
107
108 #include <sys/mcache.h>
109 #include <net/ntstat.h>
110
111 #if INET
112 extern int dump_tcp_reass_qlen(char *, int);
113 extern int tcp_reass_qlen_space(struct socket *);
114 #endif /* INET */
115
116 #if MPTCP
117 extern int dump_mptcp_reass_qlen(char *, int);
118 #endif /* MPTCP */
119
120
121 #if NETWORKING
122 extern int dlil_dump_top_if_qlen(char *, int);
123 #endif /* NETWORKING */
124
125 /*
126 * MBUF IMPLEMENTATION NOTES.
127 *
128 * There is a total of 5 per-CPU caches:
129 *
130 * MC_MBUF:
131 * This is a cache of rudimentary objects of MSIZE in size; each
132 * object represents an mbuf structure. This cache preserves only
133 * the m_type field of the mbuf during its transactions.
134 *
135 * MC_CL:
136 * This is a cache of rudimentary objects of MCLBYTES in size; each
137 * object represents a mcluster structure. This cache does not
138 * preserve the contents of the objects during its transactions.
139 *
140 * MC_BIGCL:
141 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
142 * object represents a mbigcluster structure. This cache does not
143 * preserve the contents of the objects during its transaction.
144 *
145 * MC_MBUF_CL:
146 * This is a cache of mbufs each having a cluster attached to it.
147 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
148 * fields of the mbuf related to the external cluster are preserved
149 * during transactions.
150 *
151 * MC_MBUF_BIGCL:
152 * This is a cache of mbufs each having a big cluster attached to it.
153 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
154 * fields of the mbuf related to the external cluster are preserved
155 * during transactions.
156 *
157 * OBJECT ALLOCATION:
158 *
159 * Allocation requests are handled first at the per-CPU (mcache) layer
160 * before falling back to the slab layer. Performance is optimal when
161 * the request is satisfied at the CPU layer because global data/lock
162 * never gets accessed. When the slab layer is entered for allocation,
163 * the slab freelist will be checked first for available objects before
164 * the VM backing store is invoked. Slab layer operations are serialized
165 * for all of the caches as the mbuf global lock is held most of the time.
166 * Allocation paths are different depending on the class of objects:
167 *
168 * a. Rudimentary object:
169 *
170 * { m_get_common(), m_clattach(), m_mclget(),
171 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
172 * composite object allocation }
173 * | ^
174 * | |
175 * | +-----------------------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_slab_alloc() |
184 * | |
185 * v |
186 * +---------> [freelist] -------> (found?) -------+
187 * | |
188 * | v
189 * | m_clalloc()
190 * | |
191 * | v
192 * +---<<---- kmem_mb_alloc()
193 *
194 * b. Composite object:
195 *
196 * { m_getpackets_internal(), m_allocpacket_internal() }
197 * | ^
198 * | |
199 * | +------ (done) ---------+
200 * v |
201 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
202 * | ^
203 * v |
204 * [CPU cache] -------> (found?) -------+
205 * | |
206 * v |
207 * mbuf_cslab_alloc() |
208 * | |
209 * v |
210 * [freelist] -------> (found?) -------+
211 * | |
212 * v |
213 * (rudimentary object) |
214 * mcache_alloc/mcache_alloc_ext() ------>>-----+
215 *
216 * Auditing notes: If auditing is enabled, buffers will be subjected to
217 * integrity checks by the audit routine. This is done by verifying their
218 * contents against DEADBEEF (free) pattern before returning them to caller.
219 * As part of this step, the routine will also record the transaction and
220 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
221 * also restore any constructed data structure fields if necessary.
222 *
223 * OBJECT DEALLOCATION:
224 *
225 * Freeing an object simply involves placing it into the CPU cache; this
226 * pollutes the cache to benefit subsequent allocations. The slab layer
227 * will only be entered if the object is to be purged out of the cache.
228 * During normal operations, this happens only when the CPU layer resizes
229 * its bucket while it's adjusting to the allocation load. Deallocation
230 * paths are different depending on the class of objects:
231 *
232 * a. Rudimentary object:
233 *
234 * { m_free(), m_freem_list(), composite object deallocation }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_slab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_slab_free() |
249 * | |
250 * v |
251 * [freelist] ----------->>------------+
252 * (objects get purged to VM only on demand)
253 *
254 * b. Composite object:
255 *
256 * { m_free(), m_freem_list() }
257 * | ^
258 * | |
259 * | +------ (done) ---------+
260 * v |
261 * mcache_free/mcache_free_ext() |
262 * | |
263 * v |
264 * mbuf_cslab_audit() |
265 * | |
266 * v |
267 * [CPU cache] ---> (not purging?) -----+
268 * | |
269 * v |
270 * mbuf_cslab_free() |
271 * | |
272 * v |
273 * [freelist] ---> (not purging?) -----+
274 * | |
275 * v |
276 * (rudimentary object) |
277 * mcache_free/mcache_free_ext() ------->>------+
278 *
279 * Auditing notes: If auditing is enabled, the audit routine will save
280 * any constructed data structure fields (if necessary) before filling the
281 * contents of the buffers with DEADBEEF (free) pattern and recording the
282 * transaction. Buffers that are freed (whether at CPU or slab layer) are
283 * expected to contain the free pattern.
284 *
285 * DEBUGGING:
286 *
287 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
288 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
289 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
290 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
291 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
292 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
293 *
294 * Each object is associated with exactly one mcache_audit_t structure that
295 * contains the information related to its last buffer transaction. Given
296 * an address of an object, the audit structure can be retrieved by finding
297 * the position of the object relevant to the base address of the cluster:
298 *
299 * +------------+ +=============+
300 * | mbuf addr | | mclaudit[i] |
301 * +------------+ +=============+
302 * | | cl_audit[0] |
303 * i = MTOBG(addr) +-------------+
304 * | +-----> | cl_audit[1] | -----> mcache_audit_t
305 * b = BGTOM(i) | +-------------+
306 * | | | ... |
307 * x = MCLIDX(b, addr) | +-------------+
308 * | | | cl_audit[7] |
309 * +-----------------+ +-------------+
310 * (e.g. x == 1)
311 *
312 * The mclaudit[] array is allocated at initialization time, but its contents
313 * get populated when the corresponding cluster is created. Because a page
314 * can be turned into NMBPG number of mbufs, we preserve enough space for the
315 * mbufs so that there is a 1-to-1 mapping between them. A page that never
316 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
317 * remaining entries unused. For 16KB cluster, only one entry from the first
318 * page is allocated and used for the entire object.
319 */
320
321 /* TODO: should be in header file */
/* kernel translator */
323 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
324 extern vm_map_t mb_map; /* special map */
325
326 static uint32_t mb_kmem_contig_failed;
327 static uint32_t mb_kmem_failed;
328 static uint32_t mb_kmem_one_failed;
329 /* Timestamp of allocation failures. */
330 static uint64_t mb_kmem_contig_failed_ts;
331 static uint64_t mb_kmem_failed_ts;
332 static uint64_t mb_kmem_one_failed_ts;
333 static uint64_t mb_kmem_contig_failed_size;
334 static uint64_t mb_kmem_failed_size;
335 static uint32_t mb_kmem_stats[6];
336
337 /* Global lock */
338 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
339 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
340 static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
341
342 /* Back-end (common) layer */
343 static uint64_t mb_expand_cnt;
344 static uint64_t mb_expand_cl_cnt;
345 static uint64_t mb_expand_cl_total;
346 static uint64_t mb_expand_bigcl_cnt;
347 static uint64_t mb_expand_bigcl_total;
348 static uint64_t mb_expand_16kcl_cnt;
349 static uint64_t mb_expand_16kcl_total;
350 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
351 static uint32_t mbuf_worker_run_cnt;
352 static uint64_t mbuf_worker_last_runtime;
353 static uint64_t mbuf_drain_last_runtime;
354 static int mbuf_worker_ready; /* worker thread is runnable */
355 static unsigned int ncpu; /* number of CPUs */
356 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
357 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
358 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
359 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
360 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
361 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
362 static unsigned int mb_normalized; /* number of packets "normalized" */
363
364 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
365 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
366
/*
 * Buffer classes managed by the allocator: four rudimentary classes
 * (plain objects) followed by three composite classes (mbuf + attached
 * cluster); see the implementation notes above.
 */
typedef enum {
	MC_MBUF = 0,    /* Regular mbuf */
	MC_CL,          /* Cluster */
	MC_BIGCL,       /* Large (4KB) cluster */
	MC_16KCL,       /* Jumbo (16KB) cluster */
	MC_MBUF_CL,     /* mbuf + cluster */
	MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;
376
377 #define MBUF_CLASS_MIN MC_MBUF
378 #define MBUF_CLASS_MAX MC_MBUF_16KCL
379 #define MBUF_CLASS_LAST MC_16KCL
380 #define MBUF_CLASS_VALID(c) \
381 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
382 #define MBUF_CLASS_COMPOSITE(c) \
383 ((int)(c) > MBUF_CLASS_LAST)
384
385
386 /*
387 * mbuf specific mcache allocation request flags.
388 */
389 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
390
391 /*
392 * Per-cluster slab structure.
393 *
394 * A slab is a cluster control structure that contains one or more object
395 * chunks; the available chunks are chained in the slab's freelist (sl_head).
396 * Each time a chunk is taken out of the slab, the slab's reference count
397 * gets incremented. When all chunks have been taken out, the empty slab
398 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
399 * returned to a slab causes the slab's reference count to be decremented;
400 * it also causes the slab to be reinserted back to class's slab list, if
401 * it's not already done.
402 *
403 * Compartmentalizing of the object chunks into slabs allows us to easily
404 * merge one or more slabs together when the adjacent slabs are idle, as
405 * well as to convert or move a slab from one class to another; e.g. the
406 * mbuf cluster slab can be converted to a regular cluster slab when all
407 * mbufs in the slab have been freed.
408 *
409 * A slab may also span across multiple clusters for chunks larger than
410 * a cluster's size. In this case, only the slab of the first cluster is
411 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
412 * that they are part of the larger slab.
413 *
414 * Each slab controls a page of memory.
415 */
/*
 * Slab control structure (one per page; see the per-cluster slab notes
 * above).  sl_flags holds the SLF_* bits defined below.
 */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t sl_class;              /* controlling mbuf class */
	int8_t sl_refcnt;               /* outstanding allocations */
	int8_t sl_chunks;               /* chunks (bufs) in this slab */
	u_int16_t sl_flags;             /* slab flags (see SLF_* below) */
	u_int16_t sl_len;               /* slab length */
	void *sl_base;                  /* base of allocated memory */
	void *sl_head;                  /* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
} mcl_slab_t;
427
428 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
429 #define SLF_PARTIAL 0x0002 /* part of another slab */
430 #define SLF_DETACHED 0x0004 /* not in slab freelist */
431
432 /*
433 * The array of slabs are broken into groups of arrays per 1MB of kernel
434 * memory to reduce the footprint. Each group is allocated on demand
435 * whenever a new piece of memory mapped in from the VM crosses the 1MB
436 * boundary.
437 */
438 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
439
typedef struct mcl_slabg {
	/*
	 * Group of slabs: an array of NSLABSPMB mcl_slab_t entries, one
	 * per page in a 1MB span of mapped kernel memory (see above).
	 */
	mcl_slab_t      *slg_slab;      /* group of slabs */
} mcl_slabg_t;
443
444 /*
445 * Number of slabs needed to control a 16KB cluster object.
446 */
447 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
448
449 /*
450 * Per-cluster audit structure.
451 */
typedef struct {
	/*
	 * Array of audit pointers for the objects carved out of one
	 * cluster page; see the mclaudit[] diagram above (cl_audit[0..7]).
	 */
	mcache_audit_t **cl_audit;      /* array of audits */
} mcl_audit_t;
455
/*
 * Scratch record of the current and previous buffer transactions:
 * the owning thread, a millisecond timestamp, and a PC stack trace
 * for each.
 */
typedef struct {
	struct thread *msa_thread;      /* thread doing transaction */
	struct thread *msa_pthread;     /* previous transaction thread */
	uint32_t msa_tstamp;            /* transaction timestamp (ms) */
	uint32_t msa_ptstamp;           /* prev transaction timestamp (ms) */
	uint16_t msa_depth;             /* pc stack depth */
	uint16_t msa_pdepth;            /* previous transaction pc stack */
	void *msa_stack[MCACHE_STACK_DEPTH];    /* current transaction backtrace */
	void *msa_pstack[MCACHE_STACK_DEPTH];   /* previous transaction backtrace */
} mcl_scratch_audit_t;
466
/* Saved contents of an audited mbuf plus its transaction history. */
typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers m_hdr,
	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf gets copied into it when the mbuf
	 * is freed.  This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
	 * cluster cache case).  Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof(_m_ext_t)];
	mcl_scratch_audit_t sc_scratch __attribute__((aligned(8))); /* current/prev transaction record */
} mcl_saved_contents_t;
481
482 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
483
484 #define MCA_SAVED_MBUF_PTR(_mca) \
485 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
486 (_mca)->mca_contents)->sc_mbuf)
487 #define MCA_SAVED_MBUF_SIZE \
488 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
489 #define MCA_SAVED_SCRATCH_PTR(_mca) \
490 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
491
492 /*
493 * mbuf specific mcache audit flags
494 */
495 #define MB_INUSE 0x01 /* object has not been returned to slab */
496 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
497 #define MB_SCVALID 0x04 /* object has valid saved contents */
498
499 /*
500 * Each of the following two arrays hold up to nmbclusters elements.
501 */
502 static mcl_audit_t *mclaudit; /* array of cluster audit information */
503 static unsigned int maxclaudit; /* max # of entries in audit table */
504 static mcl_slabg_t **slabstbl; /* cluster slabs table */
505 static unsigned int maxslabgrp; /* max # of entries in slabs table */
506 static unsigned int slabgrp; /* # of entries in slabs table */
507
508 /* Globals */
509 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
510 int njcl; /* # of clusters for jumbo sizes */
511 int njclbytes; /* size of a jumbo cluster */
512 unsigned char *mbutl; /* first mapped cluster address */
513 unsigned char *embutl; /* ending virtual address of mclusters */
514 int _max_linkhdr; /* largest link-level header */
515 int _max_protohdr; /* largest protocol header */
516 int max_hdr; /* largest link+protocol header */
517 int max_datalen; /* MHLEN - max_hdr */
518
519 static boolean_t mclverify; /* debug: pattern-checking */
520 static boolean_t mcltrace; /* debug: stack tracing */
521 static boolean_t mclfindleak; /* debug: leak detection */
522 static boolean_t mclexpleak; /* debug: expose leak info to user space */
523
524 static struct timeval mb_start; /* beginning of time */
525
526 /* mbuf leak detection variables */
527 static struct mleak_table mleak_table;
528 static mleak_stat_t *mleak_stat;
529
530 #define MLEAK_STAT_SIZE(n) \
531 __builtin_offsetof(mleak_stat_t, ml_trace[n])
532
/* An entry in the mleak_allocations hash table (see below). */
struct mallocation {
	mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
	u_int32_t count;        /* How many objects were requested */
	u_int64_t hitcount;     /* for determining hash effectiveness */
};
539
/* An entry in the mleak_traces hash table: one allocation backtrace. */
struct mtrace {
	u_int64_t       collisions;     /* # of hash collisions */
	u_int64_t       hitcount;       /* for determining hash effectiveness */
	u_int64_t       allocs;         /* allocations attributed to this trace */
	u_int64_t       depth;          /* # of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* allocation backtrace PCs */
};
547
548 /* Size must be a power of two for the zhash to be able to just mask off bits */
549 #define MLEAK_ALLOCATION_MAP_NUM 512
550 #define MLEAK_TRACE_MAP_NUM 256
551
552 /*
553 * Sample factor for how often to record a trace. This is overwritable
554 * by the boot-arg mleak_sample_factor.
555 */
556 #define MLEAK_SAMPLE_FACTOR 500
557
558 /*
559 * Number of top leakers recorded.
560 */
561 #define MLEAK_NUM_TRACES 5
562
563 #define MB_LEAK_SPACING_64 " "
564 #define MB_LEAK_SPACING_32 " "
565
566
567 #define MB_LEAK_HDR_32 "\n\
568 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
569 ---------- ---------- ---------- ---------- ---------- \n\
570 "
571
572 #define MB_LEAK_HDR_64 "\n\
573 trace [1] trace [2] trace [3] \
574 trace [4] trace [5] \n\
575 ------------------ ------------------ ------------------ \
576 ------------------ ------------------ \n\
577 "
578
579 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
580 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
581
582 /* Hashmaps of allocations and their corresponding traces */
583 static struct mallocation *mleak_allocations;
584 static struct mtrace *mleak_traces;
585 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
586
587 /* Lock to protect mleak tables from concurrent modification */
588 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
589 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
590 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
591
592 /* *Failed* large allocations. */
/* One recorded failed large allocation (see mtracelarge_register()). */
struct mtracelarge {
	uint64_t        size;           /* size of the failed allocation */
	uint64_t        depth;          /* # of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace of the failure */
};
598
599 #define MTRACELARGE_NUM_TRACES 5
600 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
601
602 static void mtracelarge_register(size_t size);
603
604 /* Lock to protect the completion callback table */
605 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
606 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
607
608 extern u_int32_t high_sb_max;
609
610 /* The minimum number of objects that are allocated, to start. */
611 #define MINCL 32
612 #define MINBIGCL (MINCL >> 1)
613 #define MIN16KCL (MINCL >> 2)
614
615 /* Low watermarks (only map in pages once free counts go below) */
616 #define MBIGCL_LOWAT MINBIGCL
617 #define M16KCL_LOWAT MIN16KCL
618
/*
 * Per-class allocator state; one entry per mbuf_class_t in mbuf_table[].
 * Fields are normally accessed through the m_*() macros defined below.
 */
typedef struct {
	mbuf_class_t mtbl_class;        /* class type */
	mcache_t *mtbl_cache;           /* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t *mtbl_cobjlist;    /* composite objects freelist */
	mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
	u_int32_t mtbl_maxsize;         /* maximum buffer size */
	int mtbl_minlimit;              /* minimum allowed */
	int mtbl_maxlimit;              /* maximum allowed */
	u_int32_t mtbl_wantpurge;       /* purge during next reclaim */
	uint32_t mtbl_avgtotal;         /* average total on iOS */
	u_int32_t mtbl_expand;          /* worker should expand the class */
} mbuf_table_t;
632
633 #define m_class(c) mbuf_table[c].mtbl_class
634 #define m_cache(c) mbuf_table[c].mtbl_cache
635 #define m_slablist(c) mbuf_table[c].mtbl_slablist
636 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
637 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
638 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
639 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
640 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
641 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
642 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
643 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
644 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
645 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
646 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
647 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
648 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
649 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
650 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
651 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
652 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
653 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
654 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
655 #define m_region_expand(c) mbuf_table[c].mtbl_expand
656
/*
 * Per-class table; initializers are positional and follow mbuf_table_t
 * field order, so the second-to-last value in each row is mtbl_avgtotal.
 */
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	  NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	  NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	  NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	  NULL, NULL, 0, 0, 0, 0, 200, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
};
683
684 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
685
686 #if SKYWALK
687 #define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
688 static unsigned int mc_threshold_scale_down_factor =
689 MC_THRESHOLD_SCALE_DOWN_FACTOR;
690 #endif /* SKYWALK */
691
692 static uint32_t
m_avgtotal(mbuf_class_t c)693 m_avgtotal(mbuf_class_t c)
694 {
695 #if SKYWALK
696 return if_is_fsw_transport_netagent_enabled() ?
697 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
698 mbuf_table[c].mtbl_avgtotal;
699 #else /* !SKYWALK */
700 return mbuf_table[c].mtbl_avgtotal;
701 #endif /* SKYWALK */
702 }
703
704 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
705 static int mb_waiters; /* number of waiters */
706
707 boolean_t mb_peak_newreport = FALSE;
708 boolean_t mb_peak_firstreport = FALSE;
709
710 /* generate a report by default after 1 week of uptime */
711 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
712
713 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
714 static struct timeval mb_wdtstart; /* watchdog start timestamp */
715 static char *mbuf_dump_buf;
716
717 #define MBUF_DUMP_BUF_SIZE 4096
718
719 /*
 * mbuf watchdog is enabled by default. It is also toggleable via the
721 * kern.ipc.mb_watchdog sysctl.
722 * Garbage collection is enabled by default on embedded platforms.
723 * mb_drain_maxint controls the amount of time to wait (in seconds) before
724 * consecutive calls to mbuf_drain().
725 */
726 static unsigned int mb_watchdog = 1;
727 #if !XNU_TARGET_OS_OSX
728 static unsigned int mb_drain_maxint = 60;
729 #else /* XNU_TARGET_OS_OSX */
730 static unsigned int mb_drain_maxint = 0;
731 #endif /* XNU_TARGET_OS_OSX */
732 static unsigned int mb_memory_pressure_percentage = 80;
733
734 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
735 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
736
737 /* Red zone */
738 static u_int32_t mb_redzone_cookie;
739 static void m_redzone_init(struct mbuf *);
740 static void m_redzone_verify(struct mbuf *m);
741
742 /* The following are used to serialize m_clalloc() */
743 static boolean_t mb_clalloc_busy;
744 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
745 static int mb_clalloc_waiters;
746
747 static void mbuf_mtypes_sync(boolean_t);
748 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
749 static void mbuf_stat_sync(void);
750 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
751 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
752 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
753 static char *mbuf_dump(void);
754 static void mbuf_table_init(void);
755 static inline void m_incref(struct mbuf *);
756 static inline u_int16_t m_decref(struct mbuf *);
757 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
758 static void mbuf_worker_thread_init(void);
759 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
/*
 * mcache slab-layer callbacks for the plain mbuf/cluster classes.
 */
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);

/* mcache callbacks for the composite (mbuf + cluster) classes */
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);

/* Freelist population, reclaim and worker-thread helpers */
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

/* Debug-time audit bookkeeping (only active when mclaudit != NULL) */
static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

/* mbuf leak detection/logging (gated by mclfindleak/mclexpleak) */
static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

/* Low-level slab bookkeeping primitives */
static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);
#if DEBUG || DEVELOPMENT
/* Watchdog log buffer; _mbwdog_logger records caller function/line */
#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
static char *mbwdog_logging;
const unsigned mbwdog_logging_size = 4096;
static size_t mbwdog_logging_used;
#else
#define mbwdog_logger(fmt, ...) do { } while (0)
#endif
static void mbuf_drain_locked(boolean_t);

/* flags for m_copyback0 */
#define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
#define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
#define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
#define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
833
834 /*
835 * This flag is set for all mbufs that come out of and into the composite
836 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
837 * are marked with such a flag have clusters attached to them, and will be
838 * treated differently when they are freed; instead of being placed back
839 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
840 * are placed back into the appropriate composite cache's freelist, and the
841 * actual freeing is deferred until the composite objects are purged. At
842 * such a time, this flag will be cleared from the mbufs and the objects
843 * will be freed into their own separate freelists.
844 */
845 #define EXTF_COMPOSITE 0x1
846
847 /*
848 * This flag indicates that the external cluster is read-only, i.e. it is
849 * or was referred to by more than one mbufs. Once set, this flag is never
850 * cleared.
851 */
852 #define EXTF_READONLY 0x2
853 /*
854 * This flag indicates that the external cluster is paired with the mbuf.
855 * Pairing implies an external free routine defined which will be invoked
856 * when the reference count drops to the minimum at m_free time. This
857 * flag is never cleared.
858 */
859 #define EXTF_PAIRED 0x4
860
861 #define EXTF_MASK \
862 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
863
864 #define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
865 #define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
866 #define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
867 #define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
868 #define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
869 #define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
870 #define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
871 #define MBUF_IS_COMPOSITE(m) \
872 (MEXT_REF(m) == MEXT_MINREF(m) && \
873 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
874 /*
875 * This macro can be used to test if the mbuf is paired to an external
876 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
877 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
878 * and thus survives calls to m_free_paired.
879 */
880 #define MBUF_IS_PAIRED(m) \
881 (((m)->m_flags & M_EXT) && \
882 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
883 MEXT_PMBUF(m) == (m))
884
885 /*
886 * Macros used to verify the integrity of the mbuf.
887 */
888 #define _MCHECK(m) { \
889 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
890 if (mclaudit == NULL) \
891 panic("MCHECK: m_type=%d m=%p", \
892 (u_int16_t)(m)->m_type, m); \
893 else \
894 mcl_audit_mcheck_panic(m); \
895 } \
896 }
897
898 #define MBUF_IN_MAP(addr) \
899 ((unsigned char *)(addr) >= mbutl && \
900 (unsigned char *)(addr) < embutl)
901
902 #define MRANGE(addr) { \
903 if (!MBUF_IN_MAP(addr)) \
904 panic("MRANGE: address out of range 0x%p", addr); \
905 }
906
907 /*
908 * Macro version of mtod.
909 */
910 #define MTOD(m, t) ((t)((m)->m_data))
911
912 /*
913 * Macros to obtain page index given a base cluster address
914 */
915 #define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
916 #define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
917
918 /*
919 * Macro to find the mbuf index relative to a base.
920 */
921 #define MBPAGEIDX(c, m) \
922 (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
923
924 /*
925 * Same thing for 2KB cluster index.
926 */
927 #define CLPAGEIDX(c, m) \
928 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
929
930 /*
931 * Macro to find 4KB cluster index relative to a base
932 */
933 #define BCLPAGEIDX(c, m) \
934 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
935
936 /*
937 * Macros used during mbuf and cluster initialization.
938 */
939 #define MBUF_INIT_PKTHDR(m) { \
940 (m)->m_pkthdr.rcvif = NULL; \
941 (m)->m_pkthdr.pkt_hdr = NULL; \
942 (m)->m_pkthdr.len = 0; \
943 (m)->m_pkthdr.csum_flags = 0; \
944 (m)->m_pkthdr.csum_data = 0; \
945 (m)->m_pkthdr.vlan_tag = 0; \
946 (m)->m_pkthdr.comp_gencnt = 0; \
947 (m)->m_pkthdr.pkt_crumbs = 0; \
948 m_classifier_init(m, 0); \
949 m_tag_init(m, 1); \
950 m_scratch_init(m); \
951 m_redzone_init(m); \
952 }
953
954 #define MBUF_INIT(m, pkthdr, type) { \
955 _MCHECK(m); \
956 (m)->m_next = (m)->m_nextpkt = NULL; \
957 (m)->m_len = 0; \
958 (m)->m_type = type; \
959 if ((pkthdr) == 0) { \
960 (m)->m_data = (m)->m_dat; \
961 (m)->m_flags = 0; \
962 } else { \
963 (m)->m_data = (m)->m_pktdat; \
964 (m)->m_flags = M_PKTHDR; \
965 MBUF_INIT_PKTHDR(m); \
966 } \
967 }
968
969 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
970 priv, pm) { \
971 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
972 (m)->m_flags |= M_EXT; \
973 m_set_ext((m), (rfa), (free), (arg)); \
974 (m)->m_ext.ext_size = (u_int)(size); \
975 MEXT_MINREF(m) = (min); \
976 MEXT_REF(m) = (ref); \
977 MEXT_PREF(m) = (pref); \
978 MEXT_FLAGS(m) = (flag); \
979 MEXT_PRIV(m) = (priv); \
980 MEXT_PMBUF(m) = (pm); \
981 }
982
983 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
984 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
985 ref, 0, flag, 0, NULL)
986
987 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
988 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
989 ref, 0, flag, 0, NULL)
990
991 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
992 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
993 ref, 0, flag, 0, NULL)
994
995 /*
996 * Macro to convert BSD malloc sleep flag to mcache's
997 */
998 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
999
1000 /*
1001 * The structure that holds all mbuf class statistics exportable via sysctl.
1002 * Similar to mbstat structure, the mb_stat structure is protected by the
1003 * global mbuf lock. It contains additional information about the classes
1004 * that allows for a more accurate view of the state of the allocator.
1005 */
1006 struct mb_stat *mb_stat;
1007 struct omb_stat *omb_stat; /* For backwards compatibility */
1008
1009 #define MB_STAT_SIZE(n) \
1010 __builtin_offsetof(mb_stat_t, mbs_class[n])
1011 #define OMB_STAT_SIZE(n) \
1012 __builtin_offsetof(struct omb_stat, mbs_class[n])
1013
1014 /*
1015 * The legacy structure holding all of the mbuf allocation statistics.
1016 * The actual statistics used by the kernel are stored in the mbuf_table
1017 * instead, and are updated atomically while the global mbuf lock is held.
1018 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1019 * Unlike before, the kernel no longer relies on the contents of mbstat for
1020 * its operations (e.g. cluster expansion) because the structure is exposed
1021 * to outside and could possibly be modified, therefore making it unsafe.
1022 * With the exception of the mbstat.m_mtypes array (see below), all of the
1023 * statistics are updated as they change.
1024 */
1025 struct mbstat mbstat;
1026
1027 #define MBSTAT_MTYPES_MAX \
1028 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1029
1030 /*
1031 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1032 * atomically and stored in a per-CPU structure which is lock-free; this is
1033 * done in order to avoid writing to the global mbstat data structure which
1034 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1035 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1036 * array and returned to the application. Any updates for types greater or
1037 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1038 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1039 * anything beyond that (up to type 255) is considered a corner case.
1040 */
1041 typedef struct {
1042 unsigned int cpu_mtypes[MT_MAX];
1043 } mbuf_mtypes_t;
1044
1045 static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1046
1047 #define mtype_stat_add(type, n) { \
1048 if ((unsigned)(type) < MT_MAX) { \
1049 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1050 atomic_add_32(&mbs->cpu_mtypes[type], n); \
1051 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1052 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
1053 } \
1054 }
1055
1056 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1057 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1058 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
1059
1060 static void
mbuf_mtypes_sync(boolean_t locked)1061 mbuf_mtypes_sync(boolean_t locked)
1062 {
1063 mbuf_mtypes_t mtc;
1064
1065 if (locked) {
1066 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1067 }
1068
1069 mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
1070 percpu_foreach_secondary(mtype, mbuf_mtypes) {
1071 for (int n = 0; n < MT_MAX; n++) {
1072 mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
1073 }
1074 }
1075
1076 if (!locked) {
1077 lck_mtx_lock(mbuf_mlock);
1078 }
1079 for (int n = 0; n < MT_MAX; n++) {
1080 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1081 }
1082 if (!locked) {
1083 lck_mtx_unlock(mbuf_mlock);
1084 }
1085 }
1086
/*
 * sysctl handler for the legacy mbstat: converge the per-CPU mtype
 * counters into mbstat, then copy the structure out to the caller.
 */
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	/* FALSE: mbuf_mtypes_sync() acquires mbuf_mlock itself */
	mbuf_mtypes_sync(FALSE);

	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
}
1095
1096 static void
mbuf_stat_sync(void)1097 mbuf_stat_sync(void)
1098 {
1099 mb_class_stat_t *sp;
1100 mcache_cpu_t *ccp;
1101 mcache_t *cp;
1102 int k, m, bktsize;
1103
1104 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1105
1106 for (k = 0; k < NELEM(mbuf_table); k++) {
1107 cp = m_cache(k);
1108 ccp = &cp->mc_cpu[0];
1109 bktsize = ccp->cc_bktsize;
1110 sp = mbuf_table[k].mtbl_stats;
1111
1112 if (cp->mc_flags & MCF_NOCPUCACHE) {
1113 sp->mbcl_mc_state = MCS_DISABLED;
1114 } else if (cp->mc_purge_cnt > 0) {
1115 sp->mbcl_mc_state = MCS_PURGING;
1116 } else if (bktsize == 0) {
1117 sp->mbcl_mc_state = MCS_OFFLINE;
1118 } else {
1119 sp->mbcl_mc_state = MCS_ONLINE;
1120 }
1121
1122 sp->mbcl_mc_cached = 0;
1123 for (m = 0; m < ncpu; m++) {
1124 ccp = &cp->mc_cpu[m];
1125 if (ccp->cc_objs > 0) {
1126 sp->mbcl_mc_cached += ccp->cc_objs;
1127 }
1128 if (ccp->cc_pobjs > 0) {
1129 sp->mbcl_mc_cached += ccp->cc_pobjs;
1130 }
1131 }
1132 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1133 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1134 sp->mbcl_infree;
1135
1136 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1137 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1138 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1139
1140 /* Calculate total count specific to each class */
1141 sp->mbcl_ctotal = sp->mbcl_total;
1142 switch (m_class(k)) {
1143 case MC_MBUF:
1144 /* Deduct mbufs used in composite caches */
1145 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1146 m_total(MC_MBUF_BIGCL));
1147 break;
1148
1149 case MC_CL:
1150 /* Deduct clusters used in composite cache */
1151 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1152 break;
1153
1154 case MC_BIGCL:
1155 /* Deduct clusters used in composite cache */
1156 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1157 break;
1158
1159 case MC_16KCL:
1160 /* Deduct clusters used in composite cache */
1161 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1162 break;
1163
1164 default:
1165 break;
1166 }
1167 }
1168 }
1169
/*
 * sysctl handler for the per-class mbuf statistics: refresh mb_stat under
 * mbuf_mlock and copy it out.  32-bit callers get the data converted
 * field-by-field into the legacy omb_stat layout.
 */
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		/* 32-bit caller: copy each class into the old structure */
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
			oc->mbcl_peak_reported = c->mbcl_peak_reported;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, statp, statsz);
}
1220
1221 static int
1222 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1223 {
1224 #pragma unused(oidp, arg1, arg2)
1225 int i;
1226
1227 /* Ensure leak tracing turned on */
1228 if (!mclfindleak || !mclexpleak) {
1229 return ENXIO;
1230 }
1231
1232 lck_mtx_lock(mleak_lock);
1233 mleak_update_stats();
1234 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1235 lck_mtx_unlock(mleak_lock);
1236
1237 return i;
1238 }
1239
1240 static int
1241 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1242 {
1243 #pragma unused(oidp, arg1, arg2)
1244 int i = 0;
1245
1246 /* Ensure leak tracing turned on */
1247 if (!mclfindleak || !mclexpleak) {
1248 return ENXIO;
1249 }
1250
1251 lck_mtx_lock(mleak_lock);
1252 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
1253 lck_mtx_unlock(mleak_lock);
1254
1255 return i;
1256 }
1257
1258 static inline void
m_incref(struct mbuf * m)1259 m_incref(struct mbuf *m)
1260 {
1261 UInt16 old, new;
1262 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1263
1264 do {
1265 old = *addr;
1266 new = old + 1;
1267 VERIFY(new != 0);
1268 } while (!OSCompareAndSwap16(old, new, addr));
1269
1270 /*
1271 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1272 * we don't clear the flag when the refcount goes back to the
1273 * minimum, to simplify code calling m_mclhasreference().
1274 */
1275 if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
1276 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1277 }
1278 }
1279
1280 static inline u_int16_t
m_decref(struct mbuf * m)1281 m_decref(struct mbuf *m)
1282 {
1283 UInt16 old, new;
1284 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1285
1286 do {
1287 old = *addr;
1288 new = old - 1;
1289 VERIFY(old != 0);
1290 } while (!OSCompareAndSwap16(old, new, addr));
1291
1292 return new;
1293 }
1294
/*
 * Carve the mbuf cluster map into per-class regions, set each class's
 * min/max limits, sizes and names, allocate the exported statistics
 * structures, and initialize the legacy mbstat.  Called once during
 * boot before any mbuf allocation takes place.
 */
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m, config_mbuf_jumbo = 0;

	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

#if CONFIG_MBUF_JUMBO
	config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

	if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
		/*
		 * Set aside 1/3 of the mbuf cluster map for jumbo
		 * clusters; we do this only on platforms where jumbo
		 * cluster pool is enabled.
		 */
		njcl = nmbclusters / 3;
		njclbytes = M16KCLBYTES;
	}

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size.  On these platforms, the remaining is used for 2KB
	 * and 4KB clusters.  On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
	 * cluster can either be splitted into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1428
1429 int
mbuf_get_class(struct mbuf * m)1430 mbuf_get_class(struct mbuf *m)
1431 {
1432 if (m->m_flags & M_EXT) {
1433 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
1434 m_ext_free_func_t m_free_func = m_get_ext_free(m);
1435
1436 if (m_free_func == NULL) {
1437 if (composite) {
1438 return MC_MBUF_CL;
1439 } else {
1440 return MC_CL;
1441 }
1442 } else if (m_free_func == m_bigfree) {
1443 if (composite) {
1444 return MC_MBUF_BIGCL;
1445 } else {
1446 return MC_BIGCL;
1447 }
1448 } else if (m_free_func == m_16kfree) {
1449 if (composite) {
1450 return MC_MBUF_16KCL;
1451 } else {
1452 return MC_16KCL;
1453 }
1454 }
1455 }
1456
1457 return MC_MBUF;
1458 }
1459
1460 bool
mbuf_class_under_pressure(struct mbuf * m)1461 mbuf_class_under_pressure(struct mbuf *m)
1462 {
1463 int mclass = mbuf_get_class(m);
1464
1465 if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
1466 /*
1467 * The above computation does not include the per-CPU cached objects.
1468 * As a fast-path check this is good-enough. But now we do
1469 * the "slower" count of the cached objects to know exactly the
1470 * number of active mbufs in use.
1471 *
1472 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
1473 * might be slightly off but we don't try to be 100% accurate.
1474 * At worst, we drop a packet that we shouldn't have dropped or
1475 * we might go slightly above our memory-pressure threshold.
1476 */
1477 mcache_t *cp = m_cache(mclass);
1478 mcache_cpu_t *ccp = &cp->mc_cpu[0];
1479
1480 int bktsize = os_access_once(ccp->cc_bktsize);
1481 uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
1482 uint32_t cached = 0;
1483 int i;
1484
1485 for (i = 0; i < ncpu; i++) {
1486 ccp = &cp->mc_cpu[i];
1487
1488 int cc_objs = os_access_once(ccp->cc_objs);
1489 if (cc_objs > 0) {
1490 cached += cc_objs;
1491 }
1492
1493 int cc_pobjs = os_access_once(ccp->cc_pobjs);
1494 if (cc_pobjs > 0) {
1495 cached += cc_pobjs;
1496 }
1497 }
1498 cached += (bl_total * bktsize);
1499
1500 if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
1501 os_log(OS_LOG_DEFAULT,
1502 "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
1503 __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
1504 return true;
1505 }
1506 }
1507
1508 return false;
1509 }
1510
#if defined(__LP64__)
/* Maps a physical memory ceiling to a default mbuf pool size (in bytes) */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

/*
 * Scanned in ascending order by mbuf_default_ncl(); a zero nt_mbpool
 * entry terminates the table.
 */
static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
1526
1527 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)1528 mbuf_default_ncl(uint64_t mem)
1529 {
1530 #if !defined(__LP64__)
1531 unsigned int n;
1532 /*
1533 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1534 */
1535 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
1536 n = 32768;
1537 }
1538 #else
1539 unsigned int n, i;
1540 /*
1541 * 64-bit kernel (mbuf pool size based on table).
1542 */
1543 n = ncl_table[0].nt_mbpool;
1544 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
1545 if (mem < ncl_table[i].nt_maxmem) {
1546 break;
1547 }
1548 n = ncl_table[i].nt_mbpool;
1549 }
1550 n >>= MCLSHIFT;
1551 #endif /* !__LP64__ */
1552 return n;
1553 }
1554
1555 __private_extern__ void
mbinit(void)1556 mbinit(void)
1557 {
1558 unsigned int m;
1559 unsigned int initmcl = 0;
1560 thread_t thread = THREAD_NULL;
1561
1562 microuptime(&mb_start);
1563
1564 /*
1565 * These MBUF_ values must be equal to their private counterparts.
1566 */
1567 _CASSERT(MBUF_EXT == M_EXT);
1568 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1569 _CASSERT(MBUF_EOR == M_EOR);
1570 _CASSERT(MBUF_LOOP == M_LOOP);
1571 _CASSERT(MBUF_BCAST == M_BCAST);
1572 _CASSERT(MBUF_MCAST == M_MCAST);
1573 _CASSERT(MBUF_FRAG == M_FRAG);
1574 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1575 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1576 _CASSERT(MBUF_PROMISC == M_PROMISC);
1577 _CASSERT(MBUF_HASFCS == M_HASFCS);
1578
1579 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1580 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1581 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1582 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1583 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1584 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1585 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1586 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1587 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1588 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1589 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1590 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1591 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1592 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1593 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1594
1595 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1596 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1597 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1598 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1599 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1600 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1601 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1602 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1603 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1604 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1605 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1606 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1607 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1608 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1609
1610 _CASSERT(MBUF_WAITOK == M_WAIT);
1611 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1612 _CASSERT(MBUF_COPYALL == M_COPYALL);
1613
1614 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1615 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1616 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1617 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1618 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1619 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1620 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1621 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1622 _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
1623 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1624 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1625
1626 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1627 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1628 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1629 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1630
1631 /* Module specific scratch space (32-bit alignment requirement) */
1632 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1633 sizeof(uint32_t)));
1634
1635 /* pktdata needs to start at 128-bit offset! */
1636 _CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
1637
1638 /* Initialize random red zone cookie value */
1639 _CASSERT(sizeof(mb_redzone_cookie) ==
1640 sizeof(((struct pkthdr *)0)->redzone));
1641 read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
1642 read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
1643 read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
1644 mb_obscure_extref |= 0x3;
1645 mb_obscure_extfree |= 0x3;
1646
1647 /* Make sure we don't save more than we should */
1648 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
1649
1650 if (nmbclusters == 0) {
1651 nmbclusters = NMBCLUSTERS;
1652 }
1653
1654 /* This should be a sane (at least even) value by now */
1655 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1656
1657 /* Setup the mbuf table */
1658 mbuf_table_init();
1659
1660 /*
1661 * Allocate cluster slabs table:
1662 *
1663 * maxslabgrp = (N * 2048) / (1024 * 1024)
1664 *
1665 * Where N is nmbclusters rounded up to the nearest 512. This yields
1666 * mcl_slab_g_t units, each one representing a MB of memory.
1667 */
1668 maxslabgrp =
1669 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1670 slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
1671 ZALIGN(mcl_slabg_t));
1672
1673 /*
1674 * Allocate audit structures, if needed:
1675 *
1676 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1677 *
1678 * This yields mcl_audit_t units, each one representing a page.
1679 */
1680 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
1681 mbuf_debug |= mcache_getflags();
1682 if (mbuf_debug & MCF_DEBUG) {
1683 int l;
1684 mcl_audit_t *mclad;
1685 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1686 mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
1687 ZALIGN(mcl_audit_t));
1688 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1689 mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
1690 ZALIGN_PTR);
1691 }
1692
1693 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1694 AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
1695 VERIFY(mcl_audit_con_cache != NULL);
1696 }
1697 mclverify = (mbuf_debug & MCF_VERIFY);
1698 mcltrace = (mbuf_debug & MCF_TRACE);
1699 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1700 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1701
1702 /* Enable mbuf leak logging, with a lock to protect the tables */
1703
1704 mleak_activate();
1705
1706 /*
1707 * Allocate structure for per-CPU statistics that's aligned
1708 * on the CPU cache boundary; this code assumes that we never
1709 * uninitialize this framework, since the original address
1710 * before alignment is not saved.
1711 */
1712 ncpu = ml_wait_max_cpus();
1713
1714 /* Calculate the number of pages assigned to the cluster pool */
1715 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1716 mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
1717 ZALIGN(ppnum_t));
1718
1719 /* Register with the I/O Bus mapper */
1720 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1721
1722 embutl = (mbutl + (nmbclusters * MCLBYTES));
1723 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1724
1725 /* Prime up the freelist */
1726 PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
1727 if (initmcl != 0) {
1728 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1729 if (initmcl > m_maxlimit(MC_BIGCL)) {
1730 initmcl = m_maxlimit(MC_BIGCL);
1731 }
1732 }
1733 if (initmcl < m_minlimit(MC_BIGCL)) {
1734 initmcl = m_minlimit(MC_BIGCL);
1735 }
1736
1737 lck_mtx_lock(mbuf_mlock);
1738
1739 /*
1740 * For classes with non-zero minimum limits, populate their freelists
1741 * so that m_total(class) is at least m_minlimit(class).
1742 */
1743 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1744 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1745 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1746 freelist_init(m_class(MC_CL));
1747
1748 for (m = 0; m < NELEM(mbuf_table); m++) {
1749 /* Make sure we didn't miss any */
1750 VERIFY(m_minlimit(m_class(m)) == 0 ||
1751 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1752
1753 /* populate the initial sizes and report from there on */
1754 m_peak(m_class(m)) = m_total(m_class(m));
1755 }
1756 mb_peak_newreport = FALSE;
1757
1758 lck_mtx_unlock(mbuf_mlock);
1759
1760 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1761 NULL, &thread);
1762 thread_deallocate(thread);
1763
1764 ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
1765 0, 0, MCR_SLEEP);
1766
1767 /* Create the cache for each class */
1768 for (m = 0; m < NELEM(mbuf_table); m++) {
1769 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1770 u_int32_t flags;
1771
1772 flags = mbuf_debug;
1773 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1774 m_class(m) == MC_MBUF_16KCL) {
1775 allocfunc = mbuf_cslab_alloc;
1776 freefunc = mbuf_cslab_free;
1777 auditfunc = mbuf_cslab_audit;
1778 logfunc = mleak_logger;
1779 } else {
1780 allocfunc = mbuf_slab_alloc;
1781 freefunc = mbuf_slab_free;
1782 auditfunc = mbuf_slab_audit;
1783 logfunc = mleak_logger;
1784 }
1785
1786 /*
1787 * Disable per-CPU caches for jumbo classes if there
1788 * is no jumbo cluster pool available in the system.
1789 * The cache itself is still created (but will never
1790 * be populated) since it simplifies the code.
1791 */
1792 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1793 njcl == 0) {
1794 flags |= MCF_NOCPUCACHE;
1795 }
1796
1797 if (!mclfindleak) {
1798 flags |= MCF_NOLEAKLOG;
1799 }
1800
1801 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1802 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1803 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1804 }
1805
1806 /*
1807 * Set the max limit on sb_max to be 1/16 th of the size of
1808 * memory allocated for mbuf clusters.
1809 */
1810 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1811 if (high_sb_max < sb_max) {
1812 /* sb_max is too large for this configuration, scale it down */
1813 if (high_sb_max > (1 << MBSHIFT)) {
1814 /* We have atleast 16 M of mbuf pool */
1815 sb_max = high_sb_max;
1816 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1817 /*
1818 * If we have more than 1M of mbufpool, cap the size of
1819 * max sock buf at 1M
1820 */
1821 sb_max = high_sb_max = (1 << MBSHIFT);
1822 } else {
1823 sb_max = high_sb_max;
1824 }
1825 }
1826
1827 /* allocate space for mbuf_dump_buf */
1828 mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
1829
1830 if (mbuf_debug & MCF_DEBUG) {
1831 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1832 (int)_MLEN, (int)_MHLEN);
1833 }
1834
1835 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1836 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1837 (nclusters << MCLSHIFT) >> MBSHIFT,
1838 (njcl << MCLSHIFT) >> MBSHIFT);
1839 }
1840
1841 /*
1842 * Obtain a slab of object(s) from the class's freelist.
1843 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	/* Caller must hold the global mbuf lock */
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs). For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
	    && (wait & MCR_COMP)) {
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	} else {
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	}

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return NULL;
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	/* Pop the first free chunk off the slab's freelist */
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	sp->sl_head = buf->obj_next;
	/* Increment slab reference */
	sp->sl_refcnt++;

	/* An exhausted freelist implies every chunk is now referenced */
	VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF) {
			mca->mca_uflags |= MB_SCVALID;
		}
	}

	/* Per-class free-count bookkeeping and slab-state sanity checks */
	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have NBCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
		    sp->sl_len == PAGE_SIZE &&
		    (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL) {
			_MCHECK((struct mbuf *)buf);
		}
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
		VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
		slab_remove(sp, class);
	}

	return buf;
}
1976
1977 /*
1978 * Place a slab of object(s) back into a class's slab list.
1979 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;
	boolean_t reinit_supercl = false;
	mbuf_class_t super_class;

	/* Caller must hold the global mbuf lock */
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);

	/*
	 * Synchronizing with m_clalloc, as it reads m_total, while we here
	 * are modifying m_total.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		/* msleep re-acquires mbuf_mlock before returning */
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;

	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	/* Per-class sanity checks on the post-decrement slab state */
	if (class == MC_CL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A slab that has been splitted for 2KB clusters can have
		 * at most 1 outstanding reference at this point.
		 */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));

		/* A 4KB cluster slab can have NBCLPG references at most */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
		VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * A slab that has been splitted for mbufs has at most
		 * NMBPG reference counts. Since we have decremented
		 * one reference above, it must now be between 0 and
		 * NMBPG-1.
		 */
		VERIFY(class == MC_MBUF);
		VERIFY(sp->sl_refcnt >= 0 &&
		    sp->sl_refcnt <= (NMBPG - 1) &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern. Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		if (mclverify) {
			mcache_audit_free_verify(mca, buf, 0,
			    m_maxsize(class));
		}
		mca->mca_uflags &= ~MB_SCVALID;
	}

	/* Push the buffer back onto the slab's freelist */
	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/*
	 * If a slab has been split to either one which holds 2KB clusters,
	 * or one which holds mbufs, turn it back to one which holds a
	 * 4 or 16 KB cluster depending on the page size.
	 */
	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
		super_class = MC_BIGCL;
	} else {
		VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
		super_class = MC_16KCL;
	}
	/*
	 * Reclaim is only attempted when the whole page is free
	 * (sl_refcnt == 0), the class stays at/above its minimum, and
	 * the super class has room to grow.
	 */
	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NMBPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NMBPG;

		m_total(MC_MBUF) -= NMBPG;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPG;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPG));

		/* Unlink every mbuf chunk from the slab's freelist */
		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NCLPG;

		m_total(MC_CL) -= NCLPG;
		mbstat.m_clusters = m_total(MC_CL);
		m_infree(MC_CL) -= NCLPG;

		/* Unlink every 2KB cluster chunk from the slab's freelist */
		while (i--) {
			union mcluster *c = sp->sl_head;
			VERIFY(c != NULL);
			sp->sl_head = c->mcl_next;
			c->mcl_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
	    sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NBCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NBCLPG;

		VERIFY(super_class == MC_16KCL);
		m_total(MC_BIGCL) -= NBCLPG;
		mbstat.m_bigclusters = m_total(MC_BIGCL);
		m_infree(MC_BIGCL) -= NBCLPG;

		/* Unlink every 4KB cluster chunk from the slab's freelist */
		while (i--) {
			union mbigcluster *bc = sp->sl_head;
			VERIFY(bc != NULL);
			sp->sl_head = bc->mbc_next;
			bc->mbc_next = NULL;
		}
		reinit_supercl = true;
	}

	if (reinit_supercl) {
		VERIFY(sp->sl_head == NULL);
		VERIFY(m_total(class) >= m_minlimit(class));
		slab_remove(sp, class);

		/* Reinitialize it as a cluster for the super class */
		m_total(super_class)++;
		m_infree(super_class)++;
		VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
		    sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);

		slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
		    sp->sl_base, PAGE_SIZE, 0, 1);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_base, sp->sl_len);
		}
		((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
		}

		VERIFY(slab_is_detached(sp));
		VERIFY(m_total(super_class) <= m_maxlimit(super_class));

		/* And finally switch class */
		class = super_class;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp)) {
		slab_insert(sp, class);
	}

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
}
2206
2207 /*
2208 * Common allocator for rudimentary objects called by the CPU cache layer
2209 * during an allocation request whenever there is no available element in the
2210 * bucket layer. It returns one or more elements from the appropriate global
2211 * freelist. If the freelist is empty, it will attempt to populate it and
2212 * retry the allocation.
2213 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	ASSERT(need > 0);

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			/* Link the new element and advance the tail pointer */
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < (m_total(class) >> 5)) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			/* Freelist is empty; try to grow it */
			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0) {
				continue;
			}

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait)) {
				break;
			}

			/* watchdog checkpoint */
			mbuf_watchdog();

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					mbstat.m_drops++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available. Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait)) {
				break;
			}

			LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	/* Account for what we actually handed out; return that count */
	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return num - need;
}
2293
2294 /*
2295 * Common de-allocator for rudimentary objects called by the CPU cache
2296 * layer when one or more elements need to be returned to the appropriate
2297 * global freelist.
2298 */
2299 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)2300 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2301 {
2302 mbuf_class_t class = (mbuf_class_t)arg;
2303 mcache_obj_t *nlist;
2304 unsigned int num = 0;
2305 int w;
2306
2307 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2308
2309 lck_mtx_lock(mbuf_mlock);
2310
2311 for (;;) {
2312 nlist = list->obj_next;
2313 list->obj_next = NULL;
2314 slab_free(class, list);
2315 ++num;
2316 if ((list = nlist) == NULL) {
2317 break;
2318 }
2319 }
2320 m_free_cnt(class) += num;
2321
2322 if ((w = mb_waiters) > 0) {
2323 mb_waiters = 0;
2324 }
2325 if (w) {
2326 mbwdog_logger("waking up all threads");
2327 }
2328 lck_mtx_unlock(mbuf_mlock);
2329
2330 if (w != 0) {
2331 wakeup(mb_waitchan);
2332 }
2333 }
2334
2335 /*
2336 * Common auditor for rudimentary objects called by the CPU cache layer
2337 * during an allocation or free request. For the former, this is called
2338 * after the objects are obtained from either the bucket or slab layer
2339 * and before they are returned to the caller. For the latter, this is
2340 * called immediately during free and before placing the objects into
2341 * the bucket or slab layer.
2342 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	/* The lock is taken and released per object to bound hold time */
	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		if (mcltrace) {
			mcache_buffer_log(mca, list, m_cache(class), &mb_start);
		}

		/* Track whether the object is currently handed out */
		if (alloc) {
			mca->mca_uflags |= MB_INUSE;
		} else {
			mca->mca_uflags &= ~MB_INUSE;
		}
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
2381
2382 /*
2383 * Common notify routine for all caches. It is called by mcache when
2384 * one or more objects get freed. We use this indication to trigger
2385 * the wakeup of any sleeping threads so that they can retry their
2386 * allocation requests.
2387 */
2388 static void
mbuf_slab_notify(void * arg,u_int32_t reason)2389 mbuf_slab_notify(void *arg, u_int32_t reason)
2390 {
2391 mbuf_class_t class = (mbuf_class_t)arg;
2392 int w;
2393
2394 ASSERT(MBUF_CLASS_VALID(class));
2395
2396 if (reason != MCN_RETRYALLOC) {
2397 return;
2398 }
2399
2400 lck_mtx_lock(mbuf_mlock);
2401 if ((w = mb_waiters) > 0) {
2402 m_notified(class)++;
2403 mb_waiters = 0;
2404 }
2405 if (w) {
2406 mbwdog_logger("waking up all threads");
2407 }
2408 lck_mtx_unlock(mbuf_mlock);
2409
2410 if (w != 0) {
2411 wakeup(mb_waitchan);
2412 }
2413 }
2414
2415 /*
2416 * Obtain object(s) from the composite class's freelist.
2417 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(need > 0);
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		MRANGE(*list);

		/* Each composite element is an mbuf with an attached cluster */
		m = (struct mbuf *)*list;
		sp = slab_get(m);
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

		/* The cluster's slab reference count must be in range */
		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		/* A 16KB cluster spans NSLABSP16KB slabs; check them all */
		if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		/* Detach from the freelist and advance the output tail */
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0) {
			break;
		}
	}
	/* Account for what was taken; return that count */
	m_infree(class) -= (num - need);

	return num - need;
}
2476
2477 /*
2478 * Place object(s) back into a composite class's freelist.
2479 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;
	mbuf_class_t cl_class;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Map the composite class to its underlying cluster class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			if (mclverify) {
				mcache_audit_free_verify(mca, m, 0,
				    m_maxsize(MC_MBUF));
			}
			/* With auditing, the valid fields live in the shadow */
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclverify) {
			size_t size = m_maxsize(cl_class);
			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (cl_class == MC_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}
		if (cl_class == MC_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL) {
				mcl_audit_restore_mbuf(m, mca, TRUE);
			}

			/* Reset all external-storage reference state */
			MEXT_MINREF(m) = 0;
			MEXT_REF(m) = 0;
			MEXT_PREF(m) = 0;
			MEXT_FLAGS(m) = 0;
			MEXT_PRIV(m) = 0;
			MEXT_PMBUF(m) = NULL;
			MEXT_TOKEN(m) = 0;

			/* Detach the ref structure; batch-freed at the end */
			rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
			m_set_ext(m, NULL, NULL, NULL);
			rfa->obj_next = ref_list;
			ref_list = rfa;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL) {
				mcl_audit_mbuf(mca, o, FALSE, FALSE);
			}

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL) {
				slab_free(MC_CL, cl);
			} else if (class == MC_MBUF_BIGCL) {
				slab_free(MC_BIGCL, cl);
			} else {
				slab_free(MC_16KCL, cl);
			}
		}

		++num;
		tail = o;
		o = nexto;
	}

	/* Non-purged objects go back onto the composite freelist intact */
	if (!purged) {
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return num;
}
2615
2616 /*
2617 * Common allocator for composite objects called by the CPU cache layer
2618 * during an allocation request whenever there is no available element in
2619 * the bucket layer. It returns one or more composite elements from the
2620 * appropriate global freelist. If the freelist is empty, it will attempt
2621 * to obtain the rudimentary objects from their caches and construct them
2622 * into composite mbuf + cluster objects.
2623 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mbuf_class_t cl_class = 0;
	unsigned int num = 0, cnum = 0, want = needed;
	mcache_obj_t *ref_list = NULL;
	mcache_obj_t *mp_list = NULL;
	mcache_obj_t *clp_list = NULL;
	mcache_obj_t **list;
	struct ext_ref *rfa;
	struct mbuf *m;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	ASSERT(needed > 0);

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);

	/* There should not be any slab for this class */
	VERIFY(m_slab_cnt(class) == 0 &&
	    m_slablist(class).tqh_first == NULL &&
	    m_slablist(class).tqh_last == NULL);

	lck_mtx_lock(mbuf_mlock);

	/* Try using the freelist first */
	num = cslab_alloc(class, plist, needed);
	list = *plist;
	if (num == needed) {
		m_alloc_cnt(class) += num;
		lck_mtx_unlock(mbuf_mlock);
		return needed;
	}

	lck_mtx_unlock(mbuf_mlock);

	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */
	needed -= num;

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
	wait |= MCR_COMP;
	if (!(wait & MCR_NOSLEEP)) {
		wait |= MCR_FAILOK;
	}

	/* allocate mbufs */
	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
	if (needed == 0) {
		ASSERT(mp_list == NULL);
		goto fail;
	}

	/* allocate clusters */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}
	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
	if (needed == 0) {
		ASSERT(clp_list == NULL);
		goto fail;
	}

	/* allocate ext_ref structures to pair mbufs with clusters */
	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
	if (needed == 0) {
		ASSERT(ref_list == NULL);
		goto fail;
	}

	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
		struct mbuf *ms;

		/* Take one element off each of the three lists */
		m = ms = (struct mbuf *)mp_list;
		mp_list = mp_list->obj_next;

		cl = clp_list;
		clp_list = clp_list->obj_next;
		((mcache_obj_t *)cl)->obj_next = NULL;

		rfa = (struct ext_ref *)ref_list;
		ref_list = ref_list->obj_next;
		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;

		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;

			lck_mtx_lock(mbuf_mlock);
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			ms = MCA_SAVED_MBUF_PTR(mca);
			cl_mca = mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl);

			/*
			 * Pair them up. Note that this is done at the time
			 * the mbuf+cluster objects are constructed. This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster. In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;

			ASSERT(mca->mca_uflags & MB_SCVALID);
			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
			lck_mtx_unlock(mbuf_mlock);

			/* Technically, they are in the freelist */
			if (mclverify) {
				size_t size;

				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
				    m_maxsize(MC_MBUF));

				if (class == MC_MBUF_CL) {
					size = m_maxsize(MC_CL);
				} else if (class == MC_MBUF_BIGCL) {
					size = m_maxsize(MC_BIGCL);
				} else {
					size = m_maxsize(MC_16KCL);
				}

				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
				    size);
			}
		}

		/* Construct the composite mbuf + cluster object */
		MBUF_INIT(ms, 0, MT_FREE);
		if (class == MC_MBUF_16KCL) {
			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else if (class == MC_MBUF_BIGCL) {
			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else {
			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		}
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));

		/* Append the constructed object to the caller's list */
		*list = (mcache_obj_t *)m;
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;
	}

fail:
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (clp_list != NULL) {
		mcache_free_ext(m_cache(cl_class), clp_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	/* Update counters; cnum newly-constructed objects joined the class */
	lck_mtx_lock(mbuf_mlock);
	if (num > 0 || cnum > 0) {
		m_total(class) += cnum;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_alloc_cnt(class) += num + cnum;
	}
	if ((num + cnum) < want) {
		m_fail_cnt(class) += (want - (num + cnum));
	}
	lck_mtx_unlock(mbuf_mlock);

	return num + cnum;
}
2820
2821 /*
2822 * Common de-allocator for composite objects called by the CPU cache
2823 * layer when one or more elements need to be returned to the appropriate
2824 * global freelist.
2825 */
2826 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)2827 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2828 {
2829 mbuf_class_t class = (mbuf_class_t)arg;
2830 unsigned int num;
2831 int w;
2832
2833 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2834
2835 lck_mtx_lock(mbuf_mlock);
2836
2837 num = cslab_free(class, list, purged);
2838 m_free_cnt(class) += num;
2839
2840 if ((w = mb_waiters) > 0) {
2841 mb_waiters = 0;
2842 }
2843 if (w) {
2844 mbwdog_logger("waking up all threads");
2845 }
2846
2847 lck_mtx_unlock(mbuf_mlock);
2848
2849 if (w != 0) {
2850 wakeup(mb_waitchan);
2851 }
2852 }
2853
2854 /*
2855 * Common auditor for composite objects called by the CPU cache layer
2856 * during an allocation or free request. For the former, this is called
2857 * after the objects are obtained from either the bucket or slab layer
2858 * and before they are returned to the caller. For the latter, this is
2859 * called immediately during free and before placing the objects into
2860 * the bucket or slab layer.
2861 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
	mcache_audit_t *mca;
	struct mbuf *m, *ms;
	mcl_slab_t *clsp, *nsp;
	size_t cl_size;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	/* Map the composite class to its constituent cluster class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		cl_class = MC_16KCL;
	}
	cl_size = m_maxsize(cl_class);

	/*
	 * Walk the list, auditing each mbuf half and cluster half of the
	 * composite object.  The lock is taken/dropped per element so the
	 * audit of a long list doesn't hold mbuf_mlock for its duration.
	 */
	while ((m = ms = (struct mbuf *)list) != NULL) {
		lck_mtx_lock(mbuf_mlock);
		/* Do the mbuf sanity checks and record its transaction */
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		mcl_audit_mbuf(mca, m, TRUE, alloc);
		if (mcltrace) {
			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
		}

		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}

		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf has been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
		if (!alloc && mclverify) {
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks and record its transaction */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(ms->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		if (class == MC_MBUF_16KCL) {
			int k;
			/* A 16K cluster spans NSLABSP16KB chained slabs */
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}


		mca = mcl_audit_buf2mca(cl_class, cl);
		mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
		if (mcltrace) {
			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
		}

		if (alloc) {
			mca->mca_uflags |= MB_COMP_INUSE;
		} else {
			mca->mca_uflags &= ~MB_COMP_INUSE;
		}
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
2946
2947 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)2948 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2949 uint64_t alloc_size, kern_return_t error)
2950 {
2951 *cnt = *cnt + 1;
2952 *ts = net_uptime();
2953 if (size) {
2954 *size = alloc_size;
2955 }
2956 switch (error) {
2957 case KERN_SUCCESS:
2958 break;
2959 case KERN_INVALID_ARGUMENT:
2960 mb_kmem_stats[0]++;
2961 break;
2962 case KERN_INVALID_ADDRESS:
2963 mb_kmem_stats[1]++;
2964 break;
2965 case KERN_RESOURCE_SHORTAGE:
2966 mb_kmem_stats[2]++;
2967 break;
2968 case KERN_NO_SPACE:
2969 mb_kmem_stats[3]++;
2970 break;
2971 case KERN_FAILURE:
2972 mb_kmem_stats[4]++;
2973 break;
2974 default:
2975 mb_kmem_stats[5]++;
2976 break;
2977 }
2978 }
2979
2980 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)2981 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
2982 {
2983 vm_offset_t addr = 0;
2984 kern_return_t kr = KERN_SUCCESS;
2985
2986 if (!physContig) {
2987 kr = kmem_alloc(mbmap, &addr, size,
2988 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2989 } else {
2990 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
2991 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2992 }
2993
2994 if (kr != KERN_SUCCESS) {
2995 addr = 0;
2996 }
2997 if (err) {
2998 *err = kr;
2999 }
3000
3001 return addr;
3002 }
3003
3004 /*
3005 * Allocate some number of mbuf clusters and place on cluster freelist.
3006 */
static int
m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
{
	int i, count = 0;
	vm_size_t size = 0;
	int numpages = 0, large_buffer;
	vm_offset_t page = 0;
	mcache_audit_t *mca_list = NULL;
	mcache_obj_t *con_list = NULL;
	mcl_slab_t *sp;
	mbuf_class_t class;
	kern_return_t error;

	/* Set if a buffer allocation needs allocation of multiple pages */
	large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
	    PAGE_SIZE < M16KCLBYTES);
	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	VERIFY((bufsize == PAGE_SIZE) ||
	    (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));

	/* Only 4K and 16K cluster classes are grown here */
	if (bufsize == m_size(MC_BIGCL)) {
		class = MC_BIGCL;
	} else {
		class = MC_16KCL;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Multiple threads may attempt to populate the cluster map one
	 * after another. Since we drop the lock below prior to acquiring
	 * the physical page(s), our view of the cluster map may no longer
	 * be accurate, and we could end up over-committing the pages beyond
	 * the maximum allowed for each class. To prevent it, this entire
	 * operation (including the page mapping) is serialized.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;

	/*
	 * Honor the caller's wish to block or not block. We have a way
	 * to grow the pool asynchronously using the mbuf worker thread.
	 */
	i = m_howmany(num, bufsize);
	if (i <= 0 || (wait & M_DONTWAIT)) {
		goto out;
	}

	/* Drop the lock across the (possibly blocking) page allocation */
	lck_mtx_unlock(mbuf_mlock);

	size = round_page(i * bufsize);
	page = kmem_mb_alloc(mb_map, size, large_buffer, &error);

	/*
	 * If we did ask for "n" 16KB physically contiguous chunks
	 * and didn't get them, then please try again without this
	 * restriction.
	 */
	net_update_uptime();
	if (large_buffer && page == 0) {
		m_vm_error_stats(&mb_kmem_contig_failed,
		    &mb_kmem_contig_failed_ts,
		    &mb_kmem_contig_failed_size,
		    size, error);
		page = kmem_mb_alloc(mb_map, size, 0, &error);
	}

	if (page == 0) {
		m_vm_error_stats(&mb_kmem_failed,
		    &mb_kmem_failed_ts,
		    &mb_kmem_failed_size,
		    size, error);
#if PAGE_SIZE == 4096
		if (bufsize == m_maxsize(MC_BIGCL)) {
#else
		if (bufsize >= m_maxsize(MC_BIGCL)) {
#endif
			/* Try for 1 page if failed */
			size = PAGE_SIZE;
			page = kmem_mb_alloc(mb_map, size, 0, &error);
			if (page == 0) {
				m_vm_error_stats(&mb_kmem_one_failed,
				    &mb_kmem_one_failed_ts,
				    NULL, size, error);
			}
		}

		if (page == 0) {
			/* Out of memory entirely; reacquire lock and bail */
			lck_mtx_lock(mbuf_mlock);
			goto out;
		}
	}

	VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
	numpages = size / PAGE_SIZE;

	/* If auditing is enabled, allocate the audit structures now */
	if (mclaudit != NULL) {
		int needed;

		/*
		 * Yes, I realize this is a waste of memory for clusters
		 * that never get transformed into mbufs, as we may end
		 * up with NMBPG-1 unused audit structures per cluster.
		 * But doing so tremendously simplifies the allocation
		 * strategy, since at this point we are not holding the
		 * mbuf lock and the caller is okay to be blocked.
		 */
		if (bufsize == PAGE_SIZE) {
			needed = numpages * NMBPG;

			i = mcache_alloc_ext(mcl_audit_con_cache,
			    &con_list, needed, MCR_SLEEP);

			VERIFY(con_list != NULL && i == needed);
		} else {
			/*
			 * if multiple 4K pages are being used for a
			 * 16K cluster
			 */
			needed = numpages / NSLABSP16KB;
		}

		i = mcache_alloc_ext(mcache_audit_cache,
		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);

		VERIFY(mca_list != NULL && i == needed);
	}

	lck_mtx_lock(mbuf_mlock);

	/* Carve each acquired page into slabs for the target class */
	for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
		ppnum_t offset =
		    ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
		ppnum_t new_page = pmap_find_phys(kernel_pmap, page);

		/*
		 * If there is a mapper the appropriate I/O page is
		 * returned; zero out the page to discard its past
		 * contents to prevent exposing leftover kernel memory.
		 */
		VERIFY(offset < mcl_pages);
		if (mcl_paddr_base != 0) {
			bzero((void *)(uintptr_t) page, PAGE_SIZE);
			new_page = IOMapperInsertPage(mcl_paddr_base,
			    offset, new_page);
		}
		mcl_paddr[offset] = new_page;

		/* Pattern-fill this fresh page */
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)page, PAGE_SIZE);
		}
		if (bufsize == PAGE_SIZE) {
			mcache_obj_t *buf;
			/* One for the entire page */
			sp = slab_get((void *)page);
			if (mclaudit != NULL) {
				mcl_audit_init((void *)page,
				    &mca_list, &con_list,
				    AUDIT_CONTENTS_SIZE, NMBPG);
			}
			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
			slab_init(sp, class, SLF_MAPPED, (void *)page,
			    (void *)page, PAGE_SIZE, 0, 1);
			buf = (mcache_obj_t *)page;
			buf->obj_next = NULL;

			/* Insert this slab */
			slab_insert(sp, class);

			/* Update stats now since slab_get drops the lock */
			++m_infree(class);
			++m_total(class);
			VERIFY(m_total(class) <= m_maxlimit(class));
			if (class == MC_BIGCL) {
				mbstat.m_bigclfree = m_infree(MC_BIGCL) +
				    m_infree(MC_MBUF_BIGCL);
				mbstat.m_bigclusters = m_total(MC_BIGCL);
			}
			++count;
		} else if ((bufsize > PAGE_SIZE) &&
		    (i % NSLABSP16KB) == 0) {
			union m16kcluster *m16kcl = (union m16kcluster *)page;
			mcl_slab_t *nsp;
			int k;

			/* One for the entire 16KB */
			sp = slab_get(m16kcl);
			if (mclaudit != NULL) {
				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
			}

			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
			slab_init(sp, MC_16KCL, SLF_MAPPED,
			    m16kcl, m16kcl, bufsize, 0, 1);
			m16kcl->m16kcl_next = NULL;

			/*
			 * 2nd-Nth page's slab is part of the first one,
			 * where N is NSLABSP16KB.
			 */
			for (k = 1; k < NSLABSP16KB; k++) {
				nsp = slab_get(((union mbigcluster *)page) + k);
				VERIFY(nsp->sl_refcnt == 0 &&
				    nsp->sl_flags == 0);
				slab_init(nsp, MC_16KCL,
				    SLF_MAPPED | SLF_PARTIAL,
				    m16kcl, NULL, 0, 0, 0);
			}
			/* Insert this slab */
			slab_insert(sp, MC_16KCL);

			/* Update stats now since slab_get drops the lock */
			++m_infree(MC_16KCL);
			++m_total(MC_16KCL);
			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
			++count;
		}
	}
	/* All pre-allocated audit structures must have been consumed */
	VERIFY(mca_list == NULL && con_list == NULL);

	if (!mb_peak_newreport && mbuf_report_usage(class)) {
		mb_peak_newreport = TRUE;
	}

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}

	return count;
out:
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	mtracelarge_register(size);

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}

	/*
	 * When non-blocking we kick a thread if we have to grow the
	 * pool or if the number of free clusters is less than requested.
	 * Note: here `i' still holds the m_howmany() demand computed above.
	 */
	if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
		mbwdog_logger("waking up the worker thread to to grow %s by %d",
		    m_cname(class), i);
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	if (class == MC_BIGCL) {
		if (i > 0) {
			/*
			 * Remember total number of 4KB clusters needed
			 * at this time.
			 */
			i += m_total(MC_BIGCL);
			if (i > m_region_expand(MC_BIGCL)) {
				m_region_expand(MC_BIGCL) = i;
			}
		}
		if (m_infree(MC_BIGCL) >= num) {
			return 1;
		}
	} else {
		if (i > 0) {
			/*
			 * Remember total number of 16KB clusters needed
			 * at this time.
			 */
			i += m_total(MC_16KCL);
			if (i > m_region_expand(MC_16KCL)) {
				m_region_expand(MC_16KCL) = i;
			}
		}
		if (m_infree(MC_16KCL) >= num) {
			return 1;
		}
	}
	return 0;
}
3304
3305 /*
3306 * Populate the global freelist of the corresponding buffer class.
3307 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
	mcache_obj_t *o = NULL;
	int i, numpages = 0, count;
	mbuf_class_t super_class;

	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
	    class == MC_16KCL);

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
	    PAGE_SIZE == m_maxsize(MC_16KCL));

	/* Page-sized (or larger) classes are grown directly via m_clalloc */
	if (m_maxsize(class) >= PAGE_SIZE) {
		return m_clalloc(num, wait, m_maxsize(class)) != 0;
	}

	/*
	 * The rest of the function will allocate pages and will slice
	 * them up into the right size
	 */

	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;

	/* Currently assume that pages are 4K or 16K */
	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
		super_class = MC_BIGCL;
	} else {
		super_class = MC_16KCL;
	}

	i = m_clalloc(numpages, wait, m_maxsize(super_class));

	/* how many objects will we cut the page into? */
	int numobj = PAGE_SIZE / m_maxsize(class);

	for (count = 0; count < numpages; count++) {
		/* respect totals, minlimit, maxlimit */
		if (m_total(super_class) <= m_minlimit(super_class) ||
		    m_total(class) >= m_maxlimit(class)) {
			break;
		}

		/* Take one page-sized object from the superclass freelist */
		if ((o = slab_alloc(super_class, wait)) == NULL) {
			break;
		}

		struct mbuf *m = (struct mbuf *)o;
		union mcluster *c = (union mcluster *)o;
		union mbigcluster *mbc = (union mbigcluster *)o;
		mcl_slab_t *sp = slab_get(o);
		mcache_audit_t *mca = NULL;

		/*
		 * since one full page will be converted to MC_MBUF or
		 * MC_CL, verify that the reference count will match that
		 * assumption
		 */
		VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
		VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
		/*
		 * Make sure that the cluster is unmolested
		 * while in freelist
		 */
		if (mclverify) {
			mca = mcl_audit_buf2mca(super_class,
			    (mcache_obj_t *)o);
			mcache_audit_free_verify(mca,
			    (mcache_obj_t *)o, 0, m_maxsize(super_class));
		}

		/* Reinitialize it as an mbuf or 2K or 4K slab */
		slab_init(sp, class, sp->sl_flags,
		    sp->sl_base, NULL, PAGE_SIZE, 0, numobj);

		VERIFY(sp->sl_head == NULL);

		/* The page moves from super_class's books to class's */
		VERIFY(m_total(super_class) >= 1);
		m_total(super_class)--;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
		}

		m_total(class) += numobj;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_infree(class) += numobj;

		if (!mb_peak_newreport && mbuf_report_usage(class)) {
			mb_peak_newreport = TRUE;
		}

		/* Thread the numobj carved objects onto the slab freelist */
		i = numobj;
		if (class == MC_MBUF) {
			mbstat.m_mbufs = m_total(MC_MBUF);
			mtype_stat_add(MT_FREE, NMBPG);
			while (i--) {
				/*
				 * If auditing is enabled, construct the
				 * shadow mbuf in the audit structure
				 * instead of the actual one.
				 * mbuf_slab_audit() will take care of
				 * restoring the contents after the
				 * integrity check.
				 */
				if (mclaudit != NULL) {
					struct mbuf *ms;
					mca = mcl_audit_buf2mca(MC_MBUF,
					    (mcache_obj_t *)m);
					ms = MCA_SAVED_MBUF_PTR(mca);
					ms->m_type = MT_FREE;
				} else {
					m->m_type = MT_FREE;
				}
				m->m_next = sp->sl_head;
				sp->sl_head = (void *)m++;
			}
		} else if (class == MC_CL) { /* MC_CL */
			mbstat.m_clfree =
			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
			mbstat.m_clusters = m_total(MC_CL);
			while (i--) {
				c->mcl_next = sp->sl_head;
				sp->sl_head = (void *)c++;
			}
		} else {
			VERIFY(class == MC_BIGCL);
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
			while (i--) {
				mbc->mbc_next = sp->sl_head;
				sp->sl_head = (void *)mbc++;
			}
		}

		/* Insert into the mbuf or 2k or 4k slab list */
		slab_insert(sp, class);

		/* New objects are available; wake up any sleepers */
		if ((i = mb_waiters) > 0) {
			mb_waiters = 0;
		}
		if (i != 0) {
			mbwdog_logger("waking up all threads");
			wakeup(mb_waitchan);
		}
	}
	return count != 0;
}
3459
3460 /*
3461 * For each class, initialize the freelist to hold m_minlimit() objects.
3462 */
static void
freelist_init(mbuf_class_t class)
{
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Only the 2K and 4K cluster classes are pre-populated here */
	VERIFY(class == MC_CL || class == MC_BIGCL);
	VERIFY(m_total(class) == 0);
	VERIFY(m_minlimit(class) > 0);

	/* Loop because freelist_populate() may make only partial progress */
	while (m_total(class) < m_minlimit(class)) {
		(void) freelist_populate(class, m_minlimit(class), M_WAIT);
	}

	VERIFY(m_total(class) >= m_minlimit(class));
}
3478
3479 /*
3480 * (Inaccurately) check if it might be worth a trip back to the
3481 * mcache layer due the availability of objects there. We'll
3482 * end up back here if there's nothing up there.
3483 */
3484 static boolean_t
3485 mbuf_cached_above(mbuf_class_t class, int wait)
3486 {
3487 switch (class) {
3488 case MC_MBUF:
3489 if (wait & MCR_COMP) {
3490 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3491 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3492 }
3493 break;
3494
3495 case MC_CL:
3496 if (wait & MCR_COMP) {
3497 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
3498 }
3499 break;
3500
3501 case MC_BIGCL:
3502 if (wait & MCR_COMP) {
3503 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
3504 }
3505 break;
3506
3507 case MC_16KCL:
3508 if (wait & MCR_COMP) {
3509 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
3510 }
3511 break;
3512
3513 case MC_MBUF_CL:
3514 case MC_MBUF_BIGCL:
3515 case MC_MBUF_16KCL:
3516 break;
3517
3518 default:
3519 VERIFY(0);
3520 /* NOTREACHED */
3521 }
3522
3523 return !mcache_bkt_isempty(m_cache(class));
3524 }
3525
3526 /*
3527 * If possible, convert constructed objects to raw ones.
3528 */
3529 static boolean_t
3530 mbuf_steal(mbuf_class_t class, unsigned int num)
3531 {
3532 mcache_obj_t *top = NULL;
3533 mcache_obj_t **list = ⊤
3534 unsigned int tot = 0;
3535
3536 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3537
3538 switch (class) {
3539 case MC_MBUF:
3540 case MC_CL:
3541 case MC_BIGCL:
3542 case MC_16KCL:
3543 return FALSE;
3544
3545 case MC_MBUF_CL:
3546 case MC_MBUF_BIGCL:
3547 case MC_MBUF_16KCL:
3548 /* Get the required number of constructed objects if possible */
3549 if (m_infree(class) > m_minlimit(class)) {
3550 tot = cslab_alloc(class, &list,
3551 MIN(num, m_infree(class)));
3552 }
3553
3554 /* And destroy them to get back the raw objects */
3555 if (top != NULL) {
3556 (void) cslab_free(class, top, 1);
3557 }
3558 break;
3559
3560 default:
3561 VERIFY(0);
3562 /* NOTREACHED */
3563 }
3564
3565 return tot == num;
3566 }
3567
/*
 * Reclaim resources on behalf of @class: mark related classes as purge
 * candidates, try to steal constructed objects back from their
 * freelists, and as a last resort purge the per-CPU mcache layer.
 * Called with the mbuf lock held; the lock is dropped and reacquired
 * internally around the purge/reap calls.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		/* When !comp, the caller's own composite class is fair game */
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class. If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes. Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < NELEM(mbuf_table); m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	/* Reacquire the lock before returning, per the entry contract */
	lck_mtx_lock(mbuf_mlock);
}
3667
3668 static inline struct mbuf *
3669 m_get_common(int wait, short type, int hdr)
3670 {
3671 struct mbuf *m;
3672 int mcflags = MSLEEPF(wait);
3673
3674 /* Is this due to a non-blocking retry? If so, then try harder */
3675 if (mcflags & MCR_NOSLEEP) {
3676 mcflags |= MCR_TRYHARD;
3677 }
3678
3679 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3680 if (m != NULL) {
3681 MBUF_INIT(m, hdr, type);
3682 mtype_stat_inc(type);
3683 mtype_stat_dec(MT_FREE);
3684 }
3685 return m;
3686 }
3687
3688 /*
3689 * Space allocation routines; these are also available as macros
3690 * for critical paths.
3691 */
/*
 * _M_GET/_M_GETHDR allocate a plain or packet-header mbuf; the RETRY
 * variants are aliases kept for the legacy m_retry()/m_retryhdr() API,
 * and _MGET/_MGETHDR are assignment-style conveniences.
 */
#define _M_GET(wait, type) m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
#define _M_RETRY(wait, type) _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type) ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3698
struct mbuf *
m_get(int wait, int type)
{
	/* Plain mbuf, no packet header */
	return m_get_common(wait, type, 0);
}
3704
struct mbuf *
m_gethdr(int wait, int type)
{
	/* Mbuf with a packet header */
	return m_get_common(wait, type, 1);
}
3710
struct mbuf *
m_retry(int wait, int type)
{
	/* Legacy retry entry point; identical to m_get() */
	return m_get_common(wait, type, 0);
}
3716
struct mbuf *
m_retryhdr(int wait, int type)
{
	/* Legacy retry entry point; identical to m_gethdr() */
	return m_get_common(wait, type, 1);
}
3722
3723 struct mbuf *
3724 m_getclr(int wait, int type)
3725 {
3726 struct mbuf *m;
3727
3728 _MGET(m, wait, type);
3729 if (m != NULL) {
3730 bzero(MTOD(m, caddr_t), MLEN);
3731 }
3732 return m;
3733 }
3734
/*
 * Drop a paired mbuf's reference on its external cluster.  Returns 1
 * when the pairing absorbed the free (caller must NOT release the
 * cluster), or 0 when the unpair has occurred and the caller should
 * drop the cluster reference itself.
 */
static int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	membar_sync();
	if (MEXT_PMBUF(m) == m) {
		volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
		int16_t oprefcnt, prefcnt;

		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		do {
			oprefcnt = *addr;
			prefcnt = oprefcnt - 1;
		} while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));

		if (prefcnt > 1) {
			/* Other paired references remain; nothing to do */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired reference: run the cluster's free routine */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable. atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			atomic_set_ptr(&MEXT_PMBUF(m), NULL);

			/* Restore the standard free routine for the size class */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
3808
/*
 * Free a single mbuf (and any attached external storage), returning the
 * next mbuf in the chain so callers can iterate.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m, NULL);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		uint16_t refcnt;
		uint32_t composite;
		m_ext_free_func_t m_free_func;

		/* A paired mbuf may be fully absorbed by the pairing logic */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}

		refcnt = m_decref(m);
		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
		m_free_func = m_get_ext_free(m);

		/* Last reference on a non-composite cluster: free it outright */
		if (refcnt == MEXT_MINREF(m) && !composite) {
			/* NULL free routine means a plain 2K cluster */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == MEXT_MINREF(m) && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Reset to a pristine composite (mbuf + cluster) state */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/* Composite object consumed the mbuf; done */
			return n;
		}
	}

	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return n;
}
3895
/*
 * Attach caller-supplied external storage to an mbuf, allocating a new
 * packet-header mbuf when @m is NULL.  Any cluster previously attached
 * to @m is released first (mirroring the teardown logic in m_free()).
 * When @pair is nonzero the mbuf and cluster are paired (EXTF_PAIRED).
 * Returns NULL on allocation failure, or when pairing is requested on
 * an mbuf that is already paired.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster. Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		u_int16_t refcnt;
		u_int32_t composite;
		m_ext_free_func_t m_free_func;

		refcnt = m_decref(m);
		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		m_free_func = m_get_ext_free(m);
		if (refcnt == MEXT_MINREF(m) && !composite) {
			/* NULL free routine means a plain 2K cluster */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == MEXT_MINREF(m) && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/* Allocate a fresh reference structure unless one was recycled */
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
3985
3986 /*
3987 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3988 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3989 */
3990 struct mbuf *
3991 m_getcl(int wait, int type, int flags)
3992 {
3993 struct mbuf *m;
3994 int mcflags = MSLEEPF(wait);
3995 int hdr = (flags & M_PKTHDR);
3996
3997 /* Is this due to a non-blocking retry? If so, then try harder */
3998 if (mcflags & MCR_NOSLEEP) {
3999 mcflags |= MCR_TRYHARD;
4000 }
4001
4002 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4003 if (m != NULL) {
4004 u_int16_t flag;
4005 struct ext_ref *rfa;
4006 void *cl;
4007
4008 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4009 cl = m->m_ext.ext_buf;
4010 rfa = m_get_rfa(m);
4011
4012 ASSERT(cl != NULL && rfa != NULL);
4013 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4014
4015 flag = MEXT_FLAGS(m);
4016
4017 MBUF_INIT(m, hdr, type);
4018 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4019
4020 mtype_stat_inc(type);
4021 mtype_stat_dec(MT_FREE);
4022 }
4023 return m;
4024 }
4025
4026 /* m_mclget() add an mbuf cluster to a normal mbuf */
4027 struct mbuf *
4028 m_mclget(struct mbuf *m, int wait)
4029 {
4030 struct ext_ref *rfa;
4031
4032 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4033 return m;
4034 }
4035
4036 m->m_ext.ext_buf = m_mclalloc(wait);
4037 if (m->m_ext.ext_buf != NULL) {
4038 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4039 } else {
4040 mcache_free(ref_cache, rfa);
4041 }
4042 return m;
4043 }
4044
4045 /* Allocate an mbuf cluster */
4046 caddr_t
4047 m_mclalloc(int wait)
4048 {
4049 int mcflags = MSLEEPF(wait);
4050
4051 /* Is this due to a non-blocking retry? If so, then try harder */
4052 if (mcflags & MCR_NOSLEEP) {
4053 mcflags |= MCR_TRYHARD;
4054 }
4055
4056 return mcache_alloc(m_cache(MC_CL), mcflags);
4057 }
4058
/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
	/* Return the 2KB cluster directly to the cluster cache */
	mcache_free(m_cache(MC_CL), p);
}
4065
4066 /*
4067 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
4068 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4069 */
4070 int
4071 m_mclhasreference(struct mbuf *m)
4072 {
4073 if (!(m->m_flags & M_EXT)) {
4074 return 0;
4075 }
4076
4077 ASSERT(m_get_rfa(m) != NULL);
4078
4079 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4080 }
4081
4082 __private_extern__ caddr_t
4083 m_bigalloc(int wait)
4084 {
4085 int mcflags = MSLEEPF(wait);
4086
4087 /* Is this due to a non-blocking retry? If so, then try harder */
4088 if (mcflags & MCR_NOSLEEP) {
4089 mcflags |= MCR_TRYHARD;
4090 }
4091
4092 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4093 }
4094
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	/*
	 * Free a 4KB cluster.  The (buf, size, arg) signature matches the
	 * ext free callback shape so this routine can be compared against
	 * m_get_ext_free() (see e.g. m_freem_list).
	 */
	mcache_free(m_cache(MC_BIGCL), p);
}
4100
4101 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
4102 __private_extern__ struct mbuf *
4103 m_mbigget(struct mbuf *m, int wait)
4104 {
4105 struct ext_ref *rfa;
4106
4107 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4108 return m;
4109 }
4110
4111 m->m_ext.ext_buf = m_bigalloc(wait);
4112 if (m->m_ext.ext_buf != NULL) {
4113 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4114 } else {
4115 mcache_free(ref_cache, rfa);
4116 }
4117 return m;
4118 }
4119
4120 __private_extern__ caddr_t
4121 m_16kalloc(int wait)
4122 {
4123 int mcflags = MSLEEPF(wait);
4124
4125 /* Is this due to a non-blocking retry? If so, then try harder */
4126 if (mcflags & MCR_NOSLEEP) {
4127 mcflags |= MCR_TRYHARD;
4128 }
4129
4130 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4131 }
4132
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	/*
	 * Free a 16KB (jumbo) cluster.  Signature matches the ext free
	 * callback shape so it can be compared against m_get_ext_free().
	 */
	mcache_free(m_cache(MC_16KCL), p);
}
4138
4139 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
4140 __private_extern__ struct mbuf *
4141 m_m16kget(struct mbuf *m, int wait)
4142 {
4143 struct ext_ref *rfa;
4144
4145 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4146 return m;
4147 }
4148
4149 m->m_ext.ext_buf = m_16kalloc(wait);
4150 if (m->m_ext.ext_buf != NULL) {
4151 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4152 } else {
4153 mcache_free(ref_cache, rfa);
4154 }
4155 return m;
4156 }
4157
4158 /*
4159 * "Move" mbuf pkthdr from "from" to "to".
4160 * "from" must have M_PKTHDR set, and "to" must be empty.
4161 */
4162 void
4163 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4164 {
4165 VERIFY(from->m_flags & M_PKTHDR);
4166
4167 /* Check for scratch area overflow */
4168 m_redzone_verify(from);
4169
4170 if (to->m_flags & M_PKTHDR) {
4171 /* Check for scratch area overflow */
4172 m_redzone_verify(to);
4173 /* We will be taking over the tags of 'to' */
4174 m_tag_delete_chain(to, NULL);
4175 }
4176 to->m_pkthdr = from->m_pkthdr; /* especially tags */
4177 m_classifier_init(from, 0); /* purge classifier info */
4178 m_tag_init(from, 1); /* purge all tags from src */
4179 m_scratch_init(from); /* clear src scratch area */
4180 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4181 if ((to->m_flags & M_EXT) == 0) {
4182 to->m_data = to->m_pktdat;
4183 }
4184 m_redzone_init(to); /* setup red zone on dst */
4185 }
4186
4187 /*
4188 * Duplicate "from"'s mbuf pkthdr in "to".
4189 * "from" must have M_PKTHDR set, and "to" must be empty.
4190 * In particular, this does a deep copy of the packet tags.
4191 */
4192 int
4193 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4194 {
4195 VERIFY(from->m_flags & M_PKTHDR);
4196
4197 /* Check for scratch area overflow */
4198 m_redzone_verify(from);
4199
4200 if (to->m_flags & M_PKTHDR) {
4201 /* Check for scratch area overflow */
4202 m_redzone_verify(to);
4203 /* We will be taking over the tags of 'to' */
4204 m_tag_delete_chain(to, NULL);
4205 }
4206 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4207 if ((to->m_flags & M_EXT) == 0) {
4208 to->m_data = to->m_pktdat;
4209 }
4210 to->m_pkthdr = from->m_pkthdr;
4211 m_redzone_init(to); /* setup red zone on dst */
4212 m_tag_init(to, 0); /* preserve dst static tags */
4213 return m_tag_copy_chain(to, from, how);
4214 }
4215
/* Copy the pf tag from one mbuf's metadata area to another's */
void
m_copy_pftag(struct mbuf *to, struct mbuf *from)
{
	memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
#if PF_ECN
	/* The header pointer refers to 'from'; do not carry it across */
	m_pftag(to)->pftag_hdr = NULL;
	m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
#endif /* PF_ECN */
}
4225
/* Copy the NECP tag from one mbuf's metadata area to another's */
void
m_copy_necptag(struct mbuf *to, struct mbuf *from)
{
	memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
}
4231
/*
 * Reset a pkthdr mbuf's classifier fields.  pktf_mask selects which
 * pkt_flags the caller wants preserved; everything else is cleared,
 * except for the PKTF_LOOP / PKTF_IFAINFO / PKTF_TS_VALID dependent
 * fields noted below.
 */
void
m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_proto = 0;
	m->m_pkthdr.pkt_flowsrc = 0;
	m->m_pkthdr.pkt_flowid = 0;
	m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
	/* preserve service class and interface info for loopback packets */
	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		(void) m_set_service_class(m, MBUF_SC_BE);
	}
	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
		m->m_pkthdr.pkt_ifainfo = 0;
	}
	/*
	 * Preserve timestamp if requested
	 */
	if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
		m->m_pkthdr.pkt_timestamp = 0;
	}
}
4255
/*
 * Copy the classifier metadata (protocol, flow info, flags and service
 * class) from one pkthdr mbuf to another.  Both must have M_PKTHDR set.
 */
void
m_copy_classifier(struct mbuf *to, struct mbuf *from)
{
	VERIFY(to->m_flags & M_PKTHDR);
	VERIFY(from->m_flags & M_PKTHDR);

	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
	/* set via accessor since service class interacts with pkt fields */
	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
}
4270
4271 /*
4272 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4273 * if wantall is not set, return whatever number were available. Set up the
4274 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4275 * are chained on the m_nextpkt field. Any packets requested beyond this
4276 * are chained onto the last packet header's m_next field. The size of
4277 * the cluster is controlled by the parameter bufsize.
4278 */
4279 __private_extern__ struct mbuf *
4280 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4281 int wait, int wantall, size_t bufsize)
4282 {
4283 struct mbuf *m;
4284 struct mbuf **np, *top;
4285 unsigned int pnum, needed = *num_needed;
4286 mcache_obj_t *mp_list = NULL;
4287 int mcflags = MSLEEPF(wait);
4288 u_int16_t flag;
4289 struct ext_ref *rfa;
4290 mcache_t *cp;
4291 void *cl;
4292
4293 ASSERT(bufsize == m_maxsize(MC_CL) ||
4294 bufsize == m_maxsize(MC_BIGCL) ||
4295 bufsize == m_maxsize(MC_16KCL));
4296
4297 /*
4298 * Caller must first check for njcl because this
4299 * routine is internal and not exposed/used via KPI.
4300 */
4301 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4302
4303 top = NULL;
4304 np = ⊤
4305 pnum = 0;
4306
4307 /*
4308 * The caller doesn't want all the requested buffers; only some.
4309 * Try hard to get what we can, but don't block. This effectively
4310 * overrides MCR_SLEEP, since this thread will not go to sleep
4311 * if we can't get all the buffers.
4312 */
4313 if (!wantall || (mcflags & MCR_NOSLEEP)) {
4314 mcflags |= MCR_TRYHARD;
4315 }
4316
4317 /* Allocate the composite mbuf + cluster elements from the cache */
4318 if (bufsize == m_maxsize(MC_CL)) {
4319 cp = m_cache(MC_MBUF_CL);
4320 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4321 cp = m_cache(MC_MBUF_BIGCL);
4322 } else {
4323 cp = m_cache(MC_MBUF_16KCL);
4324 }
4325 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4326
4327 for (pnum = 0; pnum < needed; pnum++) {
4328 m = (struct mbuf *)mp_list;
4329 mp_list = mp_list->obj_next;
4330
4331 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4332 cl = m->m_ext.ext_buf;
4333 rfa = m_get_rfa(m);
4334
4335 ASSERT(cl != NULL && rfa != NULL);
4336 VERIFY(MBUF_IS_COMPOSITE(m));
4337
4338 flag = MEXT_FLAGS(m);
4339
4340 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4341 if (bufsize == m_maxsize(MC_16KCL)) {
4342 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4343 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4344 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4345 } else {
4346 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4347 }
4348
4349 if (num_with_pkthdrs > 0) {
4350 --num_with_pkthdrs;
4351 }
4352
4353 *np = m;
4354 if (num_with_pkthdrs > 0) {
4355 np = &m->m_nextpkt;
4356 } else {
4357 np = &m->m_next;
4358 }
4359 }
4360 ASSERT(pnum != *num_needed || mp_list == NULL);
4361 if (mp_list != NULL) {
4362 mcache_free_ext(cp, mp_list);
4363 }
4364
4365 if (pnum > 0) {
4366 mtype_stat_add(MT_DATA, pnum);
4367 mtype_stat_sub(MT_FREE, pnum);
4368 }
4369
4370 if (wantall && (pnum != *num_needed)) {
4371 if (top != NULL) {
4372 m_freem_list(top);
4373 }
4374 return NULL;
4375 }
4376
4377 if (pnum > *num_needed) {
4378 printf("%s: File a radar related to <rdar://10146739>. \
4379 needed = %u, pnum = %u, num_needed = %u \n",
4380 __func__, needed, pnum, *num_needed);
4381 }
4382
4383 *num_needed = pnum;
4384 return top;
4385 }
4386
4387 /*
4388 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
4389 * wantall is not set, return whatever number were available. The size of
4390 * each mbuf in the list is controlled by the parameter packetlen. Each
4391 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
4392 * in the chain is called a segment. If maxsegments is not null and the
4393 * value pointed to is not null, this specify the maximum number of segments
4394 * for a chain of mbufs. If maxsegments is zero or the value pointed to
4395 * is zero the caller does not have any restriction on the number of segments.
4396 * The actual number of segments of a mbuf chain is return in the value
4397 * pointed to by maxsegments.
4398 */
4399 __private_extern__ struct mbuf *
4400 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4401 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4402 {
4403 struct mbuf **np, *top, *first = NULL;
4404 size_t bufsize, r_bufsize;
4405 unsigned int num = 0;
4406 unsigned int nsegs = 0;
4407 unsigned int needed, resid;
4408 int mcflags = MSLEEPF(wait);
4409 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4410 mcache_t *cp = NULL, *rcp = NULL;
4411
4412 if (*numlist == 0) {
4413 return NULL;
4414 }
4415
4416 top = NULL;
4417 np = ⊤
4418
4419 if (wantsize == 0) {
4420 if (packetlen <= MINCLSIZE) {
4421 bufsize = packetlen;
4422 } else if (packetlen > m_maxsize(MC_CL)) {
4423 /* Use 4KB if jumbo cluster pool isn't available */
4424 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
4425 bufsize = m_maxsize(MC_BIGCL);
4426 } else {
4427 bufsize = m_maxsize(MC_16KCL);
4428 }
4429 } else {
4430 bufsize = m_maxsize(MC_CL);
4431 }
4432 } else if (wantsize == m_maxsize(MC_CL) ||
4433 wantsize == m_maxsize(MC_BIGCL) ||
4434 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4435 bufsize = wantsize;
4436 } else {
4437 *numlist = 0;
4438 return NULL;
4439 }
4440
4441 if (bufsize <= MHLEN) {
4442 nsegs = 1;
4443 } else if (bufsize <= MINCLSIZE) {
4444 if (maxsegments != NULL && *maxsegments == 1) {
4445 bufsize = m_maxsize(MC_CL);
4446 nsegs = 1;
4447 } else {
4448 nsegs = 2;
4449 }
4450 } else if (bufsize == m_maxsize(MC_16KCL)) {
4451 VERIFY(njcl > 0);
4452 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4453 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4454 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4455 } else {
4456 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4457 }
4458 if (maxsegments != NULL) {
4459 if (*maxsegments && nsegs > *maxsegments) {
4460 *maxsegments = nsegs;
4461 *numlist = 0;
4462 return NULL;
4463 }
4464 *maxsegments = nsegs;
4465 }
4466
4467 /*
4468 * The caller doesn't want all the requested buffers; only some.
4469 * Try hard to get what we can, but don't block. This effectively
4470 * overrides MCR_SLEEP, since this thread will not go to sleep
4471 * if we can't get all the buffers.
4472 */
4473 if (!wantall || (mcflags & MCR_NOSLEEP)) {
4474 mcflags |= MCR_TRYHARD;
4475 }
4476
4477 /*
4478 * Simple case where all elements in the lists/chains are mbufs.
4479 * Unless bufsize is greater than MHLEN, each segment chain is made
4480 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
4481 * of 2 mbufs; the second one is used for the residual data, i.e.
4482 * the remaining data that cannot fit into the first mbuf.
4483 */
4484 if (bufsize <= MINCLSIZE) {
4485 /* Allocate the elements in one shot from the mbuf cache */
4486 ASSERT(bufsize <= MHLEN || nsegs == 2);
4487 cp = m_cache(MC_MBUF);
4488 needed = mcache_alloc_ext(cp, &mp_list,
4489 (*numlist) * nsegs, mcflags);
4490
4491 /*
4492 * The number of elements must be even if we are to use an
4493 * mbuf (instead of a cluster) to store the residual data.
4494 * If we couldn't allocate the requested number of mbufs,
4495 * trim the number down (if it's odd) in order to avoid
4496 * creating a partial segment chain.
4497 */
4498 if (bufsize > MHLEN && (needed & 0x1)) {
4499 needed--;
4500 }
4501
4502 while (num < needed) {
4503 struct mbuf *m;
4504
4505 m = (struct mbuf *)mp_list;
4506 mp_list = mp_list->obj_next;
4507 ASSERT(m != NULL);
4508
4509 MBUF_INIT(m, 1, MT_DATA);
4510 num++;
4511 if (bufsize > MHLEN) {
4512 /* A second mbuf for this segment chain */
4513 m->m_next = (struct mbuf *)mp_list;
4514 mp_list = mp_list->obj_next;
4515 ASSERT(m->m_next != NULL);
4516
4517 MBUF_INIT(m->m_next, 0, MT_DATA);
4518 num++;
4519 }
4520 *np = m;
4521 np = &m->m_nextpkt;
4522 }
4523 ASSERT(num != *numlist || mp_list == NULL);
4524
4525 if (num > 0) {
4526 mtype_stat_add(MT_DATA, num);
4527 mtype_stat_sub(MT_FREE, num);
4528 }
4529 num /= nsegs;
4530
4531 /* We've got them all; return to caller */
4532 if (num == *numlist) {
4533 return top;
4534 }
4535
4536 goto fail;
4537 }
4538
4539 /*
4540 * Complex cases where elements are made up of one or more composite
4541 * mbufs + cluster, depending on packetlen. Each N-segment chain can
4542 * be illustrated as follows:
4543 *
4544 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4545 *
4546 * Every composite mbuf + cluster element comes from the intermediate
4547 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4548 * the last composite element will come from the MC_MBUF_CL cache,
4549 * unless the residual data is larger than 2KB where we use the
4550 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4551 * data is defined as extra data beyond the first element that cannot
4552 * fit into the previous element, i.e. there is no residual data if
4553 * the chain only has 1 segment.
4554 */
4555 r_bufsize = bufsize;
4556 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4557 if (resid > 0) {
4558 /* There is residual data; figure out the cluster size */
4559 if (wantsize == 0 && packetlen > MINCLSIZE) {
4560 /*
4561 * Caller didn't request that all of the segments
4562 * in the chain use the same cluster size; use the
4563 * smaller of the cluster sizes.
4564 */
4565 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
4566 r_bufsize = m_maxsize(MC_16KCL);
4567 } else if (resid > m_maxsize(MC_CL)) {
4568 r_bufsize = m_maxsize(MC_BIGCL);
4569 } else {
4570 r_bufsize = m_maxsize(MC_CL);
4571 }
4572 } else {
4573 /* Use the same cluster size as the other segments */
4574 resid = 0;
4575 }
4576 }
4577
4578 needed = *numlist;
4579 if (resid > 0) {
4580 /*
4581 * Attempt to allocate composite mbuf + cluster elements for
4582 * the residual data in each chain; record the number of such
4583 * elements that can be allocated so that we know how many
4584 * segment chains we can afford to create.
4585 */
4586 if (r_bufsize <= m_maxsize(MC_CL)) {
4587 rcp = m_cache(MC_MBUF_CL);
4588 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
4589 rcp = m_cache(MC_MBUF_BIGCL);
4590 } else {
4591 rcp = m_cache(MC_MBUF_16KCL);
4592 }
4593 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4594
4595 if (needed == 0) {
4596 goto fail;
4597 }
4598
4599 /* This is temporarily reduced for calculation */
4600 ASSERT(nsegs > 1);
4601 nsegs--;
4602 }
4603
4604 /*
4605 * Attempt to allocate the rest of the composite mbuf + cluster
4606 * elements for the number of segment chains that we need.
4607 */
4608 if (bufsize <= m_maxsize(MC_CL)) {
4609 cp = m_cache(MC_MBUF_CL);
4610 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
4611 cp = m_cache(MC_MBUF_BIGCL);
4612 } else {
4613 cp = m_cache(MC_MBUF_16KCL);
4614 }
4615 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4616
4617 /* Round it down to avoid creating a partial segment chain */
4618 needed = (needed / nsegs) * nsegs;
4619 if (needed == 0) {
4620 goto fail;
4621 }
4622
4623 if (resid > 0) {
4624 /*
4625 * We're about to construct the chain(s); take into account
4626 * the number of segments we have created above to hold the
4627 * residual data for each chain, as well as restore the
4628 * original count of segments per chain.
4629 */
4630 ASSERT(nsegs > 0);
4631 needed += needed / nsegs;
4632 nsegs++;
4633 }
4634
4635 for (;;) {
4636 struct mbuf *m;
4637 u_int16_t flag;
4638 struct ext_ref *rfa;
4639 void *cl;
4640 int pkthdr;
4641 m_ext_free_func_t m_free_func;
4642
4643 ++num;
4644 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4645 m = (struct mbuf *)mp_list;
4646 mp_list = mp_list->obj_next;
4647 } else {
4648 m = (struct mbuf *)rmp_list;
4649 rmp_list = rmp_list->obj_next;
4650 }
4651 m_free_func = m_get_ext_free(m);
4652 ASSERT(m != NULL);
4653 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4654 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4655 m_free_func == m_16kfree);
4656
4657 cl = m->m_ext.ext_buf;
4658 rfa = m_get_rfa(m);
4659
4660 ASSERT(cl != NULL && rfa != NULL);
4661 VERIFY(MBUF_IS_COMPOSITE(m));
4662
4663 flag = MEXT_FLAGS(m);
4664
4665 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4666 if (pkthdr) {
4667 first = m;
4668 }
4669 MBUF_INIT(m, pkthdr, MT_DATA);
4670 if (m_free_func == m_16kfree) {
4671 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4672 } else if (m_free_func == m_bigfree) {
4673 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4674 } else {
4675 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4676 }
4677
4678 *np = m;
4679 if ((num % nsegs) == 0) {
4680 np = &first->m_nextpkt;
4681 } else {
4682 np = &m->m_next;
4683 }
4684
4685 if (num == needed) {
4686 break;
4687 }
4688 }
4689
4690 if (num > 0) {
4691 mtype_stat_add(MT_DATA, num);
4692 mtype_stat_sub(MT_FREE, num);
4693 }
4694
4695 num /= nsegs;
4696
4697 /* We've got them all; return to caller */
4698 if (num == *numlist) {
4699 ASSERT(mp_list == NULL && rmp_list == NULL);
4700 return top;
4701 }
4702
4703 fail:
4704 /* Free up what's left of the above */
4705 if (mp_list != NULL) {
4706 mcache_free_ext(cp, mp_list);
4707 }
4708 if (rmp_list != NULL) {
4709 mcache_free_ext(rcp, rmp_list);
4710 }
4711 if (wantall && top != NULL) {
4712 m_freem_list(top);
4713 *numlist = 0;
4714 return NULL;
4715 }
4716 *numlist = num;
4717 return top;
4718 }
4719
4720 /*
4721 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4722 * packets on receive ring.
4723 */
4724 __private_extern__ struct mbuf *
4725 m_getpacket_how(int wait)
4726 {
4727 unsigned int num_needed = 1;
4728
4729 return m_getpackets_internal(&num_needed, 1, wait, 1,
4730 m_maxsize(MC_CL));
4731 }
4732
4733 /*
4734 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4735 * packets on receive ring.
4736 */
4737 struct mbuf *
4738 m_getpacket(void)
4739 {
4740 unsigned int num_needed = 1;
4741
4742 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4743 m_maxsize(MC_CL));
4744 }
4745
4746 /*
4747 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4748 * if this can't be met, return whatever number were available. Set up the
4749 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4750 * are chained on the m_nextpkt field. Any packets requested beyond this are
4751 * chained onto the last packet header's m_next field.
4752 */
4753 struct mbuf *
4754 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4755 {
4756 unsigned int n = num_needed;
4757
4758 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4759 m_maxsize(MC_CL));
4760 }
4761
4762 /*
4763 * Return a list of mbuf hdrs set up as packet hdrs chained together
4764 * on the m_nextpkt field
4765 */
4766 struct mbuf *
4767 m_getpackethdrs(int num_needed, int how)
4768 {
4769 struct mbuf *m;
4770 struct mbuf **np, *top;
4771
4772 top = NULL;
4773 np = ⊤
4774
4775 while (num_needed--) {
4776 m = _M_RETRYHDR(how, MT_DATA);
4777 if (m == NULL) {
4778 break;
4779 }
4780
4781 *np = m;
4782 np = &m->m_nextpkt;
4783 }
4784
4785 return top;
4786 }
4787
4788 /*
4789 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4790 * for mbufs packets freed. Used by the drivers.
4791 */
4792 int
4793 m_freem_list(struct mbuf *m)
4794 {
4795 struct mbuf *nextpkt;
4796 mcache_obj_t *mp_list = NULL;
4797 mcache_obj_t *mcl_list = NULL;
4798 mcache_obj_t *mbc_list = NULL;
4799 mcache_obj_t *m16k_list = NULL;
4800 mcache_obj_t *m_mcl_list = NULL;
4801 mcache_obj_t *m_mbc_list = NULL;
4802 mcache_obj_t *m_m16k_list = NULL;
4803 mcache_obj_t *ref_list = NULL;
4804 int pktcount = 0;
4805 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4806
4807 while (m != NULL) {
4808 pktcount++;
4809
4810 nextpkt = m->m_nextpkt;
4811 m->m_nextpkt = NULL;
4812
4813 while (m != NULL) {
4814 struct mbuf *next = m->m_next;
4815 mcache_obj_t *o, *rfa;
4816 u_int32_t composite;
4817 u_int16_t refcnt;
4818 m_ext_free_func_t m_free_func;
4819
4820 if (m->m_type == MT_FREE) {
4821 panic("m_free: freeing an already freed mbuf");
4822 }
4823
4824 if (m->m_flags & M_PKTHDR) {
4825 /* Check for scratch area overflow */
4826 m_redzone_verify(m);
4827 /* Free the aux data and tags if there is any */
4828 m_tag_delete_chain(m, NULL);
4829 m_do_tx_compl_callback(m, NULL);
4830 }
4831
4832 if (!(m->m_flags & M_EXT)) {
4833 mt_free++;
4834 goto simple_free;
4835 }
4836
4837 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4838 m = next;
4839 continue;
4840 }
4841
4842 mt_free++;
4843
4844 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4845 refcnt = m_decref(m);
4846 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4847 m_free_func = m_get_ext_free(m);
4848 if (refcnt == MEXT_MINREF(m) && !composite) {
4849 if (m_free_func == NULL) {
4850 o->obj_next = mcl_list;
4851 mcl_list = o;
4852 } else if (m_free_func == m_bigfree) {
4853 o->obj_next = mbc_list;
4854 mbc_list = o;
4855 } else if (m_free_func == m_16kfree) {
4856 o->obj_next = m16k_list;
4857 m16k_list = o;
4858 } else {
4859 (*(m_free_func))((caddr_t)o,
4860 m->m_ext.ext_size,
4861 m_get_ext_arg(m));
4862 }
4863 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4864 rfa->obj_next = ref_list;
4865 ref_list = rfa;
4866 m_set_ext(m, NULL, NULL, NULL);
4867 } else if (refcnt == MEXT_MINREF(m) && composite) {
4868 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4869 VERIFY(m->m_type != MT_FREE);
4870 /*
4871 * Amortize the costs of atomic operations
4872 * by doing them at the end, if possible.
4873 */
4874 if (m->m_type == MT_DATA) {
4875 mt_data++;
4876 } else if (m->m_type == MT_HEADER) {
4877 mt_header++;
4878 } else if (m->m_type == MT_SONAME) {
4879 mt_soname++;
4880 } else if (m->m_type == MT_TAG) {
4881 mt_tag++;
4882 } else {
4883 mtype_stat_dec(m->m_type);
4884 }
4885
4886 m->m_type = MT_FREE;
4887 m->m_flags = M_EXT;
4888 m->m_len = 0;
4889 m->m_next = m->m_nextpkt = NULL;
4890
4891 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4892
4893 /* "Free" into the intermediate cache */
4894 o = (mcache_obj_t *)m;
4895 if (m_free_func == NULL) {
4896 o->obj_next = m_mcl_list;
4897 m_mcl_list = o;
4898 } else if (m_free_func == m_bigfree) {
4899 o->obj_next = m_mbc_list;
4900 m_mbc_list = o;
4901 } else {
4902 VERIFY(m_free_func == m_16kfree);
4903 o->obj_next = m_m16k_list;
4904 m_m16k_list = o;
4905 }
4906 m = next;
4907 continue;
4908 }
4909 simple_free:
4910 /*
4911 * Amortize the costs of atomic operations
4912 * by doing them at the end, if possible.
4913 */
4914 if (m->m_type == MT_DATA) {
4915 mt_data++;
4916 } else if (m->m_type == MT_HEADER) {
4917 mt_header++;
4918 } else if (m->m_type == MT_SONAME) {
4919 mt_soname++;
4920 } else if (m->m_type == MT_TAG) {
4921 mt_tag++;
4922 } else if (m->m_type != MT_FREE) {
4923 mtype_stat_dec(m->m_type);
4924 }
4925
4926 m->m_type = MT_FREE;
4927 m->m_flags = m->m_len = 0;
4928 m->m_next = m->m_nextpkt = NULL;
4929
4930 ((mcache_obj_t *)m)->obj_next = mp_list;
4931 mp_list = (mcache_obj_t *)m;
4932
4933 m = next;
4934 }
4935
4936 m = nextpkt;
4937 }
4938
4939 if (mt_free > 0) {
4940 mtype_stat_add(MT_FREE, mt_free);
4941 }
4942 if (mt_data > 0) {
4943 mtype_stat_sub(MT_DATA, mt_data);
4944 }
4945 if (mt_header > 0) {
4946 mtype_stat_sub(MT_HEADER, mt_header);
4947 }
4948 if (mt_soname > 0) {
4949 mtype_stat_sub(MT_SONAME, mt_soname);
4950 }
4951 if (mt_tag > 0) {
4952 mtype_stat_sub(MT_TAG, mt_tag);
4953 }
4954
4955 if (mp_list != NULL) {
4956 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4957 }
4958 if (mcl_list != NULL) {
4959 mcache_free_ext(m_cache(MC_CL), mcl_list);
4960 }
4961 if (mbc_list != NULL) {
4962 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4963 }
4964 if (m16k_list != NULL) {
4965 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4966 }
4967 if (m_mcl_list != NULL) {
4968 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4969 }
4970 if (m_mbc_list != NULL) {
4971 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4972 }
4973 if (m_m16k_list != NULL) {
4974 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4975 }
4976 if (ref_list != NULL) {
4977 mcache_free_ext(ref_cache, ref_list);
4978 }
4979
4980 return pktcount;
4981 }
4982
4983 void
4984 m_freem(struct mbuf *m)
4985 {
4986 while (m != NULL) {
4987 m = m_free(m);
4988 }
4989 }
4990
4991 /*
4992 * Mbuffer utility routines.
4993 */
4994 /*
4995 * Set the m_data pointer of a newly allocated mbuf to place an object of the
4996 * specified size at the end of the mbuf, longword aligned.
4997 *
4998 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
4999 * separate macros, each asserting that it was called at the proper moment.
5000 * This required callers to themselves test the storage type and call the
5001 * right one. Rather than require callers to be aware of those layout
5002 * decisions, we centralize here.
5003 */
5004 void
5005 m_align(struct mbuf *m, int len)
5006 {
5007 int adjust = 0;
5008
5009 /* At this point data must point to start */
5010 VERIFY(m->m_data == M_START(m));
5011 VERIFY(len >= 0);
5012 VERIFY(len <= M_SIZE(m));
5013 adjust = M_SIZE(m) - len;
5014 m->m_data += adjust & ~(sizeof(long) - 1);
5015 }
5016
5017 /*
5018 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
5019 * copy junk along. Does not adjust packet header length.
5020 */
5021 struct mbuf *
5022 m_prepend(struct mbuf *m, int len, int how)
5023 {
5024 struct mbuf *mn;
5025
5026 _MGET(mn, how, m->m_type);
5027 if (mn == NULL) {
5028 m_freem(m);
5029 return NULL;
5030 }
5031 if (m->m_flags & M_PKTHDR) {
5032 M_COPY_PKTHDR(mn, m);
5033 m->m_flags &= ~M_PKTHDR;
5034 }
5035 mn->m_next = m;
5036 m = mn;
5037 if (m->m_flags & M_PKTHDR) {
5038 VERIFY(len <= MHLEN);
5039 MH_ALIGN(m, len);
5040 } else {
5041 VERIFY(len <= MLEN);
5042 M_ALIGN(m, len);
5043 }
5044 m->m_len = len;
5045 return m;
5046 }
5047
5048 /*
5049 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
5050 * chain, copy junk along, and adjust length.
5051 */
5052 struct mbuf *
5053 m_prepend_2(struct mbuf *m, int len, int how, int align)
5054 {
5055 if (M_LEADINGSPACE(m) >= len &&
5056 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
5057 m->m_data -= len;
5058 m->m_len += len;
5059 } else {
5060 m = m_prepend(m, len, how);
5061 }
5062 if ((m) && (m->m_flags & M_PKTHDR)) {
5063 m->m_pkthdr.len += len;
5064 }
5065 return m;
5066 }
5067
5068 /*
5069 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
5070 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
5071 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
5072 */
5073 int MCFail;
5074
5075 struct mbuf *
5076 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
5077 {
5078 struct mbuf *n, *mhdr = NULL, **np;
5079 int off = off0;
5080 struct mbuf *top;
5081 int copyhdr = 0;
5082
5083 if (off < 0 || len < 0) {
5084 panic("m_copym: invalid offset %d or len %d", off, len);
5085 }
5086
5087 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
5088 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
5089
5090 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
5091 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
5092 mhdr = m;
5093 copyhdr = 1;
5094 }
5095
5096 while (off >= m->m_len) {
5097 if (m->m_next == NULL) {
5098 panic("m_copym: invalid mbuf chain");
5099 }
5100 off -= m->m_len;
5101 m = m->m_next;
5102 }
5103 np = ⊤
5104 top = NULL;
5105
5106 while (len > 0) {
5107 if (m == NULL) {
5108 if (len != M_COPYALL) {
5109 panic("m_copym: len != M_COPYALL");
5110 }
5111 break;
5112 }
5113
5114 if (copyhdr) {
5115 n = _M_RETRYHDR(wait, m->m_type);
5116 } else {
5117 n = _M_RETRY(wait, m->m_type);
5118 }
5119 *np = n;
5120
5121 if (n == NULL) {
5122 goto nospace;
5123 }
5124
5125 if (copyhdr != 0) {
5126 if ((mode == M_COPYM_MOVE_HDR) ||
5127 (mode == M_COPYM_MUST_MOVE_HDR)) {
5128 M_COPY_PKTHDR(n, mhdr);
5129 } else if ((mode == M_COPYM_COPY_HDR) ||
5130 (mode == M_COPYM_MUST_COPY_HDR)) {
5131 if (m_dup_pkthdr(n, mhdr, wait) == 0) {
5132 goto nospace;
5133 }
5134 }
5135 if (len == M_COPYALL) {
5136 n->m_pkthdr.len -= off0;
5137 } else {
5138 n->m_pkthdr.len = len;
5139 }
5140 copyhdr = 0;
5141 /*
5142 * There is data to copy from the packet header mbuf
5143 * if it is empty or it is before the starting offset
5144 */
5145 if (mhdr != m) {
5146 np = &n->m_next;
5147 continue;
5148 }
5149 }
5150 n->m_len = MIN(len, (m->m_len - off));
5151 if (m->m_flags & M_EXT) {
5152 n->m_ext = m->m_ext;
5153 m_incref(m);
5154 n->m_data = m->m_data + off;
5155 n->m_flags |= M_EXT;
5156 } else {
5157 /*
5158 * Limit to the capacity of the destination
5159 */
5160 if (n->m_flags & M_PKTHDR) {
5161 n->m_len = MIN(n->m_len, MHLEN);
5162 } else {
5163 n->m_len = MIN(n->m_len, MLEN);
5164 }
5165
5166 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
5167 panic("%s n %p copy overflow",
5168 __func__, n);
5169 }
5170
5171 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
5172 (unsigned)n->m_len);
5173 }
5174 if (len != M_COPYALL) {
5175 len -= n->m_len;
5176 }
5177 off = 0;
5178 m = m->m_next;
5179 np = &n->m_next;
5180 }
5181
5182 if (top == NULL) {
5183 MCFail++;
5184 }
5185
5186 return top;
5187 nospace:
5188
5189 m_freem(top);
5190 MCFail++;
5191 return NULL;
5192 }
5193
5194
5195 struct mbuf *
5196 m_copym(struct mbuf *m, int off0, int len, int wait)
5197 {
5198 return m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR);
5199 }
5200
5201 /*
5202 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5203 * within this routine also, the last mbuf and offset accessed are passed
5204 * out and can be passed back in to avoid having to rescan the entire mbuf
5205 * list (normally hung off of the socket)
5206 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
	int mcflags = MSLEEPF(wait);
	int copyhdr = 0;
	int type = 0;
	mcache_obj_t *list = NULL;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/*
	 * Resume from the cached position if the caller provided one;
	 * otherwise walk the chain to the starting offset.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		m = *m_lastm;
		off = *m_off;
	} else {
		while (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		}
	}

	/* First pass: count how many mbufs the copy will need. */
	n = m;
	while (len > 0) {
		needed++;
		ASSERT(n != NULL);
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	/* One extra for the leading header mbuf allocated below. */
	needed++;
	len = len0;

	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	/* Allocate all needed mbufs up front in one batched request. */
	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}

	needed = 0;
	while (len > 0) {
		/* Peel the next pre-allocated mbuf off the batch list. */
		n = (struct mbuf *)list;
		list = list->obj_next;
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);

		if (top == NULL) {
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			/* Move or duplicate the pkthdr as requested. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the cluster instead of copying its bytes. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Remember where we stopped for the next call. */
			if (m_lastm != NULL && m_off != NULL) {
				if ((off + n->m_len) == m->m_len) {
					*m_lastm = m->m_next;
					*m_off = 0;
				} else {
					*m_lastm = m;
					*m_off = off + n->m_len;
				}
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	/* Account for the allocated mbufs in the per-type statistics. */
	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(list == NULL);
	return top;

nospace:
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
	if (top != NULL) {
		m_freem(top);
	}
	MCFail++;
	return NULL;
}
5341
5342 /*
5343 * Copy data from an mbuf chain starting "off" bytes from the beginning,
5344 * continuing for "len" bytes, into the indicated buffer.
5345 */
void
m_copydata(struct mbuf *m, int off, int len, void *vp)
{
	int off0 = off, len0 = len;	/* saved for panic diagnostics only */
	struct mbuf *m0 = m;
	unsigned count;
	char *cp = vp;

	if (__improbable(off < 0 || len < 0)) {
		panic("%s: invalid offset %d or len %d", __func__, off, len);
		/* NOTREACHED */
	}

	/* Advance to the mbuf containing the starting offset. */
	while (off > 0) {
		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len0);
			/* NOTREACHED */
		}
		if (off < m->m_len) {
			break;
		}
		off -= m->m_len;
		m = m->m_next;
	}
	/* Copy out "len" bytes, walking the chain as needed. */
	while (len > 0) {
		if (__improbable(m == NULL)) {
			panic("%s: invalid mbuf chain %p [off %d, len %d]",
			    __func__, m0, off0, len0);
			/* NOTREACHED */
		}
		count = MIN(m->m_len - off, len);
		bcopy(MTOD(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}
5385
5386 /*
5387 * Concatenate mbuf chain n to m. Both chains must be of the same type
5388 * (e.g. MT_DATA). Any m_pkthdr is not updated.
5389 */
5390 void
5391 m_cat(struct mbuf *m, struct mbuf *n)
5392 {
5393 while (m->m_next) {
5394 m = m->m_next;
5395 }
5396 while (n) {
5397 if ((m->m_flags & M_EXT) ||
5398 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5399 /* just join the two chains */
5400 m->m_next = n;
5401 return;
5402 }
5403 /* splat the data from one into the other */
5404 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5405 (u_int)n->m_len);
5406 m->m_len += n->m_len;
5407 n = m_free(n);
5408 }
5409 }
5410
/*
 * Trim "req_len" bytes of data from the mbuf chain: from the head when
 * req_len is positive, from the tail when it is negative.  Keeps the
 * pkthdr length (if any) in sync with the trimmed chain.
 */
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL) {
		return;
	}
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len -= (req_len - len);
		}
	} else {
		/*
		 * Trim from tail. Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return. Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0) {
				break;
			}
			m = m->m_next;
		}
		/* Fast path: the whole trim fits within the last mbuf. */
		if (m->m_len >= len) {
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len -= len;
			}
			return;
		}
		count -= len;
		if (count < 0) {
			count = 0;
		}
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len = count;
		}
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		/* Zero the lengths of fully-trimmed trailing mbufs. */
		while ((m = m->m_next)) {
			m->m_len = 0;
		}
	}
}
5490
5491 /*
5492 * Rearange an mbuf chain so that len bytes are contiguous
5493 * and in the data area of an mbuf (so that mtod and dtom
5494 * will work for a structure of size len). Returns the resulting
5495 * mbuf chain on success, frees it and returns null on failure.
5496 * If there is room, it will add up to max_protohdr-len extra bytes to the
5497 * contiguous region in an attempt to avoid being called next time.
5498 */
/* Count of m_pullup() failures. */
int MPFail;
5500
5501 struct mbuf *
5502 m_pullup(struct mbuf *n, int len)
5503 {
5504 struct mbuf *m;
5505 int count;
5506 int space;
5507
5508 /* check invalid arguments */
5509 if (n == NULL) {
5510 panic("%s: n == NULL", __func__);
5511 }
5512 if (len < 0) {
5513 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
5514 __func__, len);
5515 goto bad;
5516 }
5517 if (len > MLEN) {
5518 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
5519 __func__, len);
5520 goto bad;
5521 }
5522 if ((n->m_flags & M_EXT) == 0 &&
5523 n->m_data >= &n->m_dat[MLEN]) {
5524 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
5525 __func__);
5526 goto bad;
5527 }
5528
5529 /*
5530 * If first mbuf has no cluster, and has room for len bytes
5531 * without shifting current data, pullup into it,
5532 * otherwise allocate a new mbuf to prepend to the chain.
5533 */
5534 if ((n->m_flags & M_EXT) == 0 &&
5535 len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
5536 if (n->m_len >= len) {
5537 return n;
5538 }
5539 m = n;
5540 n = n->m_next;
5541 len -= m->m_len;
5542 } else {
5543 if (len > MHLEN) {
5544 goto bad;
5545 }
5546 _MGET(m, M_DONTWAIT, n->m_type);
5547 if (m == 0) {
5548 goto bad;
5549 }
5550 m->m_len = 0;
5551 if (n->m_flags & M_PKTHDR) {
5552 M_COPY_PKTHDR(m, n);
5553 n->m_flags &= ~M_PKTHDR;
5554 }
5555 }
5556 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5557 do {
5558 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5559 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5560 (unsigned)count);
5561 len -= count;
5562 m->m_len += count;
5563 n->m_len -= count;
5564 space -= count;
5565 if (n->m_len != 0) {
5566 n->m_data += count;
5567 } else {
5568 n = m_free(n);
5569 }
5570 } while (len > 0 && n != NULL);
5571 if (len > 0) {
5572 (void) m_free(m);
5573 goto bad;
5574 }
5575 m->m_next = n;
5576 return m;
5577 bad:
5578 m_freem(n);
5579 MPFail++;
5580 return 0;
5581 }
5582
5583 /*
5584 * Like m_pullup(), except a new mbuf is always allocated, and we allow
5585 * the amount of empty space before the data in the new mbuf to be specified
5586 * (in the event that the caller expects to prepend later).
5587 */
/* Count of m_copyup() failures. */
__private_extern__ int MSFail = 0;
5589
__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	VERIFY(len >= 0 && dstoff >= 0);

	/* The pulled-up region plus the leading gap must fit in one mbuf. */
	if (len > (MHLEN - dstoff)) {
		goto bad;
	}
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL) {
		goto bad;
	}
	m->m_len = 0;
	if (n->m_flags & M_PKTHDR) {
		/* Transfer the packet header to the new lead mbuf. */
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	/* Reserve dstoff bytes of leading space for later prepends. */
	m->m_data += dstoff;
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len) {
			n->m_data += count;
		} else {
			n = m_free(n);
		}
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	MSFail++;
	return NULL;
}
5637
5638 /*
5639 * Partition an mbuf chain in two pieces, returning the tail --
5640 * all but the first len0 bytes. In case of failure, it returns NULL and
5641 * attempts to restore the chain to its original state.
5642 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	/* Public entry point: always offer to carry over the pkthdr. */
	const int copyhdr = 1;

	return m_split0(m0, len0, wait, copyhdr);
}
5648
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		/* Tail chain needs its own pkthdr mbuf. */
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* Split falls exactly on an mbuf boundary: just unlink. */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster between head and tail chains. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
5752
5753 /*
5754 * Routine to copy from device local memory into mbufs.
5755 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		/*
		 * If 'off' is non-zero, packet is trailer-encapsulated,
		 * so we have to skip the type and length fields.
		 */
		cp += off + 2 * sizeof(u_int16_t);
		totlen -= 2 * sizeof(u_int16_t);
	}
	_MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		return NULL;
	}
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top != NULL) {
			/* Subsequent mbufs are plain (non-header) ones. */
			_MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return NULL;
			}
			m->m_len = MLEN;
		}
		len = MIN(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			/* Large segment: back the mbuf with a cluster. */
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT) {
				m->m_len = len = MIN(len, m_maxsize(MC_CL));
			} else {
				/* give up when it's out of cluster mbufs */
				if (top != NULL) {
					m_freem(top);
				}
				m_freem(m);
				return NULL;
			}
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL &&
				    len + max_linkhdr <= m->m_len) {
					m->m_data += max_linkhdr;
				}
				m->m_len = len;
			} else {
				len = m->m_len;
			}
		}
		if (copy) {
			copy(cp, MTOD(m, caddr_t), (unsigned)len);
		} else {
			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
		}
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		/* Wrap back to the start of the device buffer if needed. */
		if (cp == epkt) {
			cp = buf;
		}
	}
	return top;
}
5835
5836 #ifndef MBUF_GROWTH_NORMAL_THRESH
5837 #define MBUF_GROWTH_NORMAL_THRESH 25
5838 #endif
5839
5840 /*
5841 * Cluster freelist allocation check.
5842 */
/*
 * Given a request for "num" more buffers of size "bufsize" (MC_BIGCL or
 * MC_16KCL), compute how many additional clusters should actually be
 * allocated, honoring pool minimums/maximums and the growth policy.
 * Returns 0 when no growth is needed or possible.
 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/* Pool utilization and fraction of the map already used. */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	return i;
}
5964 /*
5965 * Return the number of bytes in the mbuf chain, m.
5966 */
5967 unsigned int
5968 m_length(struct mbuf *m)
5969 {
5970 struct mbuf *m0;
5971 unsigned int pktlen;
5972
5973 if (m->m_flags & M_PKTHDR) {
5974 return m->m_pkthdr.len;
5975 }
5976
5977 pktlen = 0;
5978 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
5979 pktlen += m0->m_len;
5980 }
5981 return pktlen;
5982 }
5983
5984 /*
5985 * Copy data from a buffer back into the indicated mbuf chain,
5986 * starting "off" bytes from the beginning, extending the mbuf
5987 * chain if necessary.
5988 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL) {
		return;
	}

	/*
	 * Copy with chain extension allowed.  On DEBUG kernels capture the
	 * result and verify below that the head mbuf was not replaced.
	 */
#if DEBUG
	error =
#endif /* DEBUG */
	    m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	if (error != 0 || (m0 != NULL && origm != m0)) {
		panic("m_copyback");
	}
#endif /* DEBUG */
}
6013
6014 struct mbuf *
6015 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
6016 {
6017 int error;
6018
6019 /* don't support chain expansion */
6020 VERIFY(off + len <= m_length(m0));
6021
6022 error = m_copyback0(&m0, off, len, cp,
6023 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
6024 if (error) {
6025 /*
6026 * no way to recover from partial success.
6027 * just free the chain.
6028 */
6029 m_freem(m0);
6030 return NULL;
6031 }
6032 return m0;
6033 }
6034
6035 /*
6036 * m_makewritable: ensure the specified range writable.
6037 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

#if 0 /* M_COPYALL is large enough */
	if (len == M_COPYALL) {
		len = m_length(*mp) - off; /* XXX */
	}
#endif

	/* Copy-on-write the requested range, preserving existing data. */
	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	/* Verify the operation did not change the chain's total length. */
	reslen = 0;
	for (n = *mp; n; n = n->m_next) {
		reslen += n->m_len;
	}
	if (origlen != reslen) {
		panic("m_makewritable: length changed");
	}
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
		panic("m_makewritable: inconsist");
	}
#endif /* DEBUG */

	return error;
}
6073
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	/* PRESERVE takes no source buffer; COPYBACK requires one. */
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* Walk to the mbuf containing "off", extending the chain if allowed. */
	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* Zero-fill any gap before the write offset. */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW)) {
				panic("m_copyback0: read-only");
			}
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf. copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* Consume the overwritten read-only mbufs. */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
6281
6282 uint64_t
6283 mcl_to_paddr(char *addr)
6284 {
6285 vm_offset_t base_phys;
6286
6287 if (!MBUF_IN_MAP(addr)) {
6288 return 0;
6289 }
6290 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
6291
6292 if (base_phys == 0) {
6293 return 0;
6294 }
6295 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
6296 }
6297
6298 /*
6299 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
6300 * And really copy the thing. That way, we don't "precompute" checksums
6301 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
6302 * small packets, don't dup into a cluster. That way received packets
6303 * don't take up too much room in the sockbuf (cf. sbspace()).
6304 */
/* Count of m_dup() failures (allocation shortfalls). */
int MDFail;
6306
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR) {
		copyhdr = 1;
	}

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
					return NULL;
				}
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return n;
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL) {
				return NULL;
			}
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return n;
		}
	}
	/* General case: walk the chain, deep-copying each mbuf. */
	while (m != NULL) {
#if BLUE_DEBUG
		printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr) {
			n = _M_GETHDR(how, m->m_type);
		} else {
			n = _M_GET(how, m->m_type);
		}
		if (n == NULL) {
			goto nospace;
		}
		if (m->m_flags & M_EXT) {
			/* Match the source's cluster size for the copy. */
			if (m->m_len <= m_maxsize(MC_CL)) {
				MCLGET(n, how);
			} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
				n = m_mbigget(n, how);
			} else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
				n = m_m16kget(n, how);
			}
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		} else {
			VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
			    (copyhdr == 0 && m->m_len <= MLEN));
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT)) {
				n->m_data = n->m_pktdat;
			}
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	if (top == NULL) {
		MDFail++;
	}
	return top;

nospace:
	m_freem(top);
	MDFail++;
	return NULL;
}
6408
6409 #define MBUF_MULTIPAGES(m) \
6410 (((m)->m_flags & M_EXT) && \
6411 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
6412 && (m)->m_len > PAGE_SIZE) || \
6413 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
6414 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6415
/*
 * Split a single multi-page cluster mbuf into a chain of mbufs, each of
 * whose data regions lies within one page (all sharing the same cluster).
 * "*last" is set to the final mbuf of the new chain.  Returns NULL (and
 * frees everything built so far) on allocation failure.
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		/* Take at most one page, or up to the next page boundary. */
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		/* More data left: chain on another mbuf sharing the cluster. */
		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
6472
/*
 * Rewrite an mbuf chain so that no mbuf's data spans a page boundary,
 * expanding offending mbufs via m_expand().  Returns the (possibly new)
 * head of the chain; on allocation failure frees the entire chain and
 * returns NULL.  Bumps mb_normalized if any expansion occurred.
 */
struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		/* Detach the current mbuf from the remainder of the chain */
		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				/* Allocation failed: drop everything */
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			/* Splice the expanded sub-chain into the result */
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded) {
		atomic_add_32(&mb_normalized, 1);
	}
	return top;
}
6509
6510 /*
6511 * Append the specified data to the indicated mbuf chain,
6512 * Extend the mbuf chain if the new data does not fit in
6513 * existing space.
6514 *
6515 * Return 1 if able to complete the job; otherwise 0.
6516 */
6517 int
6518 m_append(struct mbuf *m0, int len, caddr_t cp)
6519 {
6520 struct mbuf *m, *n;
6521 int remainder, space;
6522
6523 for (m = m0; m->m_next != NULL; m = m->m_next) {
6524 ;
6525 }
6526 remainder = len;
6527 space = M_TRAILINGSPACE(m);
6528 if (space > 0) {
6529 /*
6530 * Copy into available space.
6531 */
6532 if (space > remainder) {
6533 space = remainder;
6534 }
6535 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6536 m->m_len += space;
6537 cp += space;
6538 remainder -= space;
6539 }
6540 while (remainder > 0) {
6541 /*
6542 * Allocate a new mbuf; could check space
6543 * and allocate a cluster instead.
6544 */
6545 n = m_get(M_WAITOK, m->m_type);
6546 if (n == NULL) {
6547 break;
6548 }
6549 n->m_len = min(MLEN, remainder);
6550 bcopy(cp, mtod(n, caddr_t), n->m_len);
6551 cp += n->m_len;
6552 remainder -= n->m_len;
6553 m->m_next = n;
6554 m = n;
6555 }
6556 if (m0->m_flags & M_PKTHDR) {
6557 m0->m_pkthdr.len += len - remainder;
6558 }
6559 return remainder == 0;
6560 }
6561
6562 struct mbuf *
6563 m_last(struct mbuf *m)
6564 {
6565 while (m->m_next != NULL) {
6566 m = m->m_next;
6567 }
6568 return m;
6569 }
6570
6571 unsigned int
6572 m_fixhdr(struct mbuf *m0)
6573 {
6574 u_int len;
6575
6576 VERIFY(m0->m_flags & M_PKTHDR);
6577
6578 len = m_length2(m0, NULL);
6579 m0->m_pkthdr.len = len;
6580 return len;
6581 }
6582
6583 unsigned int
6584 m_length2(struct mbuf *m0, struct mbuf **last)
6585 {
6586 struct mbuf *m;
6587 u_int len;
6588
6589 len = 0;
6590 for (m = m0; m != NULL; m = m->m_next) {
6591 len += m->m_len;
6592 if (m->m_next == NULL) {
6593 break;
6594 }
6595 }
6596 if (last != NULL) {
6597 *last = m;
6598 }
6599 return len;
6600 }
6601
6602 /*
6603 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6604 * and clusters. If allocation fails and this cannot be completed, NULL will
6605 * be returned, but the passed in chain will be unchanged. Upon success,
6606 * the original chain will be freed, and the new chain will be returned.
6607 *
6608 * If a non-packet header is passed in, the original mbuf (chain?) will
6609 * be returned unharmed.
6610 *
6611 * If offset is specfied, the first mbuf in the chain will have a leading
6612 * space of the amount stated by the "off" parameter.
6613 *
6614 * This routine requires that the m_pkthdr.header field of the original
6615 * mbuf chain is cleared by the caller.
6616 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	/* Non-pkthdr chains are returned untouched */
	if (!(m0->m_flags & M_PKTHDR)) {
		return m0;
	}

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	/* Pick a cluster-backed or plain header mbuf for the new chain */
	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN) {
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	} else {
		m_final = m_gethdr(how, MT_DATA);
	}

	if (m_final == NULL) {
		goto nospace;
	}

	/* Reserve the requested leading space in the first mbuf */
	if (off > 0) {
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0) {
		goto nospace;
	}

	m_new = m_final;

	/* Copy the packet into as few cluster-sized mbufs as possible */
	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES) {
			length = MCLBYTES;
		}
		/* First mbuf has off bytes less of usable space */
		length -= ((m_new == m_final) ? off : 0);
		if (length < 0) {
			goto nospace;
		}

		if (m_new == NULL) {
			if (length > MLEN) {
				m_new = m_getcl(how, MT_DATA, 0);
			} else {
				m_new = m_get(how, MT_DATA);
			}
			if (m_new == NULL) {
				goto nospace;
			}
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final) {
			m_cat(m_final, m_new);
		}
		m_new = NULL;
	}
	/* Success: free the original chain, hand back the compacted one */
	m_freem(m0);
	m0 = m_final;
	return m0;
nospace:
	if (m_final) {
		m_freem(m_final);
	}
	return NULL;
}
6697
/*
 * Defragment an mbuf chain with no leading offset; see
 * m_defrag_offset() for the full contract.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
6703
6704 void
6705 m_mchtype(struct mbuf *m, int t)
6706 {
6707 mtype_stat_inc(t);
6708 mtype_stat_dec(m->m_type);
6709 (m)->m_type = t;
6710 }
6711
/*
 * Function form of the MTOD() macro: return a pointer to the start of
 * the mbuf's data area.
 */
void *
m_mtod(struct mbuf *m)
{
	return MTOD(m, void *);
}
6717
/*
 * Map a pointer into mbuf storage back to its owning mbuf by rounding
 * down to an MSIZE boundary.  NOTE(review): this only makes sense when
 * x points inside the mbuf structure itself (not into an external
 * cluster) -- callers are presumed to guarantee that; confirm at call
 * sites.
 */
struct mbuf *
m_dtom(void *x)
{
	return (struct mbuf *)((uintptr_t)(x) & ~(MSIZE - 1));
}
6723
/*
 * Function form of _MCHECK(): sanity-check the mbuf's state (the audit
 * machinery panics if a freed mbuf is not MT_FREE; see
 * mcl_audit_mcheck_panic()).
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
6729
6730 /*
6731 * Return a pointer to mbuf/offset of location in mbuf chain.
6732 */
6733 struct mbuf *
6734 m_getptr(struct mbuf *m, int loc, int *off)
6735 {
6736 while (loc >= 0) {
6737 /* Normal end of search. */
6738 if (m->m_len > loc) {
6739 *off = loc;
6740 return m;
6741 } else {
6742 loc -= m->m_len;
6743 if (m->m_next == NULL) {
6744 if (loc == 0) {
6745 /* Point at the end of valid data. */
6746 *off = m->m_len;
6747 return m;
6748 }
6749 return NULL;
6750 }
6751 m = m->m_next;
6752 }
6753 }
6754 return NULL;
6755 }
6756
6757 /*
6758 * Inform the corresponding mcache(s) that there's a waiter below.
6759 */
6760 static void
6761 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6762 {
6763 mcache_waiter_inc(m_cache(class));
6764 if (comp) {
6765 if (class == MC_CL) {
6766 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6767 } else if (class == MC_BIGCL) {
6768 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6769 } else if (class == MC_16KCL) {
6770 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6771 } else {
6772 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6773 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6774 }
6775 }
6776 }
6777
6778 /*
6779 * Inform the corresponding mcache(s) that there's no more waiter below.
6780 */
6781 static void
6782 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6783 {
6784 mcache_waiter_dec(m_cache(class));
6785 if (comp) {
6786 if (class == MC_CL) {
6787 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6788 } else if (class == MC_BIGCL) {
6789 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6790 } else if (class == MC_16KCL) {
6791 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6792 } else {
6793 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6794 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6795 }
6796 }
6797 }
6798
/*
 * True while the socket-defuncting thread call is in flight; protected
 * by mbuf_mlock (see mbuf_watchdog()).  Suppresses the watchdog panic
 * while defuncting is underway.
 */
static bool mbuf_watchdog_defunct_active = false;
6800
6801 static uint32_t
6802 mbuf_watchdog_socket_space(struct socket *so)
6803 {
6804 uint32_t space = 0;
6805
6806 if (so == NULL) {
6807 return 0;
6808 }
6809
6810 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
6811
6812 #if INET
6813 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6814 SOCK_PROTO(so) == IPPROTO_TCP) {
6815 space += tcp_reass_qlen_space(so);
6816 }
6817 #endif /* INET */
6818
6819 return space;
6820 }
6821
/*
 * Accumulator passed through proc_iterate() by the mbuf watchdog while
 * scanning all processes for the biggest socket-mbuf consumer.
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;        /* biggest consumer so far (holds a proc ref) */
	uint32_t top_app_space_used; /* its usage per mbuf_watchdog_socket_space() */
	bool non_blocking;           /* trylock fd tables; skip contended procs */
};
6827
/*
 * Attempt to take a process's file-descriptor table lock without
 * blocking; returns true if the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
6833
/*
 * proc_iterate() callback: sum the socket mbuf space used by process p
 * and track the biggest consumer in *args.  Returns PROC_CLAIMED
 * (retaining the iterator's reference on p) when p becomes the new top
 * consumer, PROC_RETURNED otherwise.
 */
static int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so = NULL;

		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* Drop the reference held on the previous top consumer */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
6884
/* Returns the printable name of a process; used for the os_log below. */
extern char *proc_name_address(void *p);
6886
/*
 * Thread-call handler armed by mbuf_watchdog(): find the process using
 * the most socket mbuf space and defunct all of its sockets to relieve
 * mbuf exhaustion.  Clears mbuf_watchdog_defunct_active on completion.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			socket_lock(so, 0);
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Release the reference taken by the iterator (PROC_CLAIMED) */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
	}
	mbuf_watchdog_defunct_active = false;
}
6934
6935 /*
6936 * Called during slab (blocking and non-blocking) allocation. If there
6937 * is at least one waiter, and the time since the first waiter is blocked
6938 * is greater than the watchdog timeout, panic the system.
6939 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do if no one is waiting or the watchdog is disabled */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Seconds since the first waiter armed the timer (mb_wdtstart) */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
6995
6996 /*
6997 * Called during blocking allocation. Returns TRUE if one or more objects
6998 * are available at the per-CPU caches layer and that allocation should be
6999 * retried at that level.
7000 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing? Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* Caller wants to retry allocation rather than sleep */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Sleeping is not allowed for MCR_NOSLEEP callers */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer. Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Ask the worker thread to grow this class while we sleep */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	/* Drops and re-takes mbuf_mlock across the sleep */
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
7074
/*
 * Main loop of the mbuf worker kernel thread: reclaims memory when the
 * cluster budget is exhausted, grows each class (MC_CL, MC_BIGCL,
 * MC_16KCL) on demand as recorded in m_region_expand() by sleeping
 * allocators, then balances the mbuf count against the cluster count
 * before blocking until the next wakeup.  Never returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Grow the 2KB cluster class if allocators asked for it */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Same for the 4KB cluster class */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Same for the 16KB jumbo cluster class */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		/* Re-arm the wakeup channel before blocking */
		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated. Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
7207
/*
 * Entry point for the mbuf worker kernel thread: mark the worker ready
 * (so mbuf_sleep() will wake it) and enter its main loop, which never
 * returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
7215
/*
 * Return the slab descriptor for buffer buf, allocating (permanently)
 * a new slab group when this is the first buffer seen in its
 * MBSHIFT-sized region.  Called with mbuf_mlock held; may drop and
 * retake the lock around the allocation, which is safe because slab
 * expansion is serialized by mb_clalloc_busy.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of the page's slab within its group */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
7279
/*
 * Initialize a slab descriptor with the given class, flags, backing
 * range [base, base + len), freelist head, reference count and chunk
 * count, leaving it in the detached (off-freelist) state.
 */
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;
	sp->sl_flags = flags;
	sp->sl_base = base;
	sp->sl_head = head;
	sp->sl_len = len;
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks;
	slab_detach(sp);
}
7293
/*
 * Attach a slab to its class freelist: bump the per-class slab count,
 * append to the slab list and clear the detached flag.
 */
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;

	/*
	 * If a buffer spans multiple contiguous pages, clear the
	 * detached flag on the trailing slabs as well (only the head
	 * slab is linked into the freelist).
	 */
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL && slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
7316
/*
 * Detach a slab from its class freelist (inverse of slab_insert());
 * for 16KB buffers also detach the trailing per-page slabs.
 */
static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	int k;
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
7336
7337 static boolean_t
7338 slab_inrange(mcl_slab_t *sp, void *buf)
7339 {
7340 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
7341 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
7342 }
7343
/*
 * NOTE(review): panic appears to be macro-wrapped earlier in this file;
 * undo that here so the diagnostics below call the real panic().
 * Confirm against the top of the file.
 */
#undef panic

/*
 * Diagnostic helper: scan every chunk in the slab for a freelist next
 * pointer equal to addr and validate it -- panicking with details when
 * the pointer is out of the mbuf map (or deferring to the audit
 * pattern verification when mclverify is enabled).
 */
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
7374
/*
 * Mark a slab as detached from its freelist by poisoning the tailq
 * linkage (-1 sentinels) and setting SLF_DETACHED.
 */
static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}
7382
/*
 * A slab is detached when both tailq links carry the -1 poison set by
 * slab_detach() and the SLF_DETACHED flag is present.
 */
static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
	return (intptr_t)sp->sl_link.tqe_next == -1 &&
	       (intptr_t)sp->sl_link.tqe_prev == -1 &&
	       (sp->sl_flags & SLF_DETACHED);
}
7390
/*
 * Set up audit structures for each of the num objects in the page
 * containing buf, consuming entries from *mca_list (and, when
 * con_list is non-NULL, a saved-contents buffer per object from
 * *con_list).  Both list heads are advanced past the consumed entries.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Reset the audit record, preserving its list linkage */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Return the unconsumed remainder, terminating what we took */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
7449
/*
 * Tear down the audit state for the page containing buf: release any
 * saved-contents buffers for the first num objects and return the
 * audit structures (chained from cl_audit[0]) to their cache.
 */
static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	if (mclaudit[ix].cl_audit[0] != NULL) {
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents) {
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
			}
		}
		/* The records remain chained via mca_next; free them all */
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}
7473
7474 /*
7475 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
7476 * the corresponding audit structure for that buffer.
7477 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOPG(mobj), m_idx = 0;
	unsigned char *page_addr;

	VERIFY(ix < maxclaudit);
	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));

	page_addr = PGTOM(ix);

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page. Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		m_idx = MBPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NMBPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		m_idx = CLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_BIGCL:
		/*
		 * Same as above, but for 4KB clusters in a page.
		 */
		m_idx = BCLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NBCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return mca;
}
7532
/*
 * Audit hook for mbuf allocation and free.  On free (!alloc): save the
 * constructed mbuf fields into the audit record and, with mclverify,
 * fill the mbuf with the free pattern.  On alloc: verify the pattern
 * is intact (detecting modify-after-free) and restore the saved
 * fields.
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Re-store obj_next, clobbered by the pattern fill above */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
7565
/*
 * Restore the mbuf fields saved at free time.  For composite mbufs the
 * entire saved region is copied back (preserving the current m_next);
 * for plain mbufs only the type field needs restoring.
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache. Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}
7594
/*
 * Save the constructed portion of an mbuf into its audit record so it
 * can be restored by mcl_audit_restore_mbuf() on the next allocation.
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
7602
/*
 * Audit hook for cluster (2K/4K/16K) allocation and free; mirrors
 * mcl_audit_mbuf() except that clusters have no constructed fields to
 * save beyond the freelist linkage.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Re-store obj_next, clobbered by the pattern fill */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
7623
/*
 * Record the current thread, backtrace and timestamp in the audit
 * scratch area, rotating the previous record into the msa_p* fields so
 * the last two transactions are always available.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Rotate current record into the "previous" slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Skip the first frame (this function) when saving the backtrace */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
7650
/*
 * Panic because a supposedly-freed mbuf does not carry MT_FREE, dumping
 * the mbuf's audit record (allocation/free history) into the message.
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
7665
/*
 * Panic because a freed buffer's obj_next linkage was modified while on
 * the freelist (it points outside [mbutl, embutl)); include the buffer's
 * audit record in the message.
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
7676
7677 static void
7678 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7679 {
7680 if (next != NULL && !MBUF_IN_MAP(next) &&
7681 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7682 mcl_audit_verify_nextptr_panic(next, mca);
7683 }
7684 }
7685
/*
 * Word-sized integer mixing function used to spread addresses and
 * backtrace sums across the leak-detection hash buckets.  Two variants:
 * one tuned for 32-bit and one for 64-bit pointer widths.  All
 * arithmetic is on an unsigned type, so the wraparound is well-defined.
 */
static uintptr_t
hash_mix(uintptr_t x)
{
	uintptr_t h = x;

#ifndef __LP64__
	h = h + ~(h << 15);
	h = h ^ (h >> 10);
	h = h + (h << 3);
	h = h ^ (h >> 6);
	h = h + ~(h << 11);
	h = h ^ (h >> 16);
#else
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
#endif
	return h;
}
7708
/*
 * Hash a backtrace into a bucket index in [0, max_size).  The frames are
 * summed (order-insensitive) and then mixed; max_size must be a power of
 * two for the mask to be valid.
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	const uintptr_t mask = max_size - 1;
	uintptr_t sum = 0;
	uintptr_t bucket;
	uint32_t i;

	for (i = depth; i > 0; i--) {
		sum += bt[i - 1];
	}

	bucket = hash_mix(sum) & mask;

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
7725
/*
 * Hash a pointer value into a bucket index in [0, max_size); max_size
 * must be a power of two.
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	const uintptr_t mask = max_size - 1;
	const uintptr_t bucket = hash_mix(pt) & mask;

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
7738
/*
 * Turn on mbuf leak detection: read the sampling-factor boot-arg and
 * permanently allocate the allocation/trace hash tables plus the
 * exported statistics buffer.  A sample factor of 0 disables detection.
 */
static void
mleak_activate(void)
{
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	/* A zero factor would divide-by-zero in mleak_logger(); disable */
	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	/* Permanent allocations: these tables live for the boot session */
	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
7770
7771 static void
7772 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7773 {
7774 int temp;
7775
7776 if (mclfindleak == 0) {
7777 return;
7778 }
7779
7780 if (!alloc) {
7781 return mleak_free(addr);
7782 }
7783
7784 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7785
7786 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7787 uintptr_t bt[MLEAK_STACK_DEPTH];
7788 unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
7789 mleak_log(bt, addr, logged, num);
7790 }
7791 }
7792
7793 /*
7794 * This function records the allocation in the mleak_allocations table
7795 * and the backtrace in the mleak_traces table; if allocation slot is in use,
7796 * replace old allocation with new one if the trace slot is in use, return
7797 * (or increment refcount if same trace).
7798 */
7799 static boolean_t
7800 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7801 {
7802 struct mallocation *allocation;
7803 struct mtrace *trace;
7804 uint32_t trace_index;
7805
7806 /* Quit if someone else modifying the tables */
7807 if (!lck_mtx_try_lock_spin(mleak_lock)) {
7808 mleak_table.total_conflicts++;
7809 return FALSE;
7810 }
7811
7812 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7813 mleak_alloc_buckets)];
7814 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7815 trace = &mleak_traces[trace_index];
7816
7817 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7818 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7819
7820 allocation->hitcount++;
7821 trace->hitcount++;
7822
7823 /*
7824 * If the allocation bucket we want is occupied
7825 * and the occupier has the same trace, just bail.
7826 */
7827 if (allocation->element != NULL &&
7828 trace_index == allocation->trace_index) {
7829 mleak_table.alloc_collisions++;
7830 lck_mtx_unlock(mleak_lock);
7831 return TRUE;
7832 }
7833
7834 /*
7835 * Store the backtrace in the traces array;
7836 * Size of zero = trace bucket is free.
7837 */
7838 if (trace->allocs > 0 &&
7839 bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
7840 /* Different, unique trace, but the same hash! Bail out. */
7841 trace->collisions++;
7842 mleak_table.trace_collisions++;
7843 lck_mtx_unlock(mleak_lock);
7844 return TRUE;
7845 } else if (trace->allocs > 0) {
7846 /* Same trace, already added, so increment refcount */
7847 trace->allocs++;
7848 } else {
7849 /* Found an unused trace bucket, so record the trace here */
7850 if (trace->depth != 0) {
7851 /* this slot previously used but not currently in use */
7852 mleak_table.trace_overwrites++;
7853 }
7854 mleak_table.trace_recorded++;
7855 trace->allocs = 1;
7856 memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
7857 trace->depth = depth;
7858 trace->collisions = 0;
7859 }
7860
7861 /* Step 2: Store the allocation record in the allocations array */
7862 if (allocation->element != NULL) {
7863 /*
7864 * Replace an existing allocation. No need to preserve
7865 * because only a subset of the allocations are being
7866 * recorded anyway.
7867 */
7868 mleak_table.alloc_collisions++;
7869 } else if (allocation->trace_index != 0) {
7870 mleak_table.alloc_overwrites++;
7871 }
7872 allocation->element = addr;
7873 allocation->trace_index = trace_index;
7874 allocation->count = num;
7875 mleak_table.alloc_recorded++;
7876 mleak_table.outstanding_allocs++;
7877
7878 lck_mtx_unlock(mleak_lock);
7879 return TRUE;
7880 }
7881
/*
 * Remove leak-tracking state for every object in the chain rooted at
 * `addr'.  Each bucket is probed without the lock first; the same
 * condition is re-checked under mleak_lock before any modification
 * (check / lock / re-check), so untracked objects skip the lock.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		/* Unlocked peek; confirmed again below under the lock */
		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
7912
7913 static void
7914 mleak_sort_traces()
7915 {
7916 int i, j, k;
7917 struct mtrace *swap;
7918
7919 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7920 mleak_top_trace[i] = NULL;
7921 }
7922
7923 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
7924 if (mleak_traces[i].allocs <= 0) {
7925 continue;
7926 }
7927
7928 mleak_top_trace[j] = &mleak_traces[i];
7929 for (k = j; k > 0; k--) {
7930 if (mleak_top_trace[k]->allocs <=
7931 mleak_top_trace[k - 1]->allocs) {
7932 break;
7933 }
7934
7935 swap = mleak_top_trace[k - 1];
7936 mleak_top_trace[k - 1] = mleak_top_trace[k];
7937 mleak_top_trace[k] = swap;
7938 }
7939 j++;
7940 }
7941
7942 j--;
7943 for (; i < mleak_trace_buckets; i++) {
7944 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
7945 continue;
7946 }
7947
7948 mleak_top_trace[j] = &mleak_traces[i];
7949
7950 for (k = j; k > 0; k--) {
7951 if (mleak_top_trace[k]->allocs <=
7952 mleak_top_trace[k - 1]->allocs) {
7953 break;
7954 }
7955
7956 swap = mleak_top_trace[k - 1];
7957 mleak_top_trace[k - 1] = mleak_top_trace[k];
7958 mleak_top_trace[k] = swap;
7959 }
7960 }
7961 }
7962
7963 static void
7964 mleak_update_stats()
7965 {
7966 mleak_trace_stat_t *mltr;
7967 int i;
7968
7969 VERIFY(mleak_stat != NULL);
7970 #ifdef __LP64__
7971 VERIFY(mleak_stat->ml_isaddr64);
7972 #else
7973 VERIFY(!mleak_stat->ml_isaddr64);
7974 #endif /* !__LP64__ */
7975 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7976
7977 mleak_sort_traces();
7978
7979 mltr = &mleak_stat->ml_trace[0];
7980 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
7981 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7982 int j;
7983
7984 if (mleak_top_trace[i] == NULL ||
7985 mleak_top_trace[i]->allocs == 0) {
7986 continue;
7987 }
7988
7989 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
7990 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
7991 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
7992 mltr->mltr_depth = mleak_top_trace[i]->depth;
7993
7994 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7995 for (j = 0; j < mltr->mltr_depth; j++) {
7996 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7997 }
7998
7999 mltr++;
8000 }
8001 }
8002
/*
 * Human-readable names for the mbuf MT_* types, used by mbuf_dump() to
 * label the per-type usage counts.
 */
static struct mbtypes {
	int mt_type;            /* MT_* constant */
	const char *mt_name;    /* description printed for this type */
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }             /* list terminator */
};
8024
/*
 * Advance the mbuf_dump() cursor past the `k' bytes just formatted and
 * bail out to the `done' label once the buffer is exhausted.  Wrapped in
 * do { } while (0) so the expansion is a single statement and composes
 * safely with if/else (CERT PRE10-C).
 */
#define MBUF_DUMP_BUF_CHK() do {                                        \
	clen -= k;                                                      \
	if (clen < 1) {                                                 \
		goto done;                                              \
	}                                                               \
	c += k;                                                         \
} while (0)
8031
8032 static char *
8033 mbuf_dump(void)
8034 {
8035 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
8036 totreturned = 0;
8037 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
8038 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
8039 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
8040 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
8041 uint8_t seen[256];
8042 struct mbtypes *mp;
8043 mb_class_stat_t *sp;
8044 mleak_trace_stat_t *mltr;
8045 char *c = mbuf_dump_buf;
8046 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
8047 struct mbuf_watchdog_defunct_args args = {};
8048
8049 mbuf_dump_buf[0] = '\0';
8050
8051 /* synchronize all statistics in the mbuf table */
8052 mbuf_stat_sync();
8053 mbuf_mtypes_sync(TRUE);
8054
8055 sp = &mb_stat->mbs_class[0];
8056 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
8057 u_int32_t mem;
8058
8059 if (m_class(i) == MC_MBUF) {
8060 m_mbufs = sp->mbcl_active;
8061 } else if (m_class(i) == MC_CL) {
8062 m_clfree = sp->mbcl_total - sp->mbcl_active;
8063 } else if (m_class(i) == MC_BIGCL) {
8064 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
8065 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
8066 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
8067 m_16kclusters = sp->mbcl_total;
8068 } else if (m_class(i) == MC_MBUF_CL) {
8069 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
8070 } else if (m_class(i) == MC_MBUF_BIGCL) {
8071 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
8072 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
8073 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
8074 }
8075
8076 mem = sp->mbcl_ctotal * sp->mbcl_size;
8077 totmem += mem;
8078 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
8079 sp->mbcl_size;
8080 totreturned += sp->mbcl_release_cnt;
8081 }
8082
8083 /* adjust free counts to include composite caches */
8084 m_clfree += m_mbufclfree;
8085 m_bigclfree += m_mbufbigclfree;
8086 m_16kclfree += m_mbuf16kclfree;
8087
8088 totmbufs = 0;
8089 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8090 totmbufs += mbstat.m_mtypes[mp->mt_type];
8091 }
8092 if (totmbufs > m_mbufs) {
8093 totmbufs = m_mbufs;
8094 }
8095 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
8096 MBUF_DUMP_BUF_CHK();
8097
8098 bzero(&seen, sizeof(seen));
8099 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
8100 if (mbstat.m_mtypes[mp->mt_type] != 0) {
8101 seen[mp->mt_type] = 1;
8102 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
8103 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
8104 MBUF_DUMP_BUF_CHK();
8105 }
8106 }
8107 seen[MT_FREE] = 1;
8108 for (i = 0; i < nmbtypes; i++) {
8109 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
8110 k = scnprintf(c, clen, "\t%u mbufs allocated to "
8111 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
8112 MBUF_DUMP_BUF_CHK();
8113 }
8114 }
8115 if ((m_mbufs - totmbufs) > 0) {
8116 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
8117 m_mbufs - totmbufs);
8118 MBUF_DUMP_BUF_CHK();
8119 }
8120 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
8121 "%u/%u mbuf 4KB clusters in use\n",
8122 (unsigned int)(mbstat.m_clusters - m_clfree),
8123 (unsigned int)mbstat.m_clusters,
8124 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
8125 (unsigned int)mbstat.m_bigclusters);
8126 MBUF_DUMP_BUF_CHK();
8127
8128 if (njcl > 0) {
8129 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
8130 m_16kclusters - m_16kclfree, m_16kclusters,
8131 njclbytes / 1024);
8132 MBUF_DUMP_BUF_CHK();
8133 }
8134 totused = totmem - totfree;
8135 if (totmem == 0) {
8136 totpct = 0;
8137 } else if (totused < (ULONG_MAX / 100)) {
8138 totpct = (totused * 100) / totmem;
8139 } else {
8140 u_long totmem1 = totmem / 100;
8141 u_long totused1 = totused / 100;
8142 totpct = (totused1 * 100) / totmem1;
8143 }
8144 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
8145 "in use)\n", totmem / 1024, totpct);
8146 MBUF_DUMP_BUF_CHK();
8147 k = scnprintf(c, clen, "%lu KB returned to the system\n",
8148 totreturned / 1024);
8149 MBUF_DUMP_BUF_CHK();
8150
8151 net_update_uptime();
8152
8153 k = scnprintf(c, clen,
8154 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
8155 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
8156 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
8157 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
8158 mb_expand_16kcl_total);
8159 MBUF_DUMP_BUF_CHK();
8160 if (mbuf_worker_last_runtime != 0) {
8161 k = scnprintf(c, clen, "worker thread last run time: "
8162 "%llu (%llu seconds ago)\n",
8163 mbuf_worker_last_runtime,
8164 net_uptime() - mbuf_worker_last_runtime);
8165 MBUF_DUMP_BUF_CHK();
8166 }
8167 if (mbuf_drain_last_runtime != 0) {
8168 k = scnprintf(c, clen, "drain routine last run time: "
8169 "%llu (%llu seconds ago)\n",
8170 mbuf_drain_last_runtime,
8171 net_uptime() - mbuf_drain_last_runtime);
8172 MBUF_DUMP_BUF_CHK();
8173 }
8174
8175 /*
8176 * Log where the most mbufs have accumulated:
8177 * - Process socket buffers
8178 * - TCP reassembly queue
8179 * - Interface AQM queue (output) and DLIL input queue
8180 */
8181 args.non_blocking = true;
8182 proc_iterate(PROC_ALLPROCLIST,
8183 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
8184 if (args.top_app != NULL) {
8185 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
8186 args.top_app_space_used,
8187 proc_name_address(args.top_app),
8188 proc_pid(args.top_app));
8189 proc_rele(args.top_app);
8190 }
8191 MBUF_DUMP_BUF_CHK();
8192
8193 #if INET
8194 k = dump_tcp_reass_qlen(c, clen);
8195 MBUF_DUMP_BUF_CHK();
8196 #endif /* INET */
8197
8198 #if MPTCP
8199 k = dump_mptcp_reass_qlen(c, clen);
8200 MBUF_DUMP_BUF_CHK();
8201 #endif /* MPTCP */
8202
8203 #if NETWORKING
8204 k = dlil_dump_top_if_qlen(c, clen);
8205 MBUF_DUMP_BUF_CHK();
8206 #endif /* NETWORKING */
8207
8208 /* mbuf leak detection statistics */
8209 mleak_update_stats();
8210
8211 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
8212 MBUF_DUMP_BUF_CHK();
8213 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
8214 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
8215 mleak_table.mleak_sample_factor);
8216 MBUF_DUMP_BUF_CHK();
8217 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
8218 mleak_table.outstanding_allocs);
8219 MBUF_DUMP_BUF_CHK();
8220 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
8221 mleak_table.alloc_recorded, mleak_table.trace_recorded);
8222 MBUF_DUMP_BUF_CHK();
8223 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
8224 mleak_table.alloc_collisions, mleak_table.trace_collisions);
8225 MBUF_DUMP_BUF_CHK();
8226 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
8227 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
8228 MBUF_DUMP_BUF_CHK();
8229 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
8230 mleak_table.total_conflicts);
8231 MBUF_DUMP_BUF_CHK();
8232
8233 k = scnprintf(c, clen, "top %d outstanding traces:\n",
8234 mleak_stat->ml_cnt);
8235 MBUF_DUMP_BUF_CHK();
8236 for (i = 0; i < mleak_stat->ml_cnt; i++) {
8237 mltr = &mleak_stat->ml_trace[i];
8238 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
8239 "%llu hit(s), %llu collision(s)\n", (i + 1),
8240 mltr->mltr_allocs, mltr->mltr_hitcount,
8241 mltr->mltr_collisions);
8242 MBUF_DUMP_BUF_CHK();
8243 }
8244
8245 if (mleak_stat->ml_isaddr64) {
8246 k = scnprintf(c, clen, MB_LEAK_HDR_64);
8247 } else {
8248 k = scnprintf(c, clen, MB_LEAK_HDR_32);
8249 }
8250 MBUF_DUMP_BUF_CHK();
8251
8252 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
8253 k = scnprintf(c, clen, "%2d: ", (i + 1));
8254 MBUF_DUMP_BUF_CHK();
8255 for (j = 0; j < mleak_stat->ml_cnt; j++) {
8256 mltr = &mleak_stat->ml_trace[j];
8257 if (i < mltr->mltr_depth) {
8258 if (mleak_stat->ml_isaddr64) {
8259 k = scnprintf(c, clen, "0x%0llx ",
8260 (uint64_t)VM_KERNEL_UNSLIDE(
8261 mltr->mltr_addr[i]));
8262 } else {
8263 k = scnprintf(c, clen,
8264 "0x%08x ",
8265 (uint32_t)VM_KERNEL_UNSLIDE(
8266 mltr->mltr_addr[i]));
8267 }
8268 } else {
8269 if (mleak_stat->ml_isaddr64) {
8270 k = scnprintf(c, clen,
8271 MB_LEAK_SPACING_64);
8272 } else {
8273 k = scnprintf(c, clen,
8274 MB_LEAK_SPACING_32);
8275 }
8276 }
8277 MBUF_DUMP_BUF_CHK();
8278 }
8279 k = scnprintf(c, clen, "\n");
8280 MBUF_DUMP_BUF_CHK();
8281 }
8282
8283 done:
8284 return mbuf_dump_buf;
8285 }
8286
8287 #undef MBUF_DUMP_BUF_CHK
8288
8289 /*
8290 * Convert between a regular and a packet header mbuf. Caller is responsible
8291 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
8292 */
8293 int
8294 m_reinit(struct mbuf *m, int hdr)
8295 {
8296 int ret = 0;
8297
8298 if (hdr) {
8299 VERIFY(!(m->m_flags & M_PKTHDR));
8300 if (!(m->m_flags & M_EXT) &&
8301 (m->m_data != m->m_dat || m->m_len > 0)) {
8302 /*
8303 * If there's no external cluster attached and the
8304 * mbuf appears to contain user data, we cannot
8305 * safely convert this to a packet header mbuf,
8306 * as the packet header structure might overlap
8307 * with the data.
8308 */
8309 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
8310 "m_data %llx (expected %llx), "
8311 "m_len %d (expected 0)\n",
8312 __func__,
8313 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
8314 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
8315 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
8316 ret = EBUSY;
8317 } else {
8318 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
8319 m->m_flags |= M_PKTHDR;
8320 MBUF_INIT_PKTHDR(m);
8321 }
8322 } else {
8323 /* Check for scratch area overflow */
8324 m_redzone_verify(m);
8325 /* Free the aux data and tags if there is any */
8326 m_tag_delete_chain(m, NULL);
8327 m_do_tx_compl_callback(m, NULL);
8328 m->m_flags &= ~M_PKTHDR;
8329 }
8330
8331 return ret;
8332 }
8333
/*
 * Atomically update the cluster's private property word from `o' to `n';
 * the return value is whatever atomic_test_set_32() reports (presumably
 * whether the swap took effect — confirm against its definition).
 * The mbuf must have an external cluster attached (M_EXT).
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return atomic_test_set_32(&MEXT_PRIV(m), o, n);
}
8340
/*
 * Return the cluster's private property word (set via m_ext_set_prop()).
 * The mbuf must have an external cluster attached (M_EXT).
 */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
8347
/*
 * Report whether a paired mbuf's cluster is currently activated, i.e.
 * its pairing refcount has been raised above the minimum.  Non-paired
 * mbufs are always considered active.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
8353
/*
 * Reinitialize a paired mbuf+cluster for reuse: the mbuf header is
 * reset while the cluster, its free routine, its size and its ext_ref
 * are all preserved and re-attached via MEXT_INIT().
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	/* Must be paired and currently quiescent (refs at minimum) */
	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Save everything MBUF_INIT() below would otherwise wipe out */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time. Similar to
	 * what's done in m_clattach() for the cluster. Bump
	 * up MEXT_PREF to indicate activation.
	 */
	MBUF_INIT(m, hdr, type);
	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
8386
/*
 * Zero the module-private scratch area in the packet header.  Panics if
 * the area is currently guarded (PKTF_PRIV_GUARDED), i.e. owned by an
 * in-kernel module that has not relinquished it.
 */
void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
8403
8404 /*
8405 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
8406 * xnu that intend on utilizing the module-private area should directly
8407 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
8408 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
8409 * to handing it off to another module, respectively.
8410 */
/*
 * Return a pointer to (and the size of) the packet header's
 * module-private scratch area.  Panics if the area is currently guarded
 * (PKTF_PRIV_GUARDED); see the block comment above for usage rules.
 */
u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	/* When tracing, record this access in the mbuf's audit record */
	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID) {
			mcl_audit_scratch(mca);
		}
		lck_mtx_unlock(mbuf_mlock);
	}

	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
8439
/*
 * OR a debugging breadcrumb bit into the packet header's pkt_crumbs
 * field, marking that the packet passed through a given code path.
 */
void
m_add_crumb(struct mbuf *m, uint16_t crumb)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_crumbs |= crumb;
}
8447
/*
 * Stamp the packet header's redzone field, to be checked later by
 * m_redzone_verify() for scratch-area overflows.
 */
static void
m_redzone_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/*
	 * Each mbuf has a unique red zone pattern, which is a XOR
	 * of the red zone cookie and the address of the mbuf.
	 */
	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
}
8458
/*
 * Recompute the expected redzone value (see m_redzone_init()) and panic
 * if the stored one differs, which indicates the scratch area overflowed
 * into the redzone.
 */
static void
m_redzone_verify(struct mbuf *m)
{
	u_int32_t mb_redzone;

	VERIFY(m->m_flags & M_PKTHDR);

	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	if (m->m_pkthdr.redzone != mb_redzone) {
		panic("mbuf %p redzone violation with value 0x%x "
		    "(instead of 0x%x, using cookie 0x%x)\n",
		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
		/* NOTREACHED */
	}
}
8474
/*
 * Store the ext_ref, free routine and free argument into the mbuf's
 * m_ext, obscuring each pointer by XOR-ing it with a cookie: the per-rfa
 * ext_token when an rfa is supplied, or the global mb_obscure_extfree
 * otherwise.  m_get_rfa() / m_get_ext_free() / m_get_ext_arg() perform
 * the reverse transformation.
 */
__private_extern__ inline void
m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
    caddr_t ext_arg)
{
	VERIFY(m->m_flags & M_EXT);
	if (rfa != NULL) {
		m->m_ext.ext_refflags =
		    (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
		if (ext_free != NULL) {
			/* Per-rfa token derived from its own address */
			rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
			    mb_obscure_extfree;
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			rfa->ext_token = 0;
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
	} else {
		/*
		 * If we are going to lose the cookie in ext_token by
		 * resetting the rfa, we should use the global cookie
		 * to obscure the ext_free and ext_arg pointers.
		 */
		if (ext_free != NULL) {
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)((uintptr_t)ext_arg ^
				    mb_obscure_extfree);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
		m->m_ext.ext_refflags = NULL;
	}
}
8522
8523 __private_extern__ inline struct ext_ref *
8524 m_get_rfa(struct mbuf *m)
8525 {
8526 if (m->m_ext.ext_refflags == NULL) {
8527 return NULL;
8528 } else {
8529 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
8530 }
8531 }
8532
/*
 * Return the de-obscured cluster free routine stored by m_set_ext(), or
 * NULL if none.  The XOR cookie is the rfa's ext_token when an rfa is
 * attached, or the global mb_obscure_extfree otherwise — mirroring the
 * two branches of m_set_ext().
 */
__private_extern__ inline m_ext_free_func_t
m_get_ext_free(struct mbuf *m)
{
	struct ext_ref *rfa;
	if (m->m_ext.ext_free == NULL) {
		return NULL;
	}

	rfa = m_get_rfa(m);
	if (rfa == NULL) {
		uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
		return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	} else {
		uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
		return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
	}
}
8550
/*
 * Return the de-obscured free-routine argument stored by m_set_ext(), or
 * NULL if none.  Cookie selection matches m_get_ext_free(): the rfa's
 * ext_token when present, the global mb_obscure_extfree otherwise.
 */
__private_extern__ inline caddr_t
m_get_ext_arg(struct mbuf *m)
{
	struct ext_ref *rfa;
	if (m->m_ext.ext_arg == NULL) {
		return NULL;
	}

	rfa = m_get_rfa(m);
	if (rfa == NULL) {
		return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
	} else {
		return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
		       rfa->ext_token);
	}
}
8567
8568 /*
8569 * Send a report of mbuf usage if the usage is at least 6% of max limit
8570 * or if there has been at least 3% increase since the last report.
8571 *
8572 * The values 6% and 3% are chosen so that we can do simple arithmetic
8573 * with shift operations.
8574 */
/*
 * Decide whether a new peak-usage report should be generated for class
 * `cl'.  Returns TRUE if a report is already pending, or if usage has
 * reached a new peak of at least 1/16 of the class limit with a growth
 * of at least 1/32 over the previous peak.
 */
static boolean_t
mbuf_report_usage(mbuf_class_t cl)
{
	/* if a report is already in progress, nothing to do */
	if (mb_peak_newreport) {
		return TRUE;
	}

	/* new peak, >= 1/16 of maxlimit, grown by >= 1/32 of old peak */
	if (m_total(cl) > m_peak(cl) &&
	    m_total(cl) >= (m_maxlimit(cl) >> 4) &&
	    (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) {
		return TRUE;
	}
	return FALSE;
}
8590
/*
 * If a peak-usage report is due (see mbuf_report_usage()), snapshot the
 * per-class peaks and push them out through nstat_sysinfo_send_data().
 * Also forces one report after the first week of uptime.
 */
__private_extern__ void
mbuf_report_peak_usage(void)
{
	int i = 0;
	u_int64_t uptime;
	struct nstat_sysinfo_data ns_data;
	uint32_t memreleased = 0;
	static uint32_t prevmemreleased;

	uptime = net_uptime();
	lck_mtx_lock(mbuf_mlock);

	/* Generate an initial report after 1 week of uptime */
	if (!mb_peak_firstreport &&
	    uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
		mb_peak_newreport = TRUE;
		mb_peak_firstreport = TRUE;
	}

	if (!mb_peak_newreport) {
		lck_mtx_unlock(mbuf_mlock);
		return;
	}

	/*
	 * Since a report is being generated before 1 week,
	 * we do not need to force another one later
	 */
	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
		mb_peak_firstreport = TRUE;
	}

	/* Latch the current totals as the new per-class peaks */
	for (i = 0; i < NELEM(mbuf_table); i++) {
		m_peak(m_class(i)) = m_total(m_class(i));
		memreleased += m_release_cnt(i);
	}
	/* Report only the delta released since the previous report */
	memreleased = memreleased - prevmemreleased;
	prevmemreleased = memreleased;
	mb_peak_newreport = FALSE;
	lck_mtx_unlock(mbuf_mlock);

	bzero(&ns_data, sizeof(ns_data));
	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
	ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
	ns_data.u.mb_stats.memreleased = memreleased;
	ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;

	nstat_sysinfo_send_data(&ns_data);

	/*
	 * Reset the floor whenever we report a new
	 * peak to track the trend (increase peek usage
	 * is not a leak if mbufs get released
	 * between reports and the floor stays low)
	 */
	total_sbmb_cnt_floor = total_sbmb_cnt_peak;
}
8654
8655 /*
8656 * Simple routine to avoid taking the lock when we can't run the
8657 * mbuf drain.
8658 */
8659 static int
8660 mbuf_drain_checks(boolean_t ignore_waiters)
8661 {
8662 if (mb_drain_maxint == 0) {
8663 return 0;
8664 }
8665 if (!ignore_waiters && mb_waiters != 0) {
8666 return 0;
8667 }
8668
8669 return 1;
8670 }
8671
8672 /*
8673 * Called by the VM when there's memory pressure or when we exhausted
8674 * the 4k/16k reserved space.
8675 */
8676 static void
8677 mbuf_drain_locked(boolean_t ignore_waiters)
8678 {
8679 mbuf_class_t mc;
8680 mcl_slab_t *sp, *sp_tmp, *nsp;
8681 unsigned int num, k, interval, released = 0;
8682 unsigned long total_mem = 0, use_mem = 0;
8683 boolean_t ret, purge_caches = FALSE;
8684 ppnum_t offset;
8685 mcache_obj_t *obj;
8686 unsigned long per;
8687 static unsigned char scratch[32];
8688 static ppnum_t scratch_pa = 0;
8689
8690 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8691 if (!mbuf_drain_checks(ignore_waiters)) {
8692 return;
8693 }
8694 if (scratch_pa == 0) {
8695 bzero(scratch, sizeof(scratch));
8696 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8697 VERIFY(scratch_pa);
8698 } else if (mclverify) {
8699 /*
8700 * Panic if a driver wrote to our scratch memory.
8701 */
8702 for (k = 0; k < sizeof(scratch); k++) {
8703 if (scratch[k]) {
8704 panic("suspect DMA to freed address");
8705 }
8706 }
8707 }
8708 /*
8709 * Don't free memory too often as that could cause excessive
8710 * waiting times for mbufs. Purge caches if we were asked to drain
8711 * in the last 5 minutes.
8712 */
8713 if (mbuf_drain_last_runtime != 0) {
8714 interval = net_uptime() - mbuf_drain_last_runtime;
8715 if (interval <= mb_drain_maxint) {
8716 return;
8717 }
8718 if (interval <= mb_drain_maxint * 5) {
8719 purge_caches = TRUE;
8720 }
8721 }
8722 mbuf_drain_last_runtime = net_uptime();
8723 /*
8724 * Don't free any memory if we're using 60% or more.
8725 */
8726 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8727 total_mem += m_total(mc) * m_maxsize(mc);
8728 use_mem += m_active(mc) * m_maxsize(mc);
8729 }
8730 per = (use_mem * 100) / total_mem;
8731 if (per >= 60) {
8732 return;
8733 }
8734 /*
8735 * Purge all the caches. This effectively disables
8736 * caching for a few seconds, but the mbuf worker thread will
8737 * re-enable them again.
8738 */
8739 if (purge_caches == TRUE) {
8740 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8741 if (m_total(mc) < m_avgtotal(mc)) {
8742 continue;
8743 }
8744 lck_mtx_unlock(mbuf_mlock);
8745 ret = mcache_purge_cache(m_cache(mc), FALSE);
8746 lck_mtx_lock(mbuf_mlock);
8747 if (ret == TRUE) {
8748 m_purge_cnt(mc)++;
8749 }
8750 }
8751 }
8752 /*
8753 * Move the objects from the composite class freelist to
8754 * the rudimentary slabs list, but keep at least 10% of the average
8755 * total in the freelist.
8756 */
8757 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8758 while (m_cobjlist(mc) &&
8759 m_total(mc) < m_avgtotal(mc) &&
8760 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8761 obj = m_cobjlist(mc);
8762 m_cobjlist(mc) = obj->obj_next;
8763 obj->obj_next = NULL;
8764 num = cslab_free(mc, obj, 1);
8765 VERIFY(num == 1);
8766 m_free_cnt(mc)++;
8767 m_infree(mc)--;
8768 /* cslab_free() handles m_total */
8769 }
8770 }
8771 /*
8772 * Free the buffers present in the slab list up to 10% of the total
8773 * average per class.
8774 *
8775 * We walk the list backwards in an attempt to reduce fragmentation.
8776 */
8777 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8778 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8779 /*
8780 * Process only unused slabs occupying memory.
8781 */
8782 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8783 sp->sl_base == NULL) {
8784 continue;
8785 }
8786 if (m_total(mc) < m_avgtotal(mc) ||
8787 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8788 break;
8789 }
8790 slab_remove(sp, mc);
8791 switch (mc) {
8792 case MC_MBUF:
8793 m_infree(mc) -= NMBPG;
8794 m_total(mc) -= NMBPG;
8795 if (mclaudit != NULL) {
8796 mcl_audit_free(sp->sl_base, NMBPG);
8797 }
8798 break;
8799 case MC_CL:
8800 m_infree(mc) -= NCLPG;
8801 m_total(mc) -= NCLPG;
8802 if (mclaudit != NULL) {
8803 mcl_audit_free(sp->sl_base, NMBPG);
8804 }
8805 break;
8806 case MC_BIGCL:
8807 {
8808 m_infree(mc) -= NBCLPG;
8809 m_total(mc) -= NBCLPG;
8810 if (mclaudit != NULL) {
8811 mcl_audit_free(sp->sl_base, NMBPG);
8812 }
8813 break;
8814 }
8815 case MC_16KCL:
8816 m_infree(mc)--;
8817 m_total(mc)--;
8818 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8819 nsp = nsp->sl_next;
8820 VERIFY(nsp->sl_refcnt == 0 &&
8821 nsp->sl_base != NULL &&
8822 nsp->sl_len == 0);
8823 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8824 0);
8825 nsp->sl_flags = 0;
8826 }
8827 if (mclaudit != NULL) {
8828 if (sp->sl_len == PAGE_SIZE) {
8829 mcl_audit_free(sp->sl_base,
8830 NMBPG);
8831 } else {
8832 mcl_audit_free(sp->sl_base, 1);
8833 }
8834 }
8835 break;
8836 default:
8837 /*
8838 * The composite classes have their own
8839 * freelist (m_cobjlist), so we only
8840 * process rudimentary classes here.
8841 */
8842 VERIFY(0);
8843 }
8844 m_release_cnt(mc) += m_size(mc);
8845 released += m_size(mc);
8846 VERIFY(sp->sl_base != NULL &&
8847 sp->sl_len >= PAGE_SIZE);
8848 offset = MTOPG(sp->sl_base);
8849 /*
8850 * Make sure the IOMapper points to a valid, but
8851 * bogus, address. This should prevent further DMA
8852 * accesses to freed memory.
8853 */
8854 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8855 mcl_paddr[offset] = 0;
8856 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8857 sp->sl_len);
8858 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8859 sp->sl_flags = 0;
8860 }
8861 }
8862 mbstat.m_drain++;
8863 mbstat.m_bigclusters = m_total(MC_BIGCL);
8864 mbstat.m_clusters = m_total(MC_CL);
8865 mbstat.m_mbufs = m_total(MC_MBUF);
8866 mbuf_stat_sync();
8867 mbuf_mtypes_sync(TRUE);
8868 }
8869
8870 __private_extern__ void
8871 mbuf_drain(boolean_t ignore_waiters)
8872 {
8873 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
8874 if (!mbuf_drain_checks(ignore_waiters)) {
8875 return;
8876 }
8877 lck_mtx_lock(mbuf_mlock);
8878 mbuf_drain_locked(ignore_waiters);
8879 lck_mtx_unlock(mbuf_mlock);
8880 }
8881
8882
8883 static int
8884 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8885 {
8886 #pragma unused(arg1, arg2)
8887 int val = 0, err;
8888
8889 err = sysctl_handle_int(oidp, &val, 0, req);
8890 if (err != 0 || req->newptr == USER_ADDR_NULL) {
8891 return err;
8892 }
8893 if (val) {
8894 mbuf_drain(TRUE);
8895 }
8896
8897 return err;
8898 }
8899
#if DEBUG || DEVELOPMENT
/*
 * Appends a timestamped, printf-formatted record to the in-memory
 * mbuf-watchdog log (mbwdog_logging), allocating the log buffer
 * lazily on first use.  When the buffer would overflow, the oldest
 * half of the log is discarded to make room.
 *
 * func/line identify the call site (normally supplied by a macro).
 * Caller must hold mbuf_mlock.
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Render the caller's message first, then the full record. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		/* Drop the oldest half of the log to make room. */
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	/*
	 * NOTE(review): if strlcat() truncates, fewer than len bytes are
	 * actually appended, so mbwdog_logging_used can drift past the
	 * true string length — confirm this is acceptable (it is only
	 * used as a heuristic fill level above).
	 */
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}

#endif // DEBUG || DEVELOPMENT
8943
8944 static void
8945 mtracelarge_register(size_t size)
8946 {
8947 int i;
8948 struct mtracelarge *trace;
8949 uintptr_t bt[MLEAK_STACK_DEPTH];
8950 unsigned int depth;
8951
8952 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
8953 /* Check if this entry is already on the list. */
8954 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8955 trace = &mtracelarge_table[i];
8956 if (trace->size == size && trace->depth == depth &&
8957 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8958 return;
8959 }
8960 }
8961 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8962 trace = &mtracelarge_table[i];
8963 if (size > trace->size) {
8964 trace->depth = depth;
8965 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8966 trace->size = size;
8967 break;
8968 }
8969 }
8970 }
8971
#if DEBUG || DEVELOPMENT

/*
 * Sysctl handler for kern.ipc.mb_wd_dump: returns the mbuf watchdog
 * dump string to userspace.  Takes the ifnet head lock and the mbuf
 * mutex (in that order) around mbuf_dump().
 */
static int
mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
{
	char *dump;

	ifnet_head_lock_shared();
	lck_mtx_lock(mbuf_mlock);
	dump = mbuf_dump();
	lck_mtx_unlock(mbuf_mlock);
	ifnet_head_done();

	return sysctl_io_string(req, dump, 0, 0, NULL);
}

#endif /* DEBUG || DEVELOPMENT */
8991
SYSCTL_DECL(_kern_ipc);
#if DEBUG || DEVELOPMENT
#if SKYWALK
/* Tunable scale-down factor applied to mbuf cache thresholds. */
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK */
/* Read-only watchdog state dump (DEBUG/DEVELOPMENT only). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif
/* Legacy and extended mbuf statistics structures. */
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
/* mbuf leak-detection reporting (top traces and full table). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
/* Garbage-collection controls (see mbuf_drain() above). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
    "Percentage of when we trigger memory-pressure for an mbuf-class");
9032