xref: /xnu-12377.1.9/bsd/kern/uipc_mbuf_mcache.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <ptrauth.h>
71 
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85 
86 #include <vm/vm_kern_xnu.h>
87 
88 #include <dev/random/randomdev.h>
89 
90 #include <kern/kern_types.h>
91 #include <kern/simple_lock.h>
92 #include <kern/queue.h>
93 #include <kern/sched_prim.h>
94 #include <kern/backtrace.h>
95 #include <kern/percpu.h>
96 #include <kern/zalloc.h>
97 
98 #include <libkern/OSDebug.h>
99 #include <libkern/libkern.h>
100 
101 #include <os/log.h>
102 #include <os/ptrtools.h>
103 
104 #include <IOKit/IOMapper.h>
105 
106 #include <machine/limits.h>
107 #include <machine/machine_routines.h>
108 
109 #include <sys/mcache.h>
110 
111 #include <net/droptap.h>
112 #include <netinet/mptcp_var.h>
113 #include <netinet/tcp_var.h>
114 
/*
 * Buffer-accounting helper for the dump_*_qlen() routines: after a
 * scnprintf() that wrote `k` bytes, consume `k` from the remaining
 * budget `clen` and advance the cursor `c`; jump to the caller's
 * `done:` label once the budget is exhausted.  The caller must declare
 * `c`, `k`, `clen` and a `done:` label.
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement and stays safe inside un-braced if/else bodies.
 */
#define DUMP_BUF_CHK() do {     \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
} while (0)
121 
#if INET
/*
 * Append the current TCP reassembly queue length to `str' (bounded by
 * `str_len' bytes).  Returns the number of bytes written into `str'.
 */
static int
dump_tcp_reass_qlen(char *str, int str_len)
{
	char *cursor = str;
	int remaining = str_len;

	if (tcp_reass_total_qlen != 0) {
		int written;

		written = scnprintf(cursor, remaining,
		    "\ntcp reass qlen %d\n", tcp_reass_total_qlen);
		remaining -= written;
		if (remaining < 1) {
			goto done;
		}
		cursor += written;
	}

done:
	return str_len - remaining;
}
#endif /* INET */
138 
#if MPTCP
/*
 * Append the current MPTCP reassembly queue length to `str' (bounded by
 * `str_len' bytes).  Returns the number of bytes written into `str'.
 */
static int
dump_mptcp_reass_qlen(char *str, int str_len)
{
	char *cursor = str;
	int remaining = str_len;

	if (mptcp_reass_total_qlen != 0) {
		int written;

		written = scnprintf(cursor, remaining,
		    "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen);
		remaining -= written;
		if (remaining < 1) {
			goto done;
		}
		cursor += written;
	}

done:
	return str_len - remaining;
}
#endif /* MPTCP */
155 
156 #if NETWORKING
157 extern int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len);
158 #endif /* NETWORKING */
159 
160 /*
161  * MBUF IMPLEMENTATION NOTES.
162  *
163  * There is a total of 5 per-CPU caches:
164  *
165  * MC_MBUF:
166  *	This is a cache of rudimentary objects of _MSIZE in size; each
167  *	object represents an mbuf structure.  This cache preserves only
168  *	the m_type field of the mbuf during its transactions.
169  *
170  * MC_CL:
171  *	This is a cache of rudimentary objects of MCLBYTES in size; each
172  *	object represents a mcluster structure.  This cache does not
173  *	preserve the contents of the objects during its transactions.
174  *
175  * MC_BIGCL:
176  *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
177  *	object represents a mbigcluster structure.  This cache does not
178  *	preserve the contents of the objects during its transaction.
179  *
180  * MC_MBUF_CL:
181  *	This is a cache of mbufs each having a cluster attached to it.
182  *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
183  *	fields of the mbuf related to the external cluster are preserved
184  *	during transactions.
185  *
186  * MC_MBUF_BIGCL:
187  *	This is a cache of mbufs each having a big cluster attached to it.
188  *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
189  *	fields of the mbuf related to the external cluster are preserved
190  *	during transactions.
191  *
192  * OBJECT ALLOCATION:
193  *
194  * Allocation requests are handled first at the per-CPU (mcache) layer
195  * before falling back to the slab layer.  Performance is optimal when
196  * the request is satisfied at the CPU layer because global data/lock
197  * never gets accessed.  When the slab layer is entered for allocation,
198  * the slab freelist will be checked first for available objects before
199  * the VM backing store is invoked.  Slab layer operations are serialized
200  * for all of the caches as the mbuf global lock is held most of the time.
201  * Allocation paths are different depending on the class of objects:
202  *
203  * a. Rudimentary object:
204  *
205  *	{ m_get_common(), m_clattach(), m_mclget(),
206  *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
207  *	  composite object allocation }
208  *			|	^
209  *			|	|
210  *			|	+-----------------------+
211  *			v				|
212  *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
213  *			|				^
214  *			v				|
215  *		   [CPU cache] ------->	(found?) -------+
216  *			|				|
217  *			v				|
218  *		 mbuf_slab_alloc()			|
219  *			|				|
220  *			v				|
221  *	+---------> [freelist] ------->	(found?) -------+
222  *	|		|
223  *	|		v
224  *	|	    m_clalloc()
225  *	|		|
226  *	|		v
227  *	+---<<---- kmem_mb_alloc()
228  *
229  * b. Composite object:
230  *
231  *	{ m_getpackets_internal(), m_allocpacket_internal() }
232  *			|	^
233  *			|	|
234  *			|	+------	(done) ---------+
235  *			v				|
236  *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
237  *			|				^
238  *			v				|
239  *		   [CPU cache] ------->	(found?) -------+
240  *			|				|
241  *			v				|
242  *		 mbuf_cslab_alloc()			|
243  *			|				|
244  *			v				|
245  *		    [freelist] ------->	(found?) -------+
246  *			|				|
247  *			v				|
248  *		(rudimentary object)			|
249  *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
250  *
251  * Auditing notes: If auditing is enabled, buffers will be subjected to
252  * integrity checks by the audit routine.  This is done by verifying their
253  * contents against DEADBEEF (free) pattern before returning them to caller.
254  * As part of this step, the routine will also record the transaction and
255  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
256  * also restore any constructed data structure fields if necessary.
257  *
258  * OBJECT DEALLOCATION:
259  *
260  * Freeing an object simply involves placing it into the CPU cache; this
261  * pollutes the cache to benefit subsequent allocations.  The slab layer
262  * will only be entered if the object is to be purged out of the cache.
263  * During normal operations, this happens only when the CPU layer resizes
264  * its bucket while it's adjusting to the allocation load.  Deallocation
265  * paths are different depending on the class of objects:
266  *
267  * a. Rudimentary object:
268  *
269  *	{ m_free(), m_freem_list(), composite object deallocation }
270  *			|	^
271  *			|	|
272  *			|	+------	(done) ---------+
273  *			v				|
274  *	   mcache_free/mcache_free_ext()		|
275  *			|				|
276  *			v				|
277  *		mbuf_slab_audit()			|
278  *			|				|
279  *			v				|
280  *		   [CPU cache] ---> (not purging?) -----+
281  *			|				|
282  *			v				|
283  *		 mbuf_slab_free()			|
284  *			|				|
285  *			v				|
286  *		    [freelist] ----------->>------------+
287  *	 (objects get purged to VM only on demand)
288  *
289  * b. Composite object:
290  *
291  *	{ m_free(), m_freem_list() }
292  *			|	^
293  *			|	|
294  *			|	+------	(done) ---------+
295  *			v				|
296  *	   mcache_free/mcache_free_ext()		|
297  *			|				|
298  *			v				|
299  *		mbuf_cslab_audit()			|
300  *			|				|
301  *			v				|
302  *		   [CPU cache] ---> (not purging?) -----+
303  *			|				|
304  *			v				|
305  *		 mbuf_cslab_free()			|
306  *			|				|
307  *			v				|
308  *		    [freelist] ---> (not purging?) -----+
309  *			|				|
310  *			v				|
311  *		(rudimentary object)			|
312  *	   mcache_free/mcache_free_ext() ------->>------+
313  *
314  * Auditing notes: If auditing is enabled, the audit routine will save
315  * any constructed data structure fields (if necessary) before filling the
316  * contents of the buffers with DEADBEEF (free) pattern and recording the
317  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
318  * expected to contain the free pattern.
319  *
320  * DEBUGGING:
321  *
322  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
323  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
324  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
325  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
326  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
327  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
328  *
329  * Each object is associated with exactly one mcache_audit_t structure that
330  * contains the information related to its last buffer transaction.  Given
331  * an address of an object, the audit structure can be retrieved by finding
332  * the position of the object relevant to the base address of the cluster:
333  *
334  *	+------------+			+=============+
335  *	| mbuf addr  |			| mclaudit[i] |
336  *	+------------+			+=============+
337  *	      |				| cl_audit[0] |
338  *	i = MTOBG(addr)			+-------------+
339  *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
340  *	b = BGTOM(i)		|	+-------------+
341  *	      |			|	|     ...     |
342  *	x = MCLIDX(b, addr)	|	+-------------+
343  *	      |			|	| cl_audit[7] |
344  *	      +-----------------+	+-------------+
345  *		 (e.g. x == 1)
346  *
347  * The mclaudit[] array is allocated at initialization time, but its contents
348  * get populated when the corresponding cluster is created.  Because a page
349  * can be turned into NMBPG number of mbufs, we preserve enough space for the
350  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
351  * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
352  * remaining entries unused.  For 16KB cluster, only one entry from the first
353  * page is allocated and used for the entire object.
354  */
355 
356 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
357 extern vm_map_t mb_map;         /* special map */
358 
359 static uint32_t mb_kmem_contig_failed;
360 static uint32_t mb_kmem_failed;
361 static uint32_t mb_kmem_one_failed;
362 /* Timestamp of allocation failures. */
363 static uint64_t mb_kmem_contig_failed_ts;
364 static uint64_t mb_kmem_failed_ts;
365 static uint64_t mb_kmem_one_failed_ts;
366 static uint64_t mb_kmem_contig_failed_size;
367 static uint64_t mb_kmem_failed_size;
368 static uint32_t mb_kmem_stats[6];
369 
370 /* Back-end (common) layer */
371 static uint64_t mb_expand_cnt;
372 static uint64_t mb_expand_cl_cnt;
373 static uint64_t mb_expand_cl_total;
374 static uint64_t mb_expand_bigcl_cnt;
375 static uint64_t mb_expand_bigcl_total;
376 static uint64_t mb_expand_16kcl_cnt;
377 static uint64_t mb_expand_16kcl_total;
378 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
379 static uint32_t mbuf_worker_run_cnt;
380 static uint64_t mbuf_worker_last_runtime;
381 static uint64_t mbuf_drain_last_runtime;
382 static int mbuf_worker_ready;   /* worker thread is runnable */
383 static unsigned int ncpu;                /* number of CPUs */
384 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
385 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
386 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
387 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
388 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
389 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
390 static unsigned int mb_normalized; /* number of packets "normalized" */
391 
392 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
393 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
394 
395 #define MBUF_CLASS_VALID(c) \
396 	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
397 
398 /*
399  * mbuf specific mcache allocation request flags.
400  */
401 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
402 
403 /*
404  * Per-cluster slab structure.
405  *
406  * A slab is a cluster control structure that contains one or more object
407  * chunks; the available chunks are chained in the slab's freelist (sl_head).
408  * Each time a chunk is taken out of the slab, the slab's reference count
409  * gets incremented.  When all chunks have been taken out, the empty slab
410  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
411  * returned to a slab causes the slab's reference count to be decremented;
412  * it also causes the slab to be reinserted back to class's slab list, if
413  * it's not already done.
414  *
415  * Compartmentalizing of the object chunks into slabs allows us to easily
416  * merge one or more slabs together when the adjacent slabs are idle, as
417  * well as to convert or move a slab from one class to another; e.g. the
418  * mbuf cluster slab can be converted to a regular cluster slab when all
419  * mbufs in the slab have been freed.
420  *
421  * A slab may also span across multiple clusters for chunks larger than
422  * a cluster's size.  In this case, only the slab of the first cluster is
423  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
424  * that they are part of the larger slab.
425  *
426  * Each slab controls a page of memory.
427  */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t        sl_class;       /* controlling mbuf class */
	int8_t          sl_refcnt;      /* outstanding allocations */
	int8_t          sl_chunks;      /* chunks (bufs) in this slab */
	u_int16_t       sl_flags;       /* slab flags (SLF_*, see below) */
	u_int16_t       sl_len;         /* slab length */
	void            *sl_base;       /* base of allocated memory */
	void            *sl_head;       /* first free buffer in freelist chain */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on class freelist */
} mcl_slab_t;
439 
440 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
441 #define SLF_PARTIAL     0x0002          /* part of another slab */
442 #define SLF_DETACHED    0x0004          /* not in slab freelist */
443 
444 /*
445  * The array of slabs are broken into groups of arrays per 1MB of kernel
446  * memory to reduce the footprint.  Each group is allocated on demand
447  * whenever a new piece of memory mapped in from the VM crosses the 1MB
448  * boundary.
449  */
450 #define NSLABSPMB       ((1 << MBSHIFT) >> PAGE_SHIFT)
451 
452 typedef struct mcl_slabg {
453 	mcl_slab_t      *slg_slab;      /* group of slabs */
454 } mcl_slabg_t;
455 
456 /*
457  * Number of slabs needed to control a 16KB cluster object.
458  */
459 #define NSLABSP16KB     (M16KCLBYTES >> PAGE_SHIFT)
460 
461 /*
462  * Per-cluster audit structure.
463  */
464 typedef struct {
465 	mcache_audit_t  **cl_audit;     /* array of audits */
466 } mcl_audit_t;
467 
468 typedef struct {
469 	struct thread   *msa_thread;    /* thread doing transaction */
470 	struct thread   *msa_pthread;   /* previous transaction thread */
471 	uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
472 	uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
473 	uint16_t        msa_depth;      /* pc stack depth */
474 	uint16_t        msa_pdepth;     /* previous transaction pc stack */
475 	void            *msa_stack[MCACHE_STACK_DEPTH];
476 	void            *msa_pstack[MCACHE_STACK_DEPTH];
477 } mcl_scratch_audit_t;
478 
479 typedef struct {
480 	/*
481 	 * Size of data from the beginning of an mbuf that covers m_hdr,
482 	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
483 	 * a shadow mbuf structure of this size inside each audit structure,
484 	 * and the contents of the real mbuf gets copied into it when the mbuf
485 	 * is freed.  This allows us to pattern-fill the mbuf for integrity
486 	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
487 	 * cluster cache case).  Note that we don't save the contents of
488 	 * clusters when they are freed; we simply pattern-fill them.
489 	 */
490 	u_int8_t                sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
491 	mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8)));
492 } mcl_saved_contents_t;
493 
494 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
495 
496 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
497 	((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
498 	(_mca)->mca_contents)->sc_mbuf)
499 #define MCA_SAVED_MBUF_SIZE                                             \
500 	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
501 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
502 	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
503 
504 /*
505  * mbuf specific mcache audit flags
506  */
507 #define MB_INUSE        0x01    /* object has not been returned to slab */
508 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
509 #define MB_SCVALID      0x04    /* object has valid saved contents */
510 
511 /*
512  * Each of the following two arrays hold up to nmbclusters elements.
513  */
514 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
515 static unsigned int maxclaudit; /* max # of entries in audit table */
516 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
517 static unsigned int maxslabgrp; /* max # of entries in slabs table */
518 static unsigned int slabgrp;    /* # of entries in slabs table */
519 
520 /* Globals */
521 unsigned char *mbutl;           /* first mapped cluster address */
522 static unsigned char *embutl;          /* ending virtual address of mclusters */
523 
524 static boolean_t mclverify;     /* debug: pattern-checking */
525 static boolean_t mcltrace;      /* debug: stack tracing */
526 static boolean_t mclfindleak;   /* debug: leak detection */
527 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
528 
529 static struct timeval mb_start; /* beginning of time */
530 
531 /* mbuf leak detection variables */
532 static struct mleak_table mleak_table;
533 static mleak_stat_t *mleak_stat;
534 
535 #define MLEAK_STAT_SIZE(n) \
536 	__builtin_offsetof(mleak_stat_t, ml_trace[n])
537 
/*
 * Leak detection: one entry in the mleak_allocations hashmap.  A sampled
 * allocation records which backtrace (index into the mtrace table)
 * produced it, so outstanding elements can be attributed to their
 * allocation sites.
 */
struct mallocation {
	mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
	u_int32_t count;        /* How many objects were requested */
	u_int64_t hitcount;     /* for determining hash effectiveness */
};
544 
/*
 * Leak detection: one entry in the mleak_traces hashmap -- an allocation
 * backtrace of up to MLEAK_STACK_DEPTH frames, with counters used to
 * rank traces (see mleak_sort_traces()/mleak_top_trace).
 */
struct mtrace {
	u_int64_t       collisions;     /* hash-bucket collisions on this slot */
	u_int64_t       hitcount;       /* for determining hash effectiveness */
	u_int64_t       allocs;         /* allocations attributed to this trace */
	u_int64_t       depth;          /* number of valid frames in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH];
};
552 
553 /* Size must be a power of two for the zhash to be able to just mask off bits */
554 #define MLEAK_ALLOCATION_MAP_NUM        512
555 #define MLEAK_TRACE_MAP_NUM             256
556 
557 /*
558  * Sample factor for how often to record a trace.  This is overwritable
559  * by the boot-arg mleak_sample_factor.
560  */
561 #define MLEAK_SAMPLE_FACTOR             500
562 
563 /*
564  * Number of top leakers recorded.
565  */
566 #define MLEAK_NUM_TRACES                5
567 
568 #define MB_LEAK_SPACING_64 "                    "
569 #define MB_LEAK_SPACING_32 "            "
570 
571 
572 #define MB_LEAK_HDR_32  "\n\
573     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
574     ----------  ----------  ----------  ----------  ---------- \n\
575 "
576 
577 #define MB_LEAK_HDR_64  "\n\
578     trace [1]           trace [2]           trace [3]       \
579 	trace [4]           trace [5]      \n\
580     ------------------  ------------------  ------------------  \
581     ------------------  ------------------ \n\
582 "
583 
584 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
585 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
586 
587 /* Hashmaps of allocations and their corresponding traces */
588 static struct mallocation *mleak_allocations;
589 static struct mtrace *mleak_traces;
590 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
591 
592 /* Lock to protect mleak tables from concurrent modification */
593 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
594 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
595 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
596 
597 /* *Failed* large allocations. */
struct mtracelarge {
	uint64_t        size;           /* size of the failed large allocation */
	uint64_t        depth;          /* number of valid frames in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace of the failure site */
};
603 
604 #define MTRACELARGE_NUM_TRACES          5
605 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
606 
607 static void mtracelarge_register(size_t size);
608 
609 /* The minimum number of objects that are allocated, to start. */
610 #define MINCL           32
611 #define MINBIGCL        (MINCL >> 1)
612 
613 /* Low watermarks (only map in pages once free counts go below) */
614 #define MBIGCL_LOWAT    MINBIGCL
615 
616 #define m_cache(c)      mbuf_table[c].mtbl_cache
617 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
618 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
619 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
620 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
621 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
622 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
623 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
624 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
625 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
626 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
627 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
628 #define m_region_expand(c)      mbuf_table[c].mtbl_expand
629 
mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 *
	 * NOTE(review): positional initializers; the second-to-last value
	 * (3000/2000/1000/200 below) appears to be mtbl_avgtotal, which is
	 * read by m_avgtotal() -- confirm against the mbuf_table_t
	 * definition before reordering.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	  NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	  NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	  NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	  NULL, NULL, 0, 0, 0, 0, 200, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
};
656 
657 #if SKYWALK
658 #define MC_THRESHOLD_SCALE_DOWN_FACTOR  2
659 static unsigned int mc_threshold_scale_down_factor =
660     MC_THRESHOLD_SCALE_DOWN_FACTOR;
661 #endif /* SKYWALK */
662 
663 static uint32_t
m_avgtotal(mbuf_class_t c)664 m_avgtotal(mbuf_class_t c)
665 {
666 #if SKYWALK
667 	return if_is_fsw_transport_netagent_enabled() ?
668 	       (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
669 	       mbuf_table[c].mtbl_avgtotal;
670 #else /* !SKYWALK */
671 	return mbuf_table[c].mtbl_avgtotal;
672 #endif /* SKYWALK */
673 }
674 
675 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
676 static int mb_waiters;                  /* number of waiters */
677 
678 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
679 static char *mbuf_dump_buf;
680 
681 #define MBUF_DUMP_BUF_SIZE      4096
682 
683 /*
684  * mbuf watchdog is enabled by default.  It is also toggeable via the
685  * kern.ipc.mb_watchdog sysctl.
686  * Garbage collection is enabled by default on embedded platforms.
687  * mb_drain_maxint controls the amount of time to wait (in seconds) before
688  * consecutive calls to mbuf_drain().
689  */
690 static unsigned int mb_watchdog = 1;
691 #if !XNU_TARGET_OS_OSX
692 static unsigned int mb_drain_maxint = 60;
693 #else /* XNU_TARGET_OS_OSX */
694 static unsigned int mb_drain_maxint = 0;
695 #endif /* XNU_TARGET_OS_OSX */
696 
697 /* The following are used to serialize m_clalloc() */
698 static boolean_t mb_clalloc_busy;
699 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
700 static int mb_clalloc_waiters;
701 
702 static char *mbuf_dump(void);
703 static void mbuf_worker_thread_init(void);
704 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
705 static void slab_free(mbuf_class_t, mcache_obj_t *);
706 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
707     unsigned int, int);
708 static void mbuf_slab_free(void *, mcache_obj_t *, int);
709 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
710 static void mbuf_slab_notify(void *, u_int32_t);
711 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
712     unsigned int);
713 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
714 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
715     unsigned int, int);
716 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
717 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
718 static int freelist_populate(mbuf_class_t, unsigned int, int);
719 static void freelist_init(mbuf_class_t);
720 static boolean_t mbuf_cached_above(mbuf_class_t, int);
721 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
722 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
723 static int m_howmany(int, size_t);
724 static void mbuf_worker_thread(void);
725 static void mbuf_watchdog(void);
726 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
727 
728 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
729     size_t, unsigned int);
730 static void mcl_audit_free(void *, unsigned int);
731 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
732 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
733 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
734     boolean_t);
735 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
736 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
737 static void mcl_audit_scratch(mcache_audit_t *);
738 static void mcl_audit_mcheck_panic(struct mbuf *);
739 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
740 
741 static void mleak_activate(void);
742 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
743 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
744 static void mleak_free(mcache_obj_t *);
745 static void mleak_sort_traces(void);
746 static void mleak_update_stats(void);
747 
748 static mcl_slab_t *slab_get(void *);
749 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
750     void *, void *, unsigned int, int, int);
751 static void slab_insert(mcl_slab_t *, mbuf_class_t);
752 static void slab_remove(mcl_slab_t *, mbuf_class_t);
753 static boolean_t slab_inrange(mcl_slab_t *, void *);
754 static void slab_nextptr_panic(mcl_slab_t *, void *);
755 static void slab_detach(mcl_slab_t *);
756 static boolean_t slab_is_detached(mcl_slab_t *);
757 
758 #if (DEBUG || DEVELOPMENT)
759 #define mbwdog_logger(fmt, ...)  _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
760 static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
761 static char *mbwdog_logging;
762 const unsigned mbwdog_logging_size = 4096;
763 static size_t mbwdog_logging_used;
764 #else
765 #define mbwdog_logger(fmt, ...)  do { } while (0)
766 #endif /* DEBUG || DEVELOPMENT */
767 static void mbuf_drain_locked(boolean_t);
768 
769 void
mbuf_mcheck(struct mbuf * m)770 mbuf_mcheck(struct mbuf *m)
771 {
772 	if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) {
773 		if (mclaudit == NULL) {
774 			panic("MCHECK: m_type=%d m=%p",
775 			    (u_int16_t)(m)->m_type, m);
776 		} else {
777 			mcl_audit_mcheck_panic(m);
778 		}
779 	}
780 }
781 
/* TRUE if addr lies within the mbuf cluster map [mbutl, embutl) */
#define MBUF_IN_MAP(addr)                                               \
	((unsigned char *)(addr) >= mbutl &&                            \
	(unsigned char *)(addr) < embutl)

/* Panic if addr is outside the mbuf cluster map */
#define MRANGE(addr) {                                                  \
	if (!MBUF_IN_MAP(addr))                                         \
	        panic("MRANGE: address out of range 0x%p", addr);       \
}

/*
 * Macros to obtain page index given a base cluster address
 */
#define MTOPG(x)        (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
#define PGTOM(x)        (mbutl + (x << PAGE_SHIFT))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define MBPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define CLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base
 */
#define BCLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
819 
820 static int
821 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
822 {
823 #pragma unused(oidp, arg1, arg2)
824 	int i;
825 
826 	/* Ensure leak tracing turned on */
827 	if (!mclfindleak || !mclexpleak) {
828 		return ENXIO;
829 	}
830 
831 	lck_mtx_lock(mleak_lock);
832 	mleak_update_stats();
833 	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
834 	lck_mtx_unlock(mleak_lock);
835 
836 	return i;
837 }
838 
839 static int
840 mleak_table_sysctl SYSCTL_HANDLER_ARGS
841 {
842 #pragma unused(oidp, arg1, arg2)
843 	int i = 0;
844 
845 	/* Ensure leak tracing turned on */
846 	if (!mclfindleak || !mclexpleak) {
847 		return ENXIO;
848 	}
849 
850 	lck_mtx_lock(mleak_lock);
851 	i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
852 	lck_mtx_unlock(mleak_lock);
853 
854 	return i;
855 }
856 
857 void
mbuf_stat_sync(void)858 mbuf_stat_sync(void)
859 {
860 	mb_class_stat_t *sp;
861 	mcache_cpu_t *ccp;
862 	mcache_t *cp;
863 	int k, m, bktsize;
864 
865 
866 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
867 
868 	for (k = 0; k < MC_MAX; k++) {
869 		cp = m_cache(k);
870 		ccp = &cp->mc_cpu[0];
871 		bktsize = ccp->cc_bktsize;
872 		sp = mbuf_table[k].mtbl_stats;
873 
874 		if (cp->mc_flags & MCF_NOCPUCACHE) {
875 			sp->mbcl_mc_state = MCS_DISABLED;
876 		} else if (cp->mc_purge_cnt > 0) {
877 			sp->mbcl_mc_state = MCS_PURGING;
878 		} else if (bktsize == 0) {
879 			sp->mbcl_mc_state = MCS_OFFLINE;
880 		} else {
881 			sp->mbcl_mc_state = MCS_ONLINE;
882 		}
883 
884 		sp->mbcl_mc_cached = 0;
885 		for (m = 0; m < ncpu; m++) {
886 			ccp = &cp->mc_cpu[m];
887 			if (ccp->cc_objs > 0) {
888 				sp->mbcl_mc_cached += ccp->cc_objs;
889 			}
890 			if (ccp->cc_pobjs > 0) {
891 				sp->mbcl_mc_cached += ccp->cc_pobjs;
892 			}
893 		}
894 		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
895 		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
896 		    sp->mbcl_infree;
897 
898 		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
899 		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
900 		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
901 
902 		/* Calculate total count specific to each class */
903 		sp->mbcl_ctotal = sp->mbcl_total;
904 		switch (m_class(k)) {
905 		case MC_MBUF:
906 			/* Deduct mbufs used in composite caches */
907 			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
908 			    m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL));
909 			break;
910 
911 		case MC_CL:
912 			/* Deduct clusters used in composite cache */
913 			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
914 			break;
915 
916 		case MC_BIGCL:
917 			/* Deduct clusters used in composite cache */
918 			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
919 			break;
920 
921 		case MC_16KCL:
922 			/* Deduct clusters used in composite cache */
923 			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
924 			break;
925 
926 		default:
927 			break;
928 		}
929 	}
930 }
931 
/*
 * Decide whether the class that backs mbuf `m` is under memory pressure,
 * i.e. its in-use count has reached mb_memory_pressure_percentage of the
 * class's maximum limit.  A cheap check (ignoring per-CPU caches) gates a
 * more precise recount that includes cached objects.  All reads are done
 * without mbuf_mlock on purpose; see the comment below about accuracy.
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	int mclass = mbuf_get_class(m);

	/* Fast path: total minus global freelist vs. percentage threshold */
	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		/*
		 * The above computation does not include the per-CPU cached objects.
		 * As a fast-path check this is good-enough. But now we do
		 * the "slower" count of the cached objects to know exactly the
		 * number of active mbufs in use.
		 *
		 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
		 * might be slightly off but we don't try to be 100% accurate.
		 * At worst, we drop a packet that we shouldn't have dropped or
		 * we might go slightly above our memory-pressure threshold.
		 */
		mcache_t *cp = m_cache(mclass);
		mcache_cpu_t *ccp = &cp->mc_cpu[0];

		/* Unlocked snapshot reads; values may be stale but consistent enough */
		int bktsize = os_access_once(ccp->cc_bktsize);
		uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
		uint32_t cached = 0;
		int i;

		for (i = 0; i < ncpu; i++) {
			ccp = &cp->mc_cpu[i];

			int cc_objs = os_access_once(ccp->cc_objs);
			if (cc_objs > 0) {
				cached += cc_objs;
			}

			int cc_pobjs = os_access_once(ccp->cc_pobjs);
			if (cc_pobjs > 0) {
				cached += cc_pobjs;
			}
		}
		/* Full buckets each hold bktsize objects */
		cached += (bl_total * bktsize);
		/* Recheck threshold with cached objects excluded from "active" */
		if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
			os_log(OS_LOG_DEFAULT,
			    "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
			    __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
			return true;
		}
	}

	return false;
}
981 
/*
 * One-time initialization of the mbuf allocator: validates the public/private
 * constant mappings, sizes and allocates the slab/audit/leak tables, primes
 * the freelists, starts the worker thread, and creates the per-class mcaches.
 * Called once during BSD subsystem bring-up.
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	static_assert(MBUF_EXT == M_EXT);
	static_assert(MBUF_PKTHDR == M_PKTHDR);
	static_assert(MBUF_EOR == M_EOR);
	static_assert(MBUF_LOOP == M_LOOP);
	static_assert(MBUF_BCAST == M_BCAST);
	static_assert(MBUF_MCAST == M_MCAST);
	static_assert(MBUF_FRAG == M_FRAG);
	static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	static_assert(MBUF_LASTFRAG == M_LASTFRAG);
	static_assert(MBUF_PROMISC == M_PROMISC);
	static_assert(MBUF_HASFCS == M_HASFCS);

	static_assert(MBUF_TYPE_FREE == MT_FREE);
	static_assert(MBUF_TYPE_DATA == MT_DATA);
	static_assert(MBUF_TYPE_HEADER == MT_HEADER);
	static_assert(MBUF_TYPE_SOCKET == MT_SOCKET);
	static_assert(MBUF_TYPE_PCB == MT_PCB);
	static_assert(MBUF_TYPE_RTABLE == MT_RTABLE);
	static_assert(MBUF_TYPE_HTABLE == MT_HTABLE);
	static_assert(MBUF_TYPE_ATABLE == MT_ATABLE);
	static_assert(MBUF_TYPE_SONAME == MT_SONAME);
	static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	static_assert(MBUF_TYPE_FTABLE == MT_FTABLE);
	static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	static_assert(MBUF_TYPE_IFADDR == MT_IFADDR);
	static_assert(MBUF_TYPE_CONTROL == MT_CONTROL);
	static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	static_assert(MBUF_CSUM_REQ_IP == CSUM_IP);
	static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	static_assert(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	static_assert(MBUF_WAITOK == M_WAIT);
	static_assert(MBUF_DONTWAIT == M_DONTWAIT);
	static_assert(MBUF_COPYALL == M_COPYALL);

	static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t)));

	/* Make sure we don't save more than we should */
	static_assert(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	static_assert(sizeof(struct mbuf) == _MSIZE);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
	slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
	    ZALIGN(mcl_slabg_t));

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		int l;
		mcl_audit_t *mclad;
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
		mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
		    ZALIGN(mcl_audit_t));
		for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
			mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
			    ZALIGN_PTR);
		}

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	/* Cache the individual debug-feature flags for cheap runtime checks */
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_activate();

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_wait_max_cpus();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
	mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
	    ZALIGN(ppnum_t));

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);

	/* embutl marks the first byte past the mbuf cluster map */
	embutl = (mbutl + (nmbclusters * MCLBYTES));
	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL)) {
			initmcl = m_maxlimit(MC_BIGCL);
		}
	}
	if (initmcl < m_minlimit(MC_BIGCL)) {
		initmcl = m_minlimit(MC_BIGCL);
	}

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));

	for (m = 0; m < MC_MAX; m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	lck_mtx_unlock(mbuf_mlock);

	/* Kick off the background worker that grows/drains the pools */
	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < MC_MAX; m++) {
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		/* Composite classes get the cslab callbacks; others the slab ones */
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		if (!mclfindleak) {
			flags |= MCF_NOLEAKLOG;
		}

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have atleast 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* allocate space for mbuf_dump_buf */
	mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);

	if (mbuf_debug & MCF_DEBUG) {
		printf("%s: MLEN %d, MHLEN %d\n", __func__,
		    (int)_MLEN, (int)_MHLEN);
	}
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
1240 
/*
 * Obtain a slab of object(s) from the class's freelist.
 *
 * Pops one object off the head of a slab on the class's slab list,
 * bumps the slab's reference count, updates the per-class free counts,
 * and detaches the slab from the freelist once it becomes empty.
 * Returns NULL if the class has no slabs available.
 * Caller must hold mbuf_mlock.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
	    && (wait & MCR_COMP)) {
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	} else {
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	}

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return NULL;
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	/* Pop the object off the head of the slab's free chain */
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	sp->sl_head = buf->obj_next;
	/* Increment slab reference */
	sp->sl_refcnt++;

	VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF) {
			mca->mca_uflags |= MB_SCVALID;
		}
	}

	/* Per-class accounting and refcount invariants */
	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have NBCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
		    sp->sl_len == PAGE_SIZE &&
		    (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL) {
			mbuf_mcheck((struct mbuf *)buf);
		}
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
		VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
		slab_remove(sp, class);
	}

	return buf;
}
1376 
/*
 * Place a slab of object(s) back into a class's slab list.
 *
 * Pushes the object back onto its slab's free chain, drops the slab's
 * reference count, and — when a page-worth of mbufs/2K/4K objects is
 * entirely free — coalesces the slab back into a "super class" page
 * (4K or 16K depending on PAGE_SIZE).  Caller must hold mbuf_mlock.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;
	boolean_t reinit_supercl = false;
	mbuf_class_t super_class;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(buf->obj_next == NULL);

	/*
	 * Synchronizing with m_clalloc, as it reads m_total, while we here
	 * are modifying m_total.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO - 1), "m_clalloc", NULL);
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;

	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	/* Per-class invariants now that one reference has been dropped */
	if (class == MC_CL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A slab that has been splitted for 2KB clusters can have
		 * at most 1 outstanding reference at this point.
		 */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));

		/* A 4KB cluster slab can have NBCLPG references at most */
		VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
		VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16KB cluster takes NSLABSP16KB slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * A slab that has been splitted for mbufs has at most
		 * NMBPG reference counts.  Since we have decremented
		 * one reference above, it must now be between 0 and
		 * NMBPG-1.
		 */
		VERIFY(class == MC_MBUF);
		VERIFY(sp->sl_refcnt >= 0 &&
		    sp->sl_refcnt <= (NMBPG - 1) &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		if (mclverify) {
			mcache_audit_free_verify(mca, buf, 0,
			    m_maxsize(class));
		}
		mca->mca_uflags &= ~MB_SCVALID;
	}

	/* Push the object back on the slab's free chain; update counters */
	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		buf->obj_next = sp->sl_head;
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/*
	 * If a slab has been split to either one which holds 2KB clusters,
	 * or one which holds mbufs, turn it back to one which holds a
	 * 4 or 16 KB cluster depending on the page size.
	 */
	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
		super_class = MC_BIGCL;
	} else {
		VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
		super_class = MC_16KCL;
	}
	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NMBPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NMBPG;

		m_total(MC_MBUF) -= NMBPG;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPG;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPG));

		/* Unlink every mbuf from the slab's free chain */
		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NCLPG;

		m_total(MC_CL) -= NCLPG;
		mbstat.m_clusters = m_total(MC_CL);
		m_infree(MC_CL) -= NCLPG;

		/* Unlink every 2K cluster from the slab's free chain */
		while (i--) {
			union mcluster *c = sp->sl_head;
			VERIFY(c != NULL);
			sp->sl_head = c->mcl_next;
			c->mcl_next = NULL;
		}
		reinit_supercl = true;
	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
	    sp->sl_refcnt == 0 &&
	    m_total(class) >= (m_minlimit(class) + NBCLPG) &&
	    m_total(super_class) < m_maxlimit(super_class)) {
		int i = NBCLPG;

		VERIFY(super_class == MC_16KCL);
		m_total(MC_BIGCL) -= NBCLPG;
		mbstat.m_bigclusters = m_total(MC_BIGCL);
		m_infree(MC_BIGCL) -= NBCLPG;

		/* Unlink every 4K cluster from the slab's free chain */
		while (i--) {
			union mbigcluster *bc = sp->sl_head;
			VERIFY(bc != NULL);
			sp->sl_head = bc->mbc_next;
			bc->mbc_next = NULL;
		}
		reinit_supercl = true;
	}

	if (reinit_supercl) {
		VERIFY(sp->sl_head == NULL);
		VERIFY(m_total(class) >= m_minlimit(class));
		slab_remove(sp, class);

		/* Reinitialize it as a cluster for the super class */
		m_total(super_class)++;
		m_infree(super_class)++;
		VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
		    sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);

		slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
		    sp->sl_base, PAGE_SIZE, 0, 1);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_base, sp->sl_len);
		}
		((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;

		if (super_class == MC_BIGCL) {
			mbstat.m_bigclusters = m_total(MC_BIGCL);
			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
			    m_infree(MC_MBUF_BIGCL);
		}

		VERIFY(slab_is_detached(sp));
		VERIFY(m_total(super_class) <= m_maxlimit(super_class));

		/* And finally switch class */
		class = super_class;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp)) {
		slab_insert(sp, class);
	}

	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
}
1605 
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 *
 * Returns the number of elements actually allocated (<= num); the chain is
 * linked through obj_next starting at *plist.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
	ASSERT(need > 0);

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			/* Got one; append it to the caller's chain */
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < (m_total(class) >> 5)) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			/* Freelist exhausted; try to grow it and retry */
			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0) {
				continue;
			}

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait)) {
				break;
			}

			/* watchdog checkpoint */
			mbuf_watchdog();

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					mbstat.m_drops++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait)) {
				break;
			}

			LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	/* Account for what was actually handed out */
	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return num - need;
}
1692 
1693 /*
1694  * Common de-allocator for rudimentary objects called by the CPU cache
1695  * layer when one or more elements need to be returned to the appropriate
1696  * global freelist.
1697  */
1698 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)1699 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1700 {
1701 	mbuf_class_t class = (mbuf_class_t)arg;
1702 	mcache_obj_t *nlist;
1703 	unsigned int num = 0;
1704 	int w;
1705 
1706 	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1707 
1708 	lck_mtx_lock(mbuf_mlock);
1709 
1710 	for (;;) {
1711 		nlist = list->obj_next;
1712 		list->obj_next = NULL;
1713 		slab_free(class, list);
1714 		++num;
1715 		if ((list = nlist) == NULL) {
1716 			break;
1717 		}
1718 	}
1719 	m_free_cnt(class) += num;
1720 
1721 	if ((w = mb_waiters) > 0) {
1722 		mb_waiters = 0;
1723 	}
1724 	if (w) {
1725 		mbwdog_logger("waking up all threads");
1726 	}
1727 	lck_mtx_unlock(mbuf_mlock);
1728 
1729 	if (w != 0) {
1730 		wakeup(mb_waitchan);
1731 	}
1732 }
1733 
1734 /*
1735  * Common auditor for rudimentary objects called by the CPU cache layer
1736  * during an allocation or free request.  For the former, this is called
1737  * after the objects are obtained from either the bucket or slab layer
1738  * and before they are returned to the caller.  For the latter, this is
1739  * called immediately during free and before placing the objects into
1740  * the bucket or slab layer.
1741  */
1742 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)1743 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1744 {
1745 	mbuf_class_t class = (mbuf_class_t)arg;
1746 	mcache_audit_t *mca;
1747 
1748 	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1749 
1750 	while (list != NULL) {
1751 		lck_mtx_lock(mbuf_mlock);
1752 		mca = mcl_audit_buf2mca(class, list);
1753 
1754 		/* Do the sanity checks */
1755 		if (class == MC_MBUF) {
1756 			mcl_audit_mbuf(mca, list, FALSE, alloc);
1757 			ASSERT(mca->mca_uflags & MB_SCVALID);
1758 		} else {
1759 			mcl_audit_cluster(mca, list, m_maxsize(class),
1760 			    alloc, TRUE);
1761 			ASSERT(!(mca->mca_uflags & MB_SCVALID));
1762 		}
1763 		/* Record this transaction */
1764 		if (mcltrace) {
1765 			mcache_buffer_log(mca, list, m_cache(class), &mb_start);
1766 		}
1767 
1768 		if (alloc) {
1769 			mca->mca_uflags |= MB_INUSE;
1770 		} else {
1771 			mca->mca_uflags &= ~MB_INUSE;
1772 		}
1773 		/* Unpair the object (unconditionally) */
1774 		mca->mca_uptr = NULL;
1775 		lck_mtx_unlock(mbuf_mlock);
1776 
1777 		list = list->obj_next;
1778 	}
1779 }
1780 
1781 /*
1782  * Common notify routine for all caches.  It is called by mcache when
1783  * one or more objects get freed.  We use this indication to trigger
1784  * the wakeup of any sleeping threads so that they can retry their
1785  * allocation requests.
1786  */
1787 static void
mbuf_slab_notify(void * arg,u_int32_t reason)1788 mbuf_slab_notify(void *arg, u_int32_t reason)
1789 {
1790 	mbuf_class_t class = (mbuf_class_t)arg;
1791 	int w;
1792 
1793 	ASSERT(MBUF_CLASS_VALID(class));
1794 
1795 	if (reason != MCN_RETRYALLOC) {
1796 		return;
1797 	}
1798 
1799 	lck_mtx_lock(mbuf_mlock);
1800 	if ((w = mb_waiters) > 0) {
1801 		m_notified(class)++;
1802 		mb_waiters = 0;
1803 	}
1804 	if (w) {
1805 		mbwdog_logger("waking up all threads");
1806 	}
1807 	lck_mtx_unlock(mbuf_mlock);
1808 
1809 	if (w != 0) {
1810 		wakeup(mb_waitchan);
1811 	}
1812 }
1813 
1814 /*
1815  * Obtain object(s) from the composite class's freelist.
1816  */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	/*
	 * Pop up to 'num' already-constructed composite objects off the
	 * class's global freelist, appending each onto the caller's list
	 * via *plist.  Returns the number actually obtained (may be less
	 * than 'num' if the freelist runs dry).  Caller must hold
	 * mbuf_mlock.
	 */
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(need > 0);
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		MRANGE(*list);

		/* A composite head is an mbuf with an attached cluster */
		m = (struct mbuf *)*list;
		sp = slab_get(m);
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

		/* Cluster slab refcount must be within per-page bounds */
		if (class == MC_MBUF_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}

		/* A 16KB cluster spans NSLABSP16KB chained slabs */
		if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * Advance the freelist head; a successor outside the
		 * cluster map indicates freelist corruption.
		 */
		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		/* Append to the caller's chain and move the tail pointer */
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0) {
			break;
		}
	}
	/* Account for how many were removed from the freelist */
	m_infree(class) -= (num - need);

	return num - need;
}
1874 
1875 /*
1876  * Place object(s) back into a composite class's freelist.
1877  */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	/*
	 * Return a chain of composite objects to the class's freelist,
	 * or — when 'purged' — tear each one apart and return the mbuf,
	 * cluster, and ext_ref components to their own slab layers.
	 * Returns the number of objects processed.  Caller must hold
	 * mbuf_mlock.
	 */
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;
	mbuf_class_t cl_class;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Map the composite class to its underlying cluster class */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}

	o = tail = list;

	while ((m = ms = (struct mbuf *)o) != NULL) {
		mcache_obj_t *rfa, *nexto = o->obj_next;

		/* Do the mbuf sanity checks */
		if (mclaudit != NULL) {
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			if (mclverify) {
				mcache_audit_free_verify(mca, m, 0,
				    m_maxsize(MC_MBUF));
			}
			/*
			 * When auditing, the live mbuf is pattern-filled;
			 * read mbuf fields from the saved shadow copy.
			 */
			ms = MCA_SAVED_MBUF_PTR(mca);
		}

		/* Do the cluster sanity checks */
		cl = ms->m_ext.ext_buf;
		clsp = slab_get(cl);
		if (mclverify) {
			size_t size = m_maxsize(cl_class);
			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl), cl, 0, size);
		}
		VERIFY(ms->m_type == MT_FREE);
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
		if (cl_class == MC_CL) {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NCLPG);
		} else {
			VERIFY(clsp->sl_refcnt >= 1 &&
			    clsp->sl_refcnt <= NBCLPG);
		}
		if (cl_class == MC_16KCL) {
			int k;
			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL) {
				mcl_audit_restore_mbuf(m, mca, TRUE);
			}

			/* Reset external-storage reference state */
			MEXT_MINREF(m) = 0;
			MEXT_REF(m) = 0;
			MEXT_PREF(m) = 0;
			MEXT_FLAGS(m) = 0;
			MEXT_PRIV(m) = 0;
			MEXT_PMBUF(m) = NULL;

			/* Detach the ext_ref and batch it for later freeing */
			rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
			m_set_ext(m, NULL, NULL, NULL);
			rfa->obj_next = ref_list;
			ref_list = rfa;

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Save mbuf fields and make auditing happy */
			if (mclaudit != NULL) {
				mcl_audit_mbuf(mca, o, FALSE, FALSE);
			}

			VERIFY(m_total(class) > 0);
			m_total(class)--;

			/* Free the mbuf */
			o->obj_next = NULL;
			slab_free(MC_MBUF, o);

			/* And free the cluster */
			((mcache_obj_t *)cl)->obj_next = NULL;
			if (class == MC_MBUF_CL) {
				slab_free(MC_CL, cl);
			} else if (class == MC_MBUF_BIGCL) {
				slab_free(MC_BIGCL, cl);
			} else {
				slab_free(MC_16KCL, cl);
			}
		}

		++num;
		tail = o;
		o = nexto;
	}

	/*
	 * Not purging: splice the intact chain onto the freelist head.
	 * Purging: the batched ext_ref structures go back to their cache.
	 */
	if (!purged) {
		tail->obj_next = m_cobjlist(class);
		m_cobjlist(class) = list;
		m_infree(class) += num;
	} else if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return num;
}
2011 
2012 /*
2013  * Common allocator for composite objects called by the CPU cache layer
2014  * during an allocation request whenever there is no available element in
2015  * the bucket layer.  It returns one or more composite elements from the
2016  * appropriate global freelist.  If the freelist is empty, it will attempt
2017  * to obtain the rudimentary objects from their caches and construct them
2018  * into composite mbuf + cluster objects.
2019  */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
	/*
	 * Satisfy a composite (mbuf + cluster) allocation: first from the
	 * class freelist, then by constructing new composites from the
	 * rudimentary mbuf/cluster/ext_ref caches.  Returns the number of
	 * objects delivered (may be fewer than 'needed').
	 */
	mbuf_class_t class = (mbuf_class_t)arg;
	mbuf_class_t cl_class = 0;
	unsigned int num = 0, cnum = 0, want = needed;
	mcache_obj_t *ref_list = NULL;
	mcache_obj_t *mp_list = NULL;
	mcache_obj_t *clp_list = NULL;
	mcache_obj_t **list;
	struct ext_ref *rfa;
	struct mbuf *m;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	ASSERT(needed > 0);

	/* There should not be any slab for this class */
	VERIFY(m_slab_cnt(class) == 0 &&
	    m_slablist(class).tqh_first == NULL &&
	    m_slablist(class).tqh_last == NULL);

	lck_mtx_lock(mbuf_mlock);

	/* Try using the freelist first */
	num = cslab_alloc(class, plist, needed);
	list = *plist;
	if (num == needed) {
		m_alloc_cnt(class) += num;
		lck_mtx_unlock(mbuf_mlock);
		return needed;
	}

	lck_mtx_unlock(mbuf_mlock);

	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */
	needed -= num;

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
	wait |= MCR_COMP;
	if (!(wait & MCR_NOSLEEP)) {
		wait |= MCR_FAILOK;
	}

	/* allocate mbufs */
	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
	if (needed == 0) {
		ASSERT(mp_list == NULL);
		goto fail;
	}

	/* allocate clusters */
	if (class == MC_MBUF_CL) {
		cl_class = MC_CL;
	} else if (class == MC_MBUF_BIGCL) {
		cl_class = MC_BIGCL;
	} else {
		VERIFY(class == MC_MBUF_16KCL);
		cl_class = MC_16KCL;
	}
	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
	if (needed == 0) {
		ASSERT(clp_list == NULL);
		goto fail;
	}

	/* allocate ext_ref structures to pair with each mbuf+cluster */
	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
	if (needed == 0) {
		ASSERT(ref_list == NULL);
		goto fail;
	}

	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
		struct mbuf *ms;

		/* Take one element off each of the three lists */
		m = ms = (struct mbuf *)mp_list;
		mp_list = mp_list->obj_next;

		cl = clp_list;
		clp_list = clp_list->obj_next;
		((mcache_obj_t *)cl)->obj_next = NULL;

		rfa = (struct ext_ref *)ref_list;
		ref_list = ref_list->obj_next;
		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;

		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;

			lck_mtx_lock(mbuf_mlock);
			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
			ms = MCA_SAVED_MBUF_PTR(mca);
			cl_mca = mcl_audit_buf2mca(cl_class,
			    (mcache_obj_t *)cl);

			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;

			ASSERT(mca->mca_uflags & MB_SCVALID);
			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
			lck_mtx_unlock(mbuf_mlock);

			/* Technically, they are in the freelist */
			if (mclverify) {
				size_t size;

				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
				    m_maxsize(MC_MBUF));

				if (class == MC_MBUF_CL) {
					size = m_maxsize(MC_CL);
				} else if (class == MC_MBUF_BIGCL) {
					size = m_maxsize(MC_BIGCL);
				} else {
					size = m_maxsize(MC_16KCL);
				}

				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
				    size);
			}
		}

		/* Initialize the mbuf and attach the cluster via the rfa */
		mbuf_init(ms, 0, MT_FREE);
		if (class == MC_MBUF_16KCL) {
			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else if (class == MC_MBUF_BIGCL) {
			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		} else {
			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
		}
		VERIFY(ms->m_flags == M_EXT);
		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));

		/* Append the finished composite to the caller's list */
		*list = (mcache_obj_t *)m;
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;
	}

fail:
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (clp_list != NULL) {
		mcache_free_ext(m_cache(cl_class), clp_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	/* Update class accounting: cnum new composites were constructed */
	lck_mtx_lock(mbuf_mlock);
	if (num > 0 || cnum > 0) {
		m_total(class) += cnum;
		VERIFY(m_total(class) <= m_maxlimit(class));
		m_alloc_cnt(class) += num + cnum;
	}
	if ((num + cnum) < want) {
		m_fail_cnt(class) += (want - (num + cnum));
	}
	lck_mtx_unlock(mbuf_mlock);

	return num + cnum;
}
2214 
2215 /*
2216  * Common de-allocator for composite objects called by the CPU cache
2217  * layer when one or more elements need to be returned to the appropriate
2218  * global freelist.
2219  */
2220 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)2221 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2222 {
2223 	mbuf_class_t class = (mbuf_class_t)arg;
2224 	unsigned int num;
2225 	int w;
2226 
2227 	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228 
2229 	lck_mtx_lock(mbuf_mlock);
2230 
2231 	num = cslab_free(class, list, purged);
2232 	m_free_cnt(class) += num;
2233 
2234 	if ((w = mb_waiters) > 0) {
2235 		mb_waiters = 0;
2236 	}
2237 	if (w) {
2238 		mbwdog_logger("waking up all threads");
2239 	}
2240 
2241 	lck_mtx_unlock(mbuf_mlock);
2242 
2243 	if (w != 0) {
2244 		wakeup(mb_waitchan);
2245 	}
2246 }
2247 
2248 /*
2249  * Common auditor for composite objects called by the CPU cache layer
2250  * during an allocation or free request.  For the former, this is called
2251  * after the objects are obtained from either the bucket or slab layer
2252  * and before they are returned to the caller.  For the latter, this is
2253  * called immediately during free and before placing the objects into
2254  * the bucket or slab layer.
2255  */
2256 static void
mbuf_cslab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)2257 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2258 {
2259 	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2260 	mcache_audit_t *mca;
2261 	struct mbuf *m, *ms;
2262 	mcl_slab_t *clsp, *nsp;
2263 	size_t cl_size;
2264 	void *cl;
2265 
2266 	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2267 	if (class == MC_MBUF_CL) {
2268 		cl_class = MC_CL;
2269 	} else if (class == MC_MBUF_BIGCL) {
2270 		cl_class = MC_BIGCL;
2271 	} else {
2272 		cl_class = MC_16KCL;
2273 	}
2274 	cl_size = m_maxsize(cl_class);
2275 
2276 	while ((m = ms = (struct mbuf *)list) != NULL) {
2277 		lck_mtx_lock(mbuf_mlock);
2278 		/* Do the mbuf sanity checks and record its transaction */
2279 		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2280 		mcl_audit_mbuf(mca, m, TRUE, alloc);
2281 		if (mcltrace) {
2282 			mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2283 		}
2284 
2285 		if (alloc) {
2286 			mca->mca_uflags |= MB_COMP_INUSE;
2287 		} else {
2288 			mca->mca_uflags &= ~MB_COMP_INUSE;
2289 		}
2290 
2291 		/*
2292 		 * Use the shadow mbuf in the audit structure if we are
2293 		 * freeing, since the contents of the actual mbuf has been
2294 		 * pattern-filled by the above call to mcl_audit_mbuf().
2295 		 */
2296 		if (!alloc && mclverify) {
2297 			ms = MCA_SAVED_MBUF_PTR(mca);
2298 		}
2299 
2300 		/* Do the cluster sanity checks and record its transaction */
2301 		cl = ms->m_ext.ext_buf;
2302 		clsp = slab_get(cl);
2303 		VERIFY(ms->m_flags == M_EXT && cl != NULL);
2304 		VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2305 		if (class == MC_MBUF_CL) {
2306 			VERIFY(clsp->sl_refcnt >= 1 &&
2307 			    clsp->sl_refcnt <= NCLPG);
2308 		} else {
2309 			VERIFY(clsp->sl_refcnt >= 1 &&
2310 			    clsp->sl_refcnt <= NBCLPG);
2311 		}
2312 
2313 		if (class == MC_MBUF_16KCL) {
2314 			int k;
2315 			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2316 				nsp = nsp->sl_next;
2317 				/* Next slab must already be present */
2318 				VERIFY(nsp != NULL);
2319 				VERIFY(nsp->sl_refcnt == 1);
2320 			}
2321 		}
2322 
2323 
2324 		mca = mcl_audit_buf2mca(cl_class, cl);
2325 		mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2326 		if (mcltrace) {
2327 			mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2328 		}
2329 
2330 		if (alloc) {
2331 			mca->mca_uflags |= MB_COMP_INUSE;
2332 		} else {
2333 			mca->mca_uflags &= ~MB_COMP_INUSE;
2334 		}
2335 		lck_mtx_unlock(mbuf_mlock);
2336 
2337 		list = list->obj_next;
2338 	}
2339 }
2340 
2341 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)2342 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2343     uint64_t alloc_size, kern_return_t error)
2344 {
2345 	*cnt = *cnt + 1;
2346 	*ts = net_uptime();
2347 	if (size) {
2348 		*size = alloc_size;
2349 	}
2350 	switch (error) {
2351 	case KERN_SUCCESS:
2352 		break;
2353 	case KERN_INVALID_ARGUMENT:
2354 		mb_kmem_stats[0]++;
2355 		break;
2356 	case KERN_INVALID_ADDRESS:
2357 		mb_kmem_stats[1]++;
2358 		break;
2359 	case KERN_RESOURCE_SHORTAGE:
2360 		mb_kmem_stats[2]++;
2361 		break;
2362 	case KERN_NO_SPACE:
2363 		mb_kmem_stats[3]++;
2364 		break;
2365 	case KERN_FAILURE:
2366 		mb_kmem_stats[4]++;
2367 		break;
2368 	default:
2369 		mb_kmem_stats[5]++;
2370 		break;
2371 	}
2372 }
2373 
2374 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)2375 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
2376 {
2377 	vm_offset_t addr = 0;
2378 	kern_return_t kr = KERN_SUCCESS;
2379 
2380 	if (!physContig) {
2381 		kr = kmem_alloc(mbmap, &addr, size,
2382 		    KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2383 	} else {
2384 		kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
2385 		    0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
2386 	}
2387 
2388 	if (kr != KERN_SUCCESS) {
2389 		addr = 0;
2390 	}
2391 	if (err) {
2392 		*err = kr;
2393 	}
2394 
2395 	return addr;
2396 }
2397 
2398 /*
2399  * Allocate some number of mbuf clusters and place on cluster freelist.
2400  */
2401 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)2402 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2403 {
2404 	int i, count = 0;
2405 	vm_size_t size = 0;
2406 	int numpages = 0, large_buffer;
2407 	vm_offset_t page = 0;
2408 	mcache_audit_t *mca_list = NULL;
2409 	mcache_obj_t *con_list = NULL;
2410 	mcl_slab_t *sp;
2411 	mbuf_class_t class;
2412 	kern_return_t error;
2413 
2414 	/* Set if a buffer allocation needs allocation of multiple pages */
2415 	large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2416 	    PAGE_SIZE < M16KCLBYTES);
2417 	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2418 	    bufsize == m_maxsize(MC_16KCL));
2419 
2420 	VERIFY((bufsize == PAGE_SIZE) ||
2421 	    (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2422 
2423 	if (bufsize == m_size(MC_BIGCL)) {
2424 		class = MC_BIGCL;
2425 	} else {
2426 		class = MC_16KCL;
2427 	}
2428 
2429 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2430 
2431 	/*
2432 	 * Multiple threads may attempt to populate the cluster map one
2433 	 * after another.  Since we drop the lock below prior to acquiring
2434 	 * the physical page(s), our view of the cluster map may no longer
2435 	 * be accurate, and we could end up over-committing the pages beyond
2436 	 * the maximum allowed for each class.  To prevent it, this entire
2437 	 * operation (including the page mapping) is serialized.
2438 	 */
2439 	while (mb_clalloc_busy) {
2440 		mb_clalloc_waiters++;
2441 		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2442 		    (PZERO - 1), "m_clalloc", NULL);
2443 		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2444 	}
2445 
2446 	/* We are busy now; tell everyone else to go away */
2447 	mb_clalloc_busy = TRUE;
2448 
2449 	/*
2450 	 * Honor the caller's wish to block or not block.  We have a way
2451 	 * to grow the pool asynchronously using the mbuf worker thread.
2452 	 */
2453 	i = m_howmany(num, bufsize);
2454 	if (i <= 0 || (wait & M_DONTWAIT)) {
2455 		goto out;
2456 	}
2457 
2458 	lck_mtx_unlock(mbuf_mlock);
2459 
2460 	size = round_page(i * bufsize);
2461 	page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2462 
2463 	/*
2464 	 * If we did ask for "n" 16KB physically contiguous chunks
2465 	 * and didn't get them, then please try again without this
2466 	 * restriction.
2467 	 */
2468 	net_update_uptime();
2469 	if (large_buffer && page == 0) {
2470 		m_vm_error_stats(&mb_kmem_contig_failed,
2471 		    &mb_kmem_contig_failed_ts,
2472 		    &mb_kmem_contig_failed_size,
2473 		    size, error);
2474 		page = kmem_mb_alloc(mb_map, size, 0, &error);
2475 	}
2476 
2477 	if (page == 0) {
2478 		m_vm_error_stats(&mb_kmem_failed,
2479 		    &mb_kmem_failed_ts,
2480 		    &mb_kmem_failed_size,
2481 		    size, error);
2482 #if PAGE_SIZE == 4096
2483 		if (bufsize == m_maxsize(MC_BIGCL)) {
2484 #else
2485 		if (bufsize >= m_maxsize(MC_BIGCL)) {
2486 #endif
2487 			/* Try for 1 page if failed */
2488 			size = PAGE_SIZE;
2489 			page = kmem_mb_alloc(mb_map, size, 0, &error);
2490 			if (page == 0) {
2491 				m_vm_error_stats(&mb_kmem_one_failed,
2492 				    &mb_kmem_one_failed_ts,
2493 				    NULL, size, error);
2494 			}
2495 		}
2496 
2497 		if (page == 0) {
2498 			lck_mtx_lock(mbuf_mlock);
2499 			goto out;
2500 		}
2501 	}
2502 
2503 	VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2504 	numpages = size / PAGE_SIZE;
2505 
2506 	/* If auditing is enabled, allocate the audit structures now */
2507 	if (mclaudit != NULL) {
2508 		int needed;
2509 
2510 		/*
2511 		 * Yes, I realize this is a waste of memory for clusters
2512 		 * that never get transformed into mbufs, as we may end
2513 		 * up with NMBPG-1 unused audit structures per cluster.
2514 		 * But doing so tremendously simplifies the allocation
2515 		 * strategy, since at this point we are not holding the
2516 		 * mbuf lock and the caller is okay to be blocked.
2517 		 */
2518 		if (bufsize == PAGE_SIZE) {
2519 			needed = numpages * NMBPG;
2520 
2521 			i = mcache_alloc_ext(mcl_audit_con_cache,
2522 			    &con_list, needed, MCR_SLEEP);
2523 
2524 			VERIFY(con_list != NULL && i == needed);
2525 		} else {
2526 			/*
2527 			 * if multiple 4K pages are being used for a
2528 			 * 16K cluster
2529 			 */
2530 			needed = numpages / NSLABSP16KB;
2531 		}
2532 
2533 		i = mcache_alloc_ext(mcache_audit_cache,
2534 		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2535 
2536 		VERIFY(mca_list != NULL && i == needed);
2537 	}
2538 
2539 	lck_mtx_lock(mbuf_mlock);
2540 
2541 	for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
2542 		ppnum_t offset =
2543 		    ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
2544 		ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2545 
2546 		/*
2547 		 * If there is a mapper the appropriate I/O page is
2548 		 * returned; zero out the page to discard its past
2549 		 * contents to prevent exposing leftover kernel memory.
2550 		 */
2551 		VERIFY(offset < mcl_pages);
2552 		if (mcl_paddr_base != 0) {
2553 			bzero((void *)(uintptr_t) page, PAGE_SIZE);
2554 			new_page = IOMapperInsertPage(mcl_paddr_base,
2555 			    offset, new_page);
2556 		}
2557 		mcl_paddr[offset] = new_page;
2558 
2559 		/* Pattern-fill this fresh page */
2560 		if (mclverify) {
2561 			mcache_set_pattern(MCACHE_FREE_PATTERN,
2562 			    (caddr_t)page, PAGE_SIZE);
2563 		}
2564 		if (bufsize == PAGE_SIZE) {
2565 			mcache_obj_t *buf;
2566 			/* One for the entire page */
2567 			sp = slab_get((void *)page);
2568 			if (mclaudit != NULL) {
2569 				mcl_audit_init((void *)page,
2570 				    &mca_list, &con_list,
2571 				    AUDIT_CONTENTS_SIZE, NMBPG);
2572 			}
2573 			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2574 			slab_init(sp, class, SLF_MAPPED, (void *)page,
2575 			    (void *)page, PAGE_SIZE, 0, 1);
2576 			buf = (mcache_obj_t *)page;
2577 			buf->obj_next = NULL;
2578 
2579 			/* Insert this slab */
2580 			slab_insert(sp, class);
2581 
2582 			/* Update stats now since slab_get drops the lock */
2583 			++m_infree(class);
2584 			++m_total(class);
2585 			VERIFY(m_total(class) <= m_maxlimit(class));
2586 			if (class == MC_BIGCL) {
2587 				mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2588 				    m_infree(MC_MBUF_BIGCL);
2589 				mbstat.m_bigclusters = m_total(MC_BIGCL);
2590 			}
2591 			++count;
2592 		} else if ((bufsize > PAGE_SIZE) &&
2593 		    (i % NSLABSP16KB) == 0) {
2594 			union m16kcluster *m16kcl = (union m16kcluster *)page;
2595 			mcl_slab_t *nsp;
2596 			int k;
2597 
2598 			/* One for the entire 16KB */
2599 			sp = slab_get(m16kcl);
2600 			if (mclaudit != NULL) {
2601 				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2602 			}
2603 
2604 			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2605 			slab_init(sp, MC_16KCL, SLF_MAPPED,
2606 			    m16kcl, m16kcl, bufsize, 0, 1);
2607 			m16kcl->m16kcl_next = NULL;
2608 
2609 			/*
2610 			 * 2nd-Nth page's slab is part of the first one,
2611 			 * where N is NSLABSP16KB.
2612 			 */
2613 			for (k = 1; k < NSLABSP16KB; k++) {
2614 				nsp = slab_get(((union mbigcluster *)page) + k);
2615 				VERIFY(nsp->sl_refcnt == 0 &&
2616 				    nsp->sl_flags == 0);
2617 				slab_init(nsp, MC_16KCL,
2618 				    SLF_MAPPED | SLF_PARTIAL,
2619 				    m16kcl, NULL, 0, 0, 0);
2620 			}
2621 			/* Insert this slab */
2622 			slab_insert(sp, MC_16KCL);
2623 
2624 			/* Update stats now since slab_get drops the lock */
2625 			++m_infree(MC_16KCL);
2626 			++m_total(MC_16KCL);
2627 			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2628 			++count;
2629 		}
2630 	}
2631 	VERIFY(mca_list == NULL && con_list == NULL);
2632 
2633 	/* We're done; let others enter */
2634 	mb_clalloc_busy = FALSE;
2635 	if (mb_clalloc_waiters > 0) {
2636 		mb_clalloc_waiters = 0;
2637 		wakeup(mb_clalloc_waitchan);
2638 	}
2639 
2640 	return count;
2641 out:
2642 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2643 
2644 	mtracelarge_register(size);
2645 
2646 	/* We're done; let others enter */
2647 	mb_clalloc_busy = FALSE;
2648 	if (mb_clalloc_waiters > 0) {
2649 		mb_clalloc_waiters = 0;
2650 		wakeup(mb_clalloc_waitchan);
2651 	}
2652 
2653 	/*
2654 	 * When non-blocking we kick a thread if we have to grow the
2655 	 * pool or if the number of free clusters is less than requested.
2656 	 */
2657 	if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
2658 		mbwdog_logger("waking up the worker thread to to grow %s by %d",
2659 		    m_cname(class), i);
2660 		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
2661 		mbuf_worker_needs_wakeup = FALSE;
2662 	}
2663 	if (class == MC_BIGCL) {
2664 		if (i > 0) {
2665 			/*
2666 			 * Remember total number of 4KB clusters needed
2667 			 * at this time.
2668 			 */
2669 			i += m_total(MC_BIGCL);
2670 			if (i > m_region_expand(MC_BIGCL)) {
2671 				m_region_expand(MC_BIGCL) = i;
2672 			}
2673 		}
2674 		if (m_infree(MC_BIGCL) >= num) {
2675 			return 1;
2676 		}
2677 	} else {
2678 		if (i > 0) {
2679 			/*
2680 			 * Remember total number of 16KB clusters needed
2681 			 * at this time.
2682 			 */
2683 			i += m_total(MC_16KCL);
2684 			if (i > m_region_expand(MC_16KCL)) {
2685 				m_region_expand(MC_16KCL) = i;
2686 			}
2687 		}
2688 		if (m_infree(MC_16KCL) >= num) {
2689 			return 1;
2690 		}
2691 	}
2692 	return 0;
2693 }
2694 
2695 /*
2696  * Populate the global freelist of the corresponding buffer class.
2697  */
2698 static int
2699 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2700 {
2701 	mcache_obj_t *o = NULL;
2702 	int i, numpages = 0, count;
2703 	mbuf_class_t super_class;
2704 
2705 	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2706 	    class == MC_16KCL);
2707 
2708 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2709 
2710 	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
2711 	    PAGE_SIZE == m_maxsize(MC_16KCL));
2712 
2713 	if (m_maxsize(class) >= PAGE_SIZE) {
2714 		return m_clalloc(num, wait, m_maxsize(class)) != 0;
2715 	}
2716 
2717 	/*
2718 	 * The rest of the function will allocate pages and will slice
2719 	 * them up into the right size
2720 	 */
2721 
2722 	numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
2723 
2724 	/* Currently assume that pages are 4K or 16K */
2725 	if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
2726 		super_class = MC_BIGCL;
2727 	} else {
2728 		super_class = MC_16KCL;
2729 	}
2730 
2731 	i = m_clalloc(numpages, wait, m_maxsize(super_class));
2732 
2733 	/* how many objects will we cut the page into? */
2734 	int numobj = PAGE_SIZE / m_maxsize(class);
2735 
2736 	for (count = 0; count < numpages; count++) {
2737 		/* respect totals, minlimit, maxlimit */
2738 		if (m_total(super_class) <= m_minlimit(super_class) ||
2739 		    m_total(class) >= m_maxlimit(class)) {
2740 			break;
2741 		}
2742 
2743 		if ((o = slab_alloc(super_class, wait)) == NULL) {
2744 			break;
2745 		}
2746 
2747 		struct mbuf *m = (struct mbuf *)o;
2748 		union mcluster *c = (union mcluster *)o;
2749 		union mbigcluster *mbc = (union mbigcluster *)o;
2750 		mcl_slab_t *sp = slab_get(o);
2751 		mcache_audit_t *mca = NULL;
2752 
2753 		/*
2754 		 * since one full page will be converted to MC_MBUF or
2755 		 * MC_CL, verify that the reference count will match that
2756 		 * assumption
2757 		 */
2758 		VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
2759 		VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2760 		/*
2761 		 * Make sure that the cluster is unmolested
2762 		 * while in freelist
2763 		 */
2764 		if (mclverify) {
2765 			mca = mcl_audit_buf2mca(super_class,
2766 			    (mcache_obj_t *)o);
2767 			mcache_audit_free_verify(mca,
2768 			    (mcache_obj_t *)o, 0, m_maxsize(super_class));
2769 		}
2770 
2771 		/* Reinitialize it as an mbuf or 2K or 4K slab */
2772 		slab_init(sp, class, sp->sl_flags,
2773 		    sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
2774 
2775 		VERIFY(sp->sl_head == NULL);
2776 
2777 		VERIFY(m_total(super_class) >= 1);
2778 		m_total(super_class)--;
2779 
2780 		if (super_class == MC_BIGCL) {
2781 			mbstat.m_bigclusters = m_total(MC_BIGCL);
2782 		}
2783 
2784 		m_total(class) += numobj;
2785 		VERIFY(m_total(class) <= m_maxlimit(class));
2786 		m_infree(class) += numobj;
2787 
2788 		i = numobj;
2789 		if (class == MC_MBUF) {
2790 			mbstat.m_mbufs = m_total(MC_MBUF);
2791 			mtype_stat_add(MT_FREE, NMBPG);
2792 			while (i--) {
2793 				/*
2794 				 * If auditing is enabled, construct the
2795 				 * shadow mbuf in the audit structure
2796 				 * instead of the actual one.
2797 				 * mbuf_slab_audit() will take care of
2798 				 * restoring the contents after the
2799 				 * integrity check.
2800 				 */
2801 				if (mclaudit != NULL) {
2802 					struct mbuf *ms;
2803 					mca = mcl_audit_buf2mca(MC_MBUF,
2804 					    (mcache_obj_t *)m);
2805 					ms = MCA_SAVED_MBUF_PTR(mca);
2806 					ms->m_type = MT_FREE;
2807 				} else {
2808 					m->m_type = MT_FREE;
2809 				}
2810 				m->m_next = sp->sl_head;
2811 				sp->sl_head = (void *)m++;
2812 			}
2813 		} else if (class == MC_CL) { /* MC_CL */
2814 			mbstat.m_clfree =
2815 			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2816 			mbstat.m_clusters = m_total(MC_CL);
2817 			while (i--) {
2818 				c->mcl_next = sp->sl_head;
2819 				sp->sl_head = (void *)c++;
2820 			}
2821 		} else {
2822 			VERIFY(class == MC_BIGCL);
2823 			mbstat.m_bigclusters = m_total(MC_BIGCL);
2824 			mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2825 			    m_infree(MC_MBUF_BIGCL);
2826 			while (i--) {
2827 				mbc->mbc_next = sp->sl_head;
2828 				sp->sl_head = (void *)mbc++;
2829 			}
2830 		}
2831 
2832 		/* Insert into the mbuf or 2k or 4k slab list */
2833 		slab_insert(sp, class);
2834 
2835 		if ((i = mb_waiters) > 0) {
2836 			mb_waiters = 0;
2837 		}
2838 		if (i != 0) {
2839 			mbwdog_logger("waking up all threads");
2840 			wakeup(mb_waitchan);
2841 		}
2842 	}
2843 	return count != 0;
2844 }
2845 
2846 /*
2847  * For each class, initialize the freelist to hold m_minlimit() objects.
2848  */
2849 static void
2850 freelist_init(mbuf_class_t class)
2851 {
2852 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853 
2854 	VERIFY(class == MC_CL || class == MC_BIGCL);
2855 	VERIFY(m_total(class) == 0);
2856 	VERIFY(m_minlimit(class) > 0);
2857 
2858 	while (m_total(class) < m_minlimit(class)) {
2859 		(void) freelist_populate(class, m_minlimit(class), M_WAIT);
2860 	}
2861 
2862 	VERIFY(m_total(class) >= m_minlimit(class));
2863 }
2864 
2865 /*
2866  * (Inaccurately) check if it might be worth a trip back to the
2867  * mcache layer due the availability of objects there.  We'll
2868  * end up back here if there's nothing up there.
2869  */
2870 static boolean_t
2871 mbuf_cached_above(mbuf_class_t class, int wait)
2872 {
2873 	switch (class) {
2874 	case MC_MBUF:
2875 		if (wait & MCR_COMP) {
2876 			return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2877 			       !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
2878 		}
2879 		break;
2880 
2881 	case MC_CL:
2882 		if (wait & MCR_COMP) {
2883 			return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
2884 		}
2885 		break;
2886 
2887 	case MC_BIGCL:
2888 		if (wait & MCR_COMP) {
2889 			return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
2890 		}
2891 		break;
2892 
2893 	case MC_16KCL:
2894 		if (wait & MCR_COMP) {
2895 			return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
2896 		}
2897 		break;
2898 
2899 	case MC_MBUF_CL:
2900 	case MC_MBUF_BIGCL:
2901 	case MC_MBUF_16KCL:
2902 		break;
2903 
2904 	default:
2905 		VERIFY(0);
2906 		/* NOTREACHED */
2907 	}
2908 
2909 	return !mcache_bkt_isempty(m_cache(class));
2910 }
2911 
2912 /*
2913  * If possible, convert constructed objects to raw ones.
2914  */
2915 static boolean_t
2916 mbuf_steal(mbuf_class_t class, unsigned int num)
2917 {
2918 	mcache_obj_t *top = NULL;
2919 	mcache_obj_t **list = &top;
2920 	unsigned int tot = 0;
2921 
2922 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2923 
2924 	switch (class) {
2925 	case MC_MBUF:
2926 	case MC_CL:
2927 	case MC_BIGCL:
2928 	case MC_16KCL:
2929 		return FALSE;
2930 
2931 	case MC_MBUF_CL:
2932 	case MC_MBUF_BIGCL:
2933 	case MC_MBUF_16KCL:
2934 		/* Get the required number of constructed objects if possible */
2935 		if (m_infree(class) > m_minlimit(class)) {
2936 			tot = cslab_alloc(class, &list,
2937 			    MIN(num, m_infree(class)));
2938 		}
2939 
2940 		/* And destroy them to get back the raw objects */
2941 		if (top != NULL) {
2942 			(void) cslab_free(class, top, 1);
2943 		}
2944 		break;
2945 
2946 	default:
2947 		VERIFY(0);
2948 		/* NOTREACHED */
2949 	}
2950 
2951 	return tot == num;
2952 }
2953 
/*
 * Reclaim memory for 'class' by stealing raw objects from the freelists
 * of related classes and, failing that, purging the per-CPU mcache
 * layers of those classes.  'comp' indicates the caller wants composite
 * (mbuf + cluster) objects, in which case the matching composite class
 * is not marked for purging.  Called and returns with mbuf_mlock held;
 * the lock is dropped around the drain/purge/reap calls.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < MC_MAX; m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < MC_MAX; m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				/* Re-take the lock only to update statistics */
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	lck_mtx_lock(mbuf_mlock);
}
3053 
3054 struct mbuf *
3055 m_get_common(int wait, short type, int hdr)
3056 {
3057 	struct mbuf *m;
3058 
3059 	int mcflags = MSLEEPF(wait);
3060 
3061 	/* Is this due to a non-blocking retry?  If so, then try harder */
3062 	if (mcflags & MCR_NOSLEEP) {
3063 		mcflags |= MCR_TRYHARD;
3064 	}
3065 
3066 	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3067 	if (m != NULL) {
3068 		mbuf_init(m, hdr, type);
3069 		mtype_stat_inc(type);
3070 		mtype_stat_dec(MT_FREE);
3071 	}
3072 	return m;
3073 }
3074 
3075 /*
3076  * Space allocation routines; these are also available as macros
3077  * for critical paths.
3078  */
3079 #define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
3080 
/*
 * Free a single mbuf and return its m_next successor (if any).
 * Releases packet-header tags, drops a reference on any external
 * cluster, and returns the mbuf -- or the composite mbuf + cluster
 * pair -- to the appropriate cache.  Panics on double-free.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* Paired mbufs may be fully handled by m_free_paired() */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/*
			 * Last reference to a non-composite cluster: free the
			 * buffer via its class cache or custom free routine.
			 */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Reset the mbuf for reuse as a composite object */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			return n;
		}
	}

	/* Plain mbuf (or mbuf whose cluster was released above) */
	mtype_stat_dec(m->m_type);
	mtype_stat_inc(MT_FREE);

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return n;
}
3168 
/*
 * Attach a caller-supplied external buffer (with its free routine and
 * argument) to an mbuf.  If 'm' is NULL a new packet header mbuf is
 * allocated; an existing cluster on 'm' is released first.  When 'pair'
 * is set, the mbuf and cluster are bound as a paired unit.  Returns the
 * mbuf, or NULL on allocation failure (freeing 'm' only when the
 * ext_ref allocation fails).
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster.  Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference to a non-composite cluster */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/* Allocate a reference structure unless one was recycled above */
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		mext_init(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		/* Paired: the mbuf itself serves as the free argument */
		mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
3267 
3268 /*
3269  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3270  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3271  */
3272 struct mbuf *
3273 m_getcl(int wait, int type, int flags)
3274 {
3275 	struct mbuf *m = NULL;
3276 	int hdr = (flags & M_PKTHDR);
3277 
3278 	int mcflags = MSLEEPF(wait);
3279 
3280 	/* Is this due to a non-blocking retry?  If so, then try harder */
3281 	if (mcflags & MCR_NOSLEEP) {
3282 		mcflags |= MCR_TRYHARD;
3283 	}
3284 
3285 	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3286 	if (m != NULL) {
3287 		u_int16_t flag;
3288 		struct ext_ref *rfa;
3289 		void *cl;
3290 
3291 		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3292 		cl = m->m_ext.ext_buf;
3293 		rfa = m_get_rfa(m);
3294 
3295 		ASSERT(cl != NULL && rfa != NULL);
3296 		VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3297 
3298 		flag = MEXT_FLAGS(m);
3299 
3300 		mbuf_init(m, hdr, type);
3301 		MBUF_CL_INIT(m, cl, rfa, 1, flag);
3302 
3303 		mtype_stat_inc(type);
3304 		mtype_stat_dec(MT_FREE);
3305 	}
3306 	return m;
3307 }
3308 
3309 /* m_mclget() add an mbuf cluster to a normal mbuf */
3310 struct mbuf *
3311 m_mclget(struct mbuf *m, int wait)
3312 {
3313 	struct ext_ref *rfa = NULL;
3314 
3315 	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3316 		return m;
3317 	}
3318 	m->m_ext.ext_buf = m_mclalloc(wait);
3319 	if (m->m_ext.ext_buf != NULL) {
3320 		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3321 	} else {
3322 		mcache_free(ref_cache, rfa);
3323 	}
3324 
3325 	return m;
3326 }
3327 
3328 /* Allocate an mbuf cluster */
3329 caddr_t
3330 m_mclalloc(int wait)
3331 {
3332 	int mcflags = MSLEEPF(wait);
3333 
3334 	/* Is this due to a non-blocking retry?  If so, then try harder */
3335 	if (mcflags & MCR_NOSLEEP) {
3336 		mcflags |= MCR_TRYHARD;
3337 	}
3338 
3339 	return mcache_alloc(m_cache(MC_CL), mcflags);
3340 }
3341 
/* Free an mbuf cluster back to the 2KB cluster cache */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
3348 
3349 __private_extern__ caddr_t
3350 m_bigalloc(int wait)
3351 {
3352 	int mcflags = MSLEEPF(wait);
3353 
3354 	/* Is this due to a non-blocking retry?  If so, then try harder */
3355 	if (mcflags & MCR_NOSLEEP) {
3356 		mcflags |= MCR_TRYHARD;
3357 	}
3358 
3359 	return mcache_alloc(m_cache(MC_BIGCL), mcflags);
3360 }
3361 
/* Free a 4KB mbuf cluster; size/arg are unused for cache-managed buffers */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
3367 
3368 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3369 __private_extern__ struct mbuf *
3370 m_mbigget(struct mbuf *m, int wait)
3371 {
3372 	struct ext_ref *rfa = NULL;
3373 
3374 	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3375 		return m;
3376 	}
3377 	m->m_ext.ext_buf = m_bigalloc(wait);
3378 	if (m->m_ext.ext_buf != NULL) {
3379 		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3380 	} else {
3381 		mcache_free(ref_cache, rfa);
3382 	}
3383 	return m;
3384 }
3385 
3386 __private_extern__ caddr_t
3387 m_16kalloc(int wait)
3388 {
3389 	int mcflags = MSLEEPF(wait);
3390 
3391 	/* Is this due to a non-blocking retry?  If so, then try harder */
3392 	if (mcflags & MCR_NOSLEEP) {
3393 		mcflags |= MCR_TRYHARD;
3394 	}
3395 
3396 	return mcache_alloc(m_cache(MC_16KCL), mcflags);
3397 }
3398 
/* Free a 16KB mbuf cluster; size/arg are unused for cache-managed buffers */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
3404 
3405 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3406 __private_extern__ struct mbuf *
3407 m_m16kget(struct mbuf *m, int wait)
3408 {
3409 	struct ext_ref *rfa = NULL;
3410 
3411 	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3412 		return m;
3413 	}
3414 	m->m_ext.ext_buf =  m_16kalloc(wait);
3415 	if (m->m_ext.ext_buf != NULL) {
3416 		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3417 	} else {
3418 		mcache_free(ref_cache, rfa);
3419 	}
3420 
3421 	return m;
3422 }
3423 
3424 /*
3425  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3426  * if wantall is not set, return whatever number were available.  Set up the
3427  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3428  * are chained on the m_nextpkt field.  Any packets requested beyond this
3429  * are chained onto the last packet header's m_next field.  The size of
3430  * the cluster is controlled by the parameter bufsize.
3431  */
3432 __private_extern__ struct mbuf *
3433 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3434     int wait, int wantall, size_t bufsize)
3435 {
3436 	struct mbuf *m = NULL;
3437 	struct mbuf **np, *top;
3438 	unsigned int pnum, needed = *num_needed;
3439 	mcache_obj_t *mp_list = NULL;
3440 	int mcflags = MSLEEPF(wait);
3441 	mcache_t *cp;
3442 	u_int16_t flag;
3443 	struct ext_ref *rfa;
3444 	void *cl;
3445 
3446 	ASSERT(bufsize == m_maxsize(MC_CL) ||
3447 	    bufsize == m_maxsize(MC_BIGCL) ||
3448 	    bufsize == m_maxsize(MC_16KCL));
3449 
3450 	top = NULL;
3451 	np = &top;
3452 	pnum = 0;
3453 
3454 	/*
3455 	 * The caller doesn't want all the requested buffers; only some.
3456 	 * Try hard to get what we can, but don't block.  This effectively
3457 	 * overrides MCR_SLEEP, since this thread will not go to sleep
3458 	 * if we can't get all the buffers.
3459 	 */
3460 	if (!wantall || (mcflags & MCR_NOSLEEP)) {
3461 		mcflags |= MCR_TRYHARD;
3462 	}
3463 
3464 	/* Allocate the composite mbuf + cluster elements from the cache */
3465 	if (bufsize == m_maxsize(MC_CL)) {
3466 		cp = m_cache(MC_MBUF_CL);
3467 	} else if (bufsize == m_maxsize(MC_BIGCL)) {
3468 		cp = m_cache(MC_MBUF_BIGCL);
3469 	} else {
3470 		cp = m_cache(MC_MBUF_16KCL);
3471 	}
3472 	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3473 
3474 	for (pnum = 0; pnum < needed; pnum++) {
3475 		m = (struct mbuf *)mp_list;
3476 		mp_list = mp_list->obj_next;
3477 
3478 		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3479 		cl = m->m_ext.ext_buf;
3480 		rfa = m_get_rfa(m);
3481 
3482 		ASSERT(cl != NULL && rfa != NULL);
3483 		VERIFY(MBUF_IS_COMPOSITE(m));
3484 
3485 		flag = MEXT_FLAGS(m);
3486 
3487 		mbuf_init(m, num_with_pkthdrs, MT_DATA);
3488 		if (bufsize == m_maxsize(MC_16KCL)) {
3489 			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3490 		} else if (bufsize == m_maxsize(MC_BIGCL)) {
3491 			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3492 		} else {
3493 			MBUF_CL_INIT(m, cl, rfa, 1, flag);
3494 		}
3495 
3496 		if (num_with_pkthdrs > 0) {
3497 			--num_with_pkthdrs;
3498 		}
3499 
3500 		*np = m;
3501 		if (num_with_pkthdrs > 0) {
3502 			np = &m->m_nextpkt;
3503 		} else {
3504 			np = &m->m_next;
3505 		}
3506 	}
3507 	ASSERT(pnum != *num_needed || mp_list == NULL);
3508 	if (mp_list != NULL) {
3509 		mcache_free_ext(cp, mp_list);
3510 	}
3511 	if (pnum > 0) {
3512 		mtype_stat_add(MT_DATA, pnum);
3513 		mtype_stat_sub(MT_FREE, pnum);
3514 	}
3515 
3516 	if (wantall && (pnum != *num_needed)) {
3517 		if (top != NULL) {
3518 			m_freem_list(top);
3519 		}
3520 		return NULL;
3521 	}
3522 
3523 	if (pnum > *num_needed) {
3524 		printf("%s: File a radar related to <rdar://10146739>. \
3525 			needed = %u, pnum = %u, num_needed = %u \n",
3526 		    __func__, needed, pnum, *num_needed);
3527 	}
3528 	*num_needed = pnum;
3529 
3530 	return top;
3531 }
3532 
3533 /*
3534  * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
3535  * wantall is not set, return whatever number were available.  The size of
3536  * each mbuf in the list is controlled by the parameter packetlen.  Each
3537  * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
3538  * in the chain is called a segment.  If maxsegments is not null and the
3539  * value pointed to is not null, this specify the maximum number of segments
3540  * for a chain of mbufs.  If maxsegments is zero or the value pointed to
3541  * is zero the caller does not have any restriction on the number of segments.
3542  * The actual  number of segments of a mbuf chain is return in the value
3543  * pointed to by maxsegments.
3544  */
3545 __private_extern__ struct mbuf *
3546 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3547     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3548 {
3549 	struct mbuf **np, *top, *first = NULL;
3550 	size_t bufsize, r_bufsize;
3551 	unsigned int num = 0;
3552 	unsigned int nsegs = 0;
3553 	unsigned int needed = 0, resid;
3554 	int mcflags = MSLEEPF(wait);
3555 	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3556 	mcache_t *cp = NULL, *rcp = NULL;
3557 
3558 	if (*numlist == 0) {
3559 		os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
3560 		return NULL;
3561 	}
3562 
3563 	top = NULL;
3564 	np = &top;
3565 
3566 	if (wantsize == 0) {
3567 		if (packetlen <= MINCLSIZE) {
3568 			bufsize = packetlen;
3569 		} else if (packetlen > m_maxsize(MC_CL)) {
3570 			/* Use 4KB if jumbo cluster pool isn't available */
3571 			if (packetlen <= m_maxsize(MC_BIGCL)) {
3572 				bufsize = m_maxsize(MC_BIGCL);
3573 			} else {
3574 				bufsize = m_maxsize(MC_16KCL);
3575 			}
3576 		} else {
3577 			bufsize = m_maxsize(MC_CL);
3578 		}
3579 	} else if (wantsize == m_maxsize(MC_CL) ||
3580 	    wantsize == m_maxsize(MC_BIGCL) ||
3581 	    wantsize == m_maxsize(MC_16KCL)) {
3582 		bufsize = wantsize;
3583 	} else {
3584 		*numlist = 0;
3585 		os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
3586 		return NULL;
3587 	}
3588 
3589 	if (bufsize <= MHLEN) {
3590 		nsegs = 1;
3591 	} else if (bufsize <= MINCLSIZE) {
3592 		if (maxsegments != NULL && *maxsegments == 1) {
3593 			bufsize = m_maxsize(MC_CL);
3594 			nsegs = 1;
3595 		} else {
3596 			nsegs = 2;
3597 		}
3598 	} else if (bufsize == m_maxsize(MC_16KCL)) {
3599 		nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
3600 	} else if (bufsize == m_maxsize(MC_BIGCL)) {
3601 		nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
3602 	} else {
3603 		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3604 	}
3605 	if (maxsegments != NULL) {
3606 		if (*maxsegments && nsegs > *maxsegments) {
3607 			*maxsegments = nsegs;
3608 			*numlist = 0;
3609 			os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
3610 			return NULL;
3611 		}
3612 		*maxsegments = nsegs;
3613 	}
3614 
3615 	/*
3616 	 * The caller doesn't want all the requested buffers; only some.
3617 	 * Try hard to get what we can, but don't block.  This effectively
3618 	 * overrides MCR_SLEEP, since this thread will not go to sleep
3619 	 * if we can't get all the buffers.
3620 	 */
3621 	if (!wantall || (mcflags & MCR_NOSLEEP)) {
3622 		mcflags |= MCR_TRYHARD;
3623 	}
3624 
3625 	/*
3626 	 * Simple case where all elements in the lists/chains are mbufs.
3627 	 * Unless bufsize is greater than MHLEN, each segment chain is made
3628 	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3629 	 * of 2 mbufs; the second one is used for the residual data, i.e.
3630 	 * the remaining data that cannot fit into the first mbuf.
3631 	 */
3632 	if (bufsize <= MINCLSIZE) {
3633 		/* Allocate the elements in one shot from the mbuf cache */
3634 		ASSERT(bufsize <= MHLEN || nsegs == 2);
3635 		cp = m_cache(MC_MBUF);
3636 		needed = mcache_alloc_ext(cp, &mp_list,
3637 		    (*numlist) * nsegs, mcflags);
3638 
3639 		/*
3640 		 * The number of elements must be even if we are to use an
3641 		 * mbuf (instead of a cluster) to store the residual data.
3642 		 * If we couldn't allocate the requested number of mbufs,
3643 		 * trim the number down (if it's odd) in order to avoid
3644 		 * creating a partial segment chain.
3645 		 */
3646 		if (bufsize > MHLEN && (needed & 0x1)) {
3647 			needed--;
3648 		}
3649 
3650 		while (num < needed) {
3651 			struct mbuf *m = NULL;
3652 
3653 			m = (struct mbuf *)mp_list;
3654 			mp_list = mp_list->obj_next;
3655 			ASSERT(m != NULL);
3656 
3657 			mbuf_init(m, 1, MT_DATA);
3658 			num++;
3659 			if (bufsize > MHLEN) {
3660 				/* A second mbuf for this segment chain */
3661 				m->m_next = (struct mbuf *)mp_list;
3662 				mp_list = mp_list->obj_next;
3663 
3664 				ASSERT(m->m_next != NULL);
3665 
3666 				mbuf_init(m->m_next, 0, MT_DATA);
3667 				num++;
3668 			}
3669 			*np = m;
3670 			np = &m->m_nextpkt;
3671 		}
3672 		ASSERT(num != *numlist || mp_list == NULL);
3673 
3674 		if (num > 0) {
3675 			mtype_stat_add(MT_DATA, num);
3676 			mtype_stat_sub(MT_FREE, num);
3677 		}
3678 		num /= nsegs;
3679 
3680 		/* We've got them all; return to caller */
3681 		if (num == *numlist) {
3682 			return top;
3683 		}
3684 
3685 		goto fail;
3686 	}
3687 
3688 	/*
3689 	 * Complex cases where elements are made up of one or more composite
3690 	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
3691 	 * be illustrated as follows:
3692 	 *
3693 	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3694 	 *
3695 	 * Every composite mbuf + cluster element comes from the intermediate
3696 	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
3697 	 * the last composite element will come from the MC_MBUF_CL cache,
3698 	 * unless the residual data is larger than 2KB where we use the
3699 	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
3700 	 * data is defined as extra data beyond the first element that cannot
3701 	 * fit into the previous element, i.e. there is no residual data if
3702 	 * the chain only has 1 segment.
3703 	 */
3704 	r_bufsize = bufsize;
3705 	resid = packetlen > bufsize ? packetlen % bufsize : 0;
3706 	if (resid > 0) {
3707 		/* There is residual data; figure out the cluster size */
3708 		if (wantsize == 0 && packetlen > MINCLSIZE) {
3709 			/*
3710 			 * Caller didn't request that all of the segments
3711 			 * in the chain use the same cluster size; use the
3712 			 * smaller of the cluster sizes.
3713 			 */
3714 			if (resid > m_maxsize(MC_BIGCL)) {
3715 				r_bufsize = m_maxsize(MC_16KCL);
3716 			} else if (resid > m_maxsize(MC_CL)) {
3717 				r_bufsize = m_maxsize(MC_BIGCL);
3718 			} else {
3719 				r_bufsize = m_maxsize(MC_CL);
3720 			}
3721 		} else {
3722 			/* Use the same cluster size as the other segments */
3723 			resid = 0;
3724 		}
3725 	}
3726 
3727 	needed = *numlist;
3728 	if (resid > 0) {
3729 		/*
3730 		 * Attempt to allocate composite mbuf + cluster elements for
3731 		 * the residual data in each chain; record the number of such
3732 		 * elements that can be allocated so that we know how many
3733 		 * segment chains we can afford to create.
3734 		 */
3735 		if (r_bufsize <= m_maxsize(MC_CL)) {
3736 			rcp = m_cache(MC_MBUF_CL);
3737 		} else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
3738 			rcp = m_cache(MC_MBUF_BIGCL);
3739 		} else {
3740 			rcp = m_cache(MC_MBUF_16KCL);
3741 		}
3742 		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3743 		if (needed == 0) {
3744 			goto fail;
3745 		}
3746 
3747 		/* This is temporarily reduced for calculation */
3748 		ASSERT(nsegs > 1);
3749 		nsegs--;
3750 	}
3751 
3752 	/*
3753 	 * Attempt to allocate the rest of the composite mbuf + cluster
3754 	 * elements for the number of segment chains that we need.
3755 	 */
3756 	if (bufsize <= m_maxsize(MC_CL)) {
3757 		cp = m_cache(MC_MBUF_CL);
3758 	} else if (bufsize <= m_maxsize(MC_BIGCL)) {
3759 		cp = m_cache(MC_MBUF_BIGCL);
3760 	} else {
3761 		cp = m_cache(MC_MBUF_16KCL);
3762 	}
3763 	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3764 
3765 	/* Round it down to avoid creating a partial segment chain */
3766 	needed = (needed / nsegs) * nsegs;
3767 	if (needed == 0) {
3768 		goto fail;
3769 	}
3770 
3771 	if (resid > 0) {
3772 		/*
3773 		 * We're about to construct the chain(s); take into account
3774 		 * the number of segments we have created above to hold the
3775 		 * residual data for each chain, as well as restore the
3776 		 * original count of segments per chain.
3777 		 */
3778 		ASSERT(nsegs > 0);
3779 		needed += needed / nsegs;
3780 		nsegs++;
3781 	}
3782 
3783 	for (;;) {
3784 		struct mbuf *m = NULL;
3785 		u_int16_t flag;
3786 		struct ext_ref *rfa;
3787 		void *cl;
3788 		int pkthdr;
3789 		m_ext_free_func_t m_free_func;
3790 
3791 		++num;
3792 
3793 		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3794 			m = (struct mbuf *)mp_list;
3795 			mp_list = mp_list->obj_next;
3796 		} else {
3797 			m = (struct mbuf *)rmp_list;
3798 			rmp_list = rmp_list->obj_next;
3799 		}
3800 		m_free_func = m_get_ext_free(m);
3801 		ASSERT(m != NULL);
3802 		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3803 		VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
3804 		    m_free_func == m_16kfree);
3805 
3806 		cl = m->m_ext.ext_buf;
3807 		rfa = m_get_rfa(m);
3808 
3809 		ASSERT(cl != NULL && rfa != NULL);
3810 		VERIFY(MBUF_IS_COMPOSITE(m));
3811 
3812 		flag = MEXT_FLAGS(m);
3813 
3814 		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3815 		if (pkthdr) {
3816 			first = m;
3817 		}
3818 		mbuf_init(m, pkthdr, MT_DATA);
3819 		if (m_free_func == m_16kfree) {
3820 			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3821 		} else if (m_free_func == m_bigfree) {
3822 			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3823 		} else {
3824 			MBUF_CL_INIT(m, cl, rfa, 1, flag);
3825 		}
3826 
3827 		*np = m;
3828 		if ((num % nsegs) == 0) {
3829 			np = &first->m_nextpkt;
3830 		} else {
3831 			np = &m->m_next;
3832 		}
3833 
3834 		if (num == needed) {
3835 			break;
3836 		}
3837 	}
3838 
3839 	if (num > 0) {
3840 		mtype_stat_add(MT_DATA, num);
3841 		mtype_stat_sub(MT_FREE, num);
3842 	}
3843 
3844 	num /= nsegs;
3845 
3846 	/* We've got them all; return to caller */
3847 	if (num == *numlist) {
3848 		ASSERT(mp_list == NULL && rmp_list == NULL);
3849 		return top;
3850 	}
3851 
3852 fail:
3853 	/* Free up what's left of the above */
3854 	if (mp_list != NULL) {
3855 		mcache_free_ext(cp, mp_list);
3856 	}
3857 	if (rmp_list != NULL) {
3858 		mcache_free_ext(rcp, rmp_list);
3859 	}
3860 	if (wantall && top != NULL) {
3861 		m_freem_list(top);
3862 		*numlist = 0;
3863 		return NULL;
3864 	}
3865 	*numlist = num;
3866 	return top;
3867 }
3868 
3869 /*
3870  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
3871  * for mbufs packets freed.  Used by the drivers.
3872  */
int
m_freem_list(struct mbuf *m)
{
	struct mbuf *nextpkt;
	/*
	 * Objects released below are staged on these local lists and
	 * returned to their caches in bulk after the walk, so that each
	 * cache is entered at most once per call.
	 */
	mcache_obj_t *mp_list = NULL;		/* plain mbufs -> MC_MBUF */
	mcache_obj_t *mcl_list = NULL;		/* bare 2K clusters -> MC_CL */
	mcache_obj_t *mbc_list = NULL;		/* bare 4K clusters -> MC_BIGCL */
	mcache_obj_t *m16k_list = NULL;		/* bare 16K clusters -> MC_16KCL */
	mcache_obj_t *m_mcl_list = NULL;	/* composite mbuf+2K -> MC_MBUF_CL */
	mcache_obj_t *m_mbc_list = NULL;	/* composite mbuf+4K -> MC_MBUF_BIGCL */
	mcache_obj_t *m_m16k_list = NULL;	/* composite mbuf+16K -> MC_MBUF_16KCL */
	mcache_obj_t *ref_list = NULL;		/* ext_ref structures -> ref_cache */
	int pktcount = 0;
	/* Deferred mbuf-type statistics deltas, applied once at the end. */
	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

	/* Outer loop: one iteration per packet (m_nextpkt chain). */
	while (m != NULL) {
		pktcount++;

		nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/* Inner loop: free every mbuf of this packet (m_next chain). */
		while (m != NULL) {
			struct mbuf *next = m->m_next;
			mcache_obj_t *o, *rfa;
			if (m->m_type == MT_FREE) {
				panic("m_free: freeing an already freed mbuf");
			}

			if (m->m_flags & M_PKTHDR) {
				/* Free the aux data and tags if there is any */
				m_tag_delete_chain(m);
				m_do_tx_compl_callback(m, NULL);
			}

			/* No external storage: just a plain mbuf. */
			if (!(m->m_flags & M_EXT)) {
				mt_free++;
				goto simple_free;
			}

			/*
			 * Paired mbufs have their own release protocol; if
			 * m_free_paired() consumed this one, move on.
			 */
			if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
				m = next;
				continue;
			}

			mt_free++;

			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
			/*
			 * Make sure that we don't touch any ext_ref
			 * member after we decrement the reference count
			 * since that may lead to use-after-free
			 * when we do not hold the last reference.
			 */
			const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
			const m_ext_free_func_t m_free_func = m_get_ext_free(m);
			const uint16_t minref = MEXT_MINREF(m);
			const uint16_t refcnt = m_decref(m);
			if (refcnt == minref && !composite) {
				/*
				 * Last reference to a non-composite cluster:
				 * stage the cluster and its ext_ref for bulk
				 * return, then fall through to free the mbuf
				 * itself via simple_free.
				 */
				if (m_free_func == NULL) {
					o->obj_next = mcl_list;
					mcl_list = o;
				} else if (m_free_func == m_bigfree) {
					o->obj_next = mbc_list;
					mbc_list = o;
				} else if (m_free_func == m_16kfree) {
					o->obj_next = m16k_list;
					m16k_list = o;
				} else {
					/* Caller-supplied free routine: call it now. */
					(*(m_free_func))((caddr_t)o,
					    m->m_ext.ext_size,
					    m_get_ext_arg(m));
				}
				rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
				rfa->obj_next = ref_list;
				ref_list = rfa;
				m_set_ext(m, NULL, NULL, NULL);
			} else if (refcnt == minref && composite) {
				VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
				/*
				 * Amortize the costs of atomic operations
				 * by doing them at the end, if possible.
				 */
				if (m->m_type == MT_DATA) {
					mt_data++;
				} else if (m->m_type == MT_HEADER) {
					mt_header++;
				} else if (m->m_type == MT_SONAME) {
					mt_soname++;
				} else if (m->m_type == MT_TAG) {
					mt_tag++;
				} else {
					mtype_stat_dec(m->m_type);
				}

				/* Reset the mbuf to its cached (free) state. */
				m->m_type = MT_FREE;
				m->m_flags = M_EXT;
				m->m_len = 0;
				m->m_next = m->m_nextpkt = NULL;

				/*
				 * MEXT_FLAGS is safe to access here
				 * since we are now sure that we held
				 * the last reference to ext_ref.
				 */
				MEXT_FLAGS(m) &= ~EXTF_READONLY;

				/* "Free" into the intermediate cache */
				o = (mcache_obj_t *)m;
				if (m_free_func == NULL) {
					o->obj_next = m_mcl_list;
					m_mcl_list = o;
				} else if (m_free_func == m_bigfree) {
					o->obj_next = m_mbc_list;
					m_mbc_list = o;
				} else {
					VERIFY(m_free_func == m_16kfree);
					o->obj_next = m_m16k_list;
					m_m16k_list = o;
				}
				m = next;
				continue;
			}
simple_free:
			/*
			 * Amortize the costs of atomic operations
			 * by doing them at the end, if possible.
			 */
			if (m->m_type == MT_DATA) {
				mt_data++;
			} else if (m->m_type == MT_HEADER) {
				mt_header++;
			} else if (m->m_type == MT_SONAME) {
				mt_soname++;
			} else if (m->m_type == MT_TAG) {
				mt_tag++;
			} else if (m->m_type != MT_FREE) {
				mtype_stat_dec(m->m_type);
			}

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Stage the bare mbuf for bulk return to MC_MBUF. */
			((mcache_obj_t *)m)->obj_next = mp_list;
			mp_list = (mcache_obj_t *)m;

			m = next;
		}

		m = nextpkt;
	}

	/* Apply the deferred per-type statistics in one shot. */
	if (mt_free > 0) {
		mtype_stat_add(MT_FREE, mt_free);
	}
	if (mt_data > 0) {
		mtype_stat_sub(MT_DATA, mt_data);
	}
	if (mt_header > 0) {
		mtype_stat_sub(MT_HEADER, mt_header);
	}
	if (mt_soname > 0) {
		mtype_stat_sub(MT_SONAME, mt_soname);
	}
	if (mt_tag > 0) {
		mtype_stat_sub(MT_TAG, mt_tag);
	}
	/* Hand each staged list back to its owning cache in bulk. */
	if (mp_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	}
	if (mcl_list != NULL) {
		mcache_free_ext(m_cache(MC_CL), mcl_list);
	}
	if (mbc_list != NULL) {
		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	}
	if (m16k_list != NULL) {
		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	}
	if (m_mcl_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	}
	if (m_mbc_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	}
	if (m_m16k_list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	}
	if (ref_list != NULL) {
		mcache_free_ext(ref_cache, ref_list);
	}

	return pktcount;
}
4067 
4068 /*
4069  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4070  * within this routine also.
4071  *
4072  * The last mbuf and offset accessed are passed in and adjusted on return to
4073  * avoid having to iterate over the entire mbuf chain each time.
4074  */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
	int mcflags = MSLEEPF(wait);
	mcache_obj_t *list = NULL;
	int copyhdr = 0;
	int type = 0;
	int needed = 0;

	/* Copy the packet header only when copying from the very front. */
	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/*
	 * Resume from the cached last-mbuf/offset position when it does
	 * not lie past the requested starting offset, to avoid rewalking
	 * the chain from the beginning on every call.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Advance to the mbuf containing the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}

	/*
	 * Pre-count how many mbuf headers the copy will need; one extra
	 * is allocated (and accounted for in the stats below).
	 */
	n = m;
	while (len > 0) {
		needed++;
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;
	len = len0;

	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	/* Grab all the headers up front; bail unless we got every one. */
	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}

	needed = 0;
	while (len > 0) {
		n = (struct mbuf *)list;
		list = list->obj_next;
		ASSERT(n != NULL && m != NULL);

		/* The first mbuf is initialized as the packet header. */
		type = (top == NULL) ? MT_HEADER : m->m_type;
		mbuf_init(n, (top == NULL), type);

		if (top == NULL) {
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			/* Move or duplicate the pkthdr per the caller's mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the external buffer instead of copying data. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Done; remember where we stopped for the next call. */
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	/* Account for the header plus the data mbufs we consumed. */
	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(list == NULL);

	return top;

nospace:
	/* Undo everything allocated so far. */
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
	if (top != NULL) {
		m_freem(top);
	}
	return NULL;
}
4205 
4206 #ifndef MBUF_GROWTH_NORMAL_THRESH
4207 #define MBUF_GROWTH_NORMAL_THRESH 25
4208 #endif
4209 
4210 /*
4211  * Cluster freelist allocation check.
4212  */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	/* Only the 4K and 16K cluster pools are sized by this routine. */
	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/*
		 * percent_pool: fraction of the existing pool in use;
		 * percent_kmem: fraction of the mbuf map allocated.
		 */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		/* i is in 4K units below; << 1 converts to 2K cluster units. */
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	return i;
}
4333 
4334 uint64_t
4335 mcl_to_paddr(char *addr)
4336 {
4337 	vm_offset_t base_phys;
4338 
4339 	if (!MBUF_IN_MAP(addr)) {
4340 		return 0;
4341 	}
4342 	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
4343 
4344 	if (base_phys == 0) {
4345 		return 0;
4346 	}
4347 	return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
4348 }
4349 
4350 /*
4351  * Inform the corresponding mcache(s) that there's a waiter below.
4352  */
4353 static void
4354 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4355 {
4356 	mcache_waiter_inc(m_cache(class));
4357 	if (comp) {
4358 		if (class == MC_CL) {
4359 			mcache_waiter_inc(m_cache(MC_MBUF_CL));
4360 		} else if (class == MC_BIGCL) {
4361 			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4362 		} else if (class == MC_16KCL) {
4363 			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4364 		} else {
4365 			mcache_waiter_inc(m_cache(MC_MBUF_CL));
4366 			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4367 		}
4368 	}
4369 }
4370 
4371 /*
4372  * Inform the corresponding mcache(s) that there's no more waiter below.
4373  */
4374 static void
4375 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4376 {
4377 	mcache_waiter_dec(m_cache(class));
4378 	if (comp) {
4379 		if (class == MC_CL) {
4380 			mcache_waiter_dec(m_cache(MC_MBUF_CL));
4381 		} else if (class == MC_BIGCL) {
4382 			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4383 		} else if (class == MC_16KCL) {
4384 			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4385 		} else {
4386 			mcache_waiter_dec(m_cache(MC_MBUF_CL));
4387 			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4388 		}
4389 	}
4390 }
4391 
4392 static bool mbuf_watchdog_defunct_active = false;
4393 
/*
 * Results/parameters of the per-process scan done on behalf of the
 * mbuf watchdog (filled in by mbuf_watchdog_defunct_iterate).
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;		/* presumably the heaviest socket-space user; holds a proc ref — see mbuf_watchdog_defunct */
	uint32_t top_app_space_used;	/* space attributed to top_app by the iterator */
	bool non_blocking;
};
4399 
4400 extern const char *proc_name_address(void *p);
4401 
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	/*
	 * Scan all processes; the iterator fills args.top_app with the
	 * selected process (NOTE: iterator defined elsewhere — it appears
	 * to pick the process using the most socket space).
	 */
	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		/* Walk the process's file table looking for sockets. */
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets we cannot lock without blocking. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference taken by the iterator. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
	}
	mbuf_watchdog_defunct_active = false;
}
4451 
4452 /*
4453  * Called during slab (blocking and non-blocking) allocation.  If there
4454  * is at least one waiter, and the time since the first waiter is blocked
4455  * is greater than the watchdog timeout, panic the system.
4456  */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do unless someone is waiting and the watchdog is on. */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Seconds since the first waiter armed the watchdog. */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			/* Lazily allocate the one-shot thread call. */
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
4512 
4513 /*
4514  * Called during blocking allocation.  Returns TRUE if one or more objects
4515  * are available at the per-CPU caches layer and that allocation should be
4516  * retried at that level.
4517  */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing?  Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* Caller will retry at the cache layer on its own. */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Non-blocking callers must never reach the sleep below. */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer.  Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Ask the worker thread to grow this class's backing store. */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	/* msleep drops and reacquires mbuf_mlock around the sleep. */
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
4591 
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	/* Loop forever: service expansion requests, then block for a wakeup. */
	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if ((m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Grow MC_CL if mbuf_sleep() queued an expansion request. */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Same treatment for the 4 KB cluster class. */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Same treatment for the 16 KB cluster class. */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		/* Top up MC_MBUF until it at least matches the cluster count. */
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		/* Re-arm: the next mbuf_sleep() caller will wake us. */
		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated.  Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
4724 
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	/* Mark the worker as available, then run its loop forever. */
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
4732 
/*
 * Return the slab descriptor for the page containing buf, allocating
 * the enclosing slab group on first use.  Called with mbuf_mlock held;
 * may temporarily drop it when a new group must be allocated.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the 1MB-sized group (MBSHIFT) containing buf. */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of buf's page within the group. */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
4796 
4797 static void
4798 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
4799     void *base, void *head, unsigned int len, int refcnt, int chunks)
4800 {
4801 	sp->sl_class = class;
4802 	sp->sl_flags = flags;
4803 	sp->sl_base = base;
4804 	sp->sl_head = head;
4805 	sp->sl_len = len;
4806 	sp->sl_refcnt = refcnt;
4807 	sp->sl_chunks = chunks;
4808 	slab_detach(sp);
4809 }
4810 
4811 static void
4812 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
4813 {
4814 	VERIFY(slab_is_detached(sp));
4815 	m_slab_cnt(class)++;
4816 	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
4817 	sp->sl_flags &= ~SLF_DETACHED;
4818 
4819 	/*
4820 	 * If a buffer spans multiple contiguous pages then mark them as
4821 	 * detached too
4822 	 */
4823 	if (class == MC_16KCL) {
4824 		int k;
4825 		for (k = 1; k < NSLABSP16KB; k++) {
4826 			sp = sp->sl_next;
4827 			/* Next slab must already be present */
4828 			VERIFY(sp != NULL && slab_is_detached(sp));
4829 			sp->sl_flags &= ~SLF_DETACHED;
4830 		}
4831 	}
4832 }
4833 
4834 static void
4835 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
4836 {
4837 	int k;
4838 	VERIFY(!slab_is_detached(sp));
4839 	VERIFY(m_slab_cnt(class) > 0);
4840 	m_slab_cnt(class)--;
4841 	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
4842 	slab_detach(sp);
4843 	if (class == MC_16KCL) {
4844 		for (k = 1; k < NSLABSP16KB; k++) {
4845 			sp = sp->sl_next;
4846 			/* Next slab must already be present */
4847 			VERIFY(sp != NULL);
4848 			VERIFY(!slab_is_detached(sp));
4849 			slab_detach(sp);
4850 		}
4851 	}
4852 }
4853 
4854 static boolean_t
4855 slab_inrange(mcl_slab_t *sp, void *buf)
4856 {
4857 	return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
4858 	       (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
4859 }
4860 
4861 #undef panic
4862 
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	/*
	 * Walk every chunk in the slab looking for a free-list next
	 * pointer equal to addr, and validate it: either a cheap
	 * range check (panic on failure) or, when mclverify is set,
	 * a full audit-layer verification.
	 */
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			/*
			 * Without audit data we can only check that the
			 * stored pointer still lies within the mbuf map.
			 */
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			/* Audit mode: let the audit layer verify the pointer. */
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
4891 
4892 static void
4893 slab_detach(mcl_slab_t *sp)
4894 {
4895 	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
4896 	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
4897 	sp->sl_flags |= SLF_DETACHED;
4898 }
4899 
4900 static boolean_t
4901 slab_is_detached(mcl_slab_t *sp)
4902 {
4903 	return (intptr_t)sp->sl_link.tqe_next == -1 &&
4904 	       (intptr_t)sp->sl_link.tqe_prev == -1 &&
4905 	       (sp->sl_flags & SLF_DETACHED);
4906 }
4907 
/*
 * Initialize audit state for all `num' buffers of the page containing
 * `buf': consume `num' audit structures from *mca_list, zero them and
 * wire them into the page's mclaudit[] slot.  If `con_list' is
 * non-NULL, also attach one saved-contents buffer (of `con_size'
 * bytes) from *con_list to each audit structure.  On return both
 * *mca_list and *con_list are advanced past the entries consumed.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	/* Index of the page that contains buf */
	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Reset the audit structure but preserve its list linkage */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	/* Hand the unused remainder of each list back to the caller */
	if (save_contents) {
		*con_list = con;
	}

	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
4966 
/*
 * Tear down the audit state for `num' buffers of the page containing
 * `buf': release any saved-contents buffers back to their cache, clear
 * the per-page audit pointers, and free the audit structures (still
 * chained via mca_next from cl_audit[0]) back in one batch.
 */
static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* cl_audit[0] == NULL means this page was never audited */
	if (mclaudit[ix].cl_audit[0] != NULL) {
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents) {
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
			}
		}
		/* The structures are still linked; free the whole chain */
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}
4990 
4991 /*
4992  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
4993  * the corresponding audit structure for that buffer.
4994  */
4995 static mcache_audit_t *
4996 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
4997 {
4998 	mcache_audit_t *mca = NULL;
4999 	int ix = MTOPG(mobj), m_idx = 0;
5000 	unsigned char *page_addr;
5001 
5002 	VERIFY(ix < maxclaudit);
5003 	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
5004 
5005 	page_addr = PGTOM(ix);
5006 
5007 	switch (class) {
5008 	case MC_MBUF:
5009 		/*
5010 		 * For the mbuf case, find the index of the page
5011 		 * used by the mbuf and use that index to locate the
5012 		 * base address of the page.  Then find out the
5013 		 * mbuf index relative to the page base and use
5014 		 * it to locate the audit structure.
5015 		 */
5016 		m_idx = MBPAGEIDX(page_addr, mobj);
5017 		VERIFY(m_idx < (int)NMBPG);
5018 		mca = mclaudit[ix].cl_audit[m_idx];
5019 		break;
5020 
5021 	case MC_CL:
5022 		/*
5023 		 * Same thing as above, but for 2KB clusters in a page.
5024 		 */
5025 		m_idx = CLPAGEIDX(page_addr, mobj);
5026 		VERIFY(m_idx < (int)NCLPG);
5027 		mca = mclaudit[ix].cl_audit[m_idx];
5028 		break;
5029 
5030 	case MC_BIGCL:
5031 		m_idx = BCLPAGEIDX(page_addr, mobj);
5032 		VERIFY(m_idx < (int)NBCLPG);
5033 		mca = mclaudit[ix].cl_audit[m_idx];
5034 		break;
5035 	case MC_16KCL:
5036 		/*
5037 		 * Same as above, but only return the first element.
5038 		 */
5039 		mca = mclaudit[ix].cl_audit[0];
5040 		break;
5041 
5042 	default:
5043 		VERIFY(0);
5044 		/* NOTREACHED */
5045 	}
5046 
5047 	return mca;
5048 }
5049 
/*
 * Audit an mbuf as it crosses the cache layer.  On free (!alloc) the
 * constructed mbuf fields are saved into the audit record and, with
 * mclverify, the body is filled with the free pattern (the freelist
 * linkage word is preserved).  On alloc the pattern is verified --
 * catching writes made while the mbuf sat on the freelist -- and the
 * saved fields are restored.
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Re-establish the freelist link clobbered by the fill */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
5082 
/*
 * Restore the mbuf fields saved by mcl_audit_save_mbuf().  For a
 * composite mbuf the whole saved region is copied back (preserving the
 * current m_next chain); for a plain mbuf only the type is restored.
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		/* The saved copy must still describe a composite mbuf */
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	mbuf_mcheck(m);
}
5111 
/*
 * Snapshot the constructed mbuf fields into the audit record so they
 * can be restored by mcl_audit_restore_mbuf() on the next allocation;
 * the mbuf is sanity-checked before being copied.
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	mbuf_mcheck(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
5119 
/*
 * Audit a cluster buffer as it crosses the cache layer.  On free
 * (!alloc), fill the buffer with the free pattern and, when save_next
 * is set, validate and write back the freelist linkage word (which the
 * fill overwrote).  On alloc, verify both the linkage word and the
 * pattern to detect writes made while the buffer was on the freelist.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Restore the link on top of the pattern fill */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
5140 
/*
 * Record the current thread, backtrace and timestamp in the scratch
 * area of the audit record, first rotating the existing snapshot into
 * the msa_p* ("previous") fields so the last two accesses remain
 * visible in a dump.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Rotate the current snapshot into the "previous" slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Skip the topmost frame (this function) in the saved trace */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	/*
	 * NOTE(review): if now.tv_usec < mb_start.tv_usec the usec
	 * delta is negative, so the ms value can be slightly off --
	 * confirm this imprecision is acceptable for a debug stamp.
	 */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
5167 
/*
 * Panic with full audit context when a freed mbuf is found with a
 * type other than MT_FREE, i.e. it was modified after being freed.
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	/* MRANGE: presumably validates m is within the mbuf map -- confirm */
	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
5182 
/*
 * Panic with the audit record's dump when a free buffer's linkage
 * word points outside the mbuf map (buffer modified after free).
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
5193 
5194 static void
5195 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5196 {
5197 	if (next != NULL && !MBUF_IN_MAP(next) &&
5198 	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
5199 		mcl_audit_verify_nextptr_panic(next, mca);
5200 	}
5201 }
5202 
/*
 * Bit-mixing step (add/xor/shift style): scrambles `h' so that nearby
 * inputs map to well-dispersed outputs.  Shift constants differ for
 * 32- vs 64-bit pointer widths.
 */
static uintptr_t
hash_mix(uintptr_t h)
{
#ifndef __LP64__
	/* 32-bit variant */
	h += ~(h << 15);
	h ^= (h >> 10);
	h += (h << 3);
	h ^= (h >> 6);
	h += ~(h << 11);
	h ^= (h >> 16);
#else
	/* 64-bit variant */
	h += ~(h << 32);
	h ^= (h >> 22);
	h += ~(h << 13);
	h ^= (h >> 8);
	h += (h << 3);
	h ^= (h >> 15);
	h += ~(h << 27);
	h ^= (h >> 31);
#endif
	return h;
}
5225 
/*
 * Hash a backtrace (`depth' return addresses in `bt') into a bucket
 * index in [0, max_size).  max_size must be a power of two for the
 * mask to be valid.
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t sum = 0;
	uintptr_t bucket;
	uint32_t i;

	/* Order-independent sum of all frame addresses */
	for (i = 0; i < depth; i++) {
		sum += bt[i];
	}

	bucket = hash_mix(sum) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
5242 
/*
 * Hash a single address into a bucket index in [0, max_size).
 * max_size must be a power of two for the mask to be valid.
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t bucket;

	bucket = hash_mix(pt) & (uintptr_t)(max_size - 1);

	assert(bucket < max_size);

	return (uint32_t)bucket;
}
5255 
/*
 * Turn on mbuf leak detection.  The sampling factor can be overridden
 * via the "mleak_sample_factor" boot-arg; a factor of 0 disables leak
 * detection altogether.  When enabled, the allocation, trace and stat
 * tables are permanently allocated here.
 */
static void
mleak_activate(void)
{
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	/* A zero factor would divide by zero in mleak_logger(); disable */
	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	/* Permanent allocations: these tables are never freed */
	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
5287 
5288 static void
5289 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
5290 {
5291 	int temp;
5292 
5293 	if (mclfindleak == 0) {
5294 		return;
5295 	}
5296 
5297 	if (!alloc) {
5298 		return mleak_free(addr);
5299 	}
5300 
5301 	temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);
5302 
5303 	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
5304 		uintptr_t bt[MLEAK_STACK_DEPTH];
5305 		unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
5306 		mleak_log(bt, addr, logged, num);
5307 	}
5308 }
5309 
5310 /*
5311  * This function records the allocation in the mleak_allocations table
5312  * and the backtrace in the mleak_traces table; if allocation slot is in use,
5313  * replace old allocation with new one if the trace slot is in use, return
5314  * (or increment refcount if same trace).
5315  */
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		/* FALSE: nothing was recorded */
		return FALSE;
	}

	/* Hash the address and the backtrace into their bucket slots */
	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	}

	/*
	 * Store the backtrace in the traces array;
	 * Size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash! Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation.  No need to preserve
		 * because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		/* Slot was used before but its element has been freed */
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return TRUE;
}
5398 
/*
 * Remove the leak-tracking records for every object in the freed
 * chain.  The allocation bucket is first checked without the lock;
 * on a hit the check is repeated under mleak_lock before the record
 * is cleared, since the tables may change in between.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		/* Cheap unlocked pre-check to avoid taking the lock */
		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			/* Re-validate now that we hold the lock */
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
5429 
5430 static void
5431 mleak_sort_traces()
5432 {
5433 	int i, j, k;
5434 	struct mtrace *swap;
5435 
5436 	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
5437 		mleak_top_trace[i] = NULL;
5438 	}
5439 
5440 	for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
5441 		if (mleak_traces[i].allocs <= 0) {
5442 			continue;
5443 		}
5444 
5445 		mleak_top_trace[j] = &mleak_traces[i];
5446 		for (k = j; k > 0; k--) {
5447 			if (mleak_top_trace[k]->allocs <=
5448 			    mleak_top_trace[k - 1]->allocs) {
5449 				break;
5450 			}
5451 
5452 			swap = mleak_top_trace[k - 1];
5453 			mleak_top_trace[k - 1] = mleak_top_trace[k];
5454 			mleak_top_trace[k] = swap;
5455 		}
5456 		j++;
5457 	}
5458 
5459 	j--;
5460 	for (; i < mleak_trace_buckets; i++) {
5461 		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
5462 			continue;
5463 		}
5464 
5465 		mleak_top_trace[j] = &mleak_traces[i];
5466 
5467 		for (k = j; k > 0; k--) {
5468 			if (mleak_top_trace[k]->allocs <=
5469 			    mleak_top_trace[k - 1]->allocs) {
5470 				break;
5471 			}
5472 
5473 			swap = mleak_top_trace[k - 1];
5474 			mleak_top_trace[k - 1] = mleak_top_trace[k];
5475 			mleak_top_trace[k] = swap;
5476 		}
5477 	}
5478 }
5479 
5480 static void
5481 mleak_update_stats()
5482 {
5483 	mleak_trace_stat_t *mltr;
5484 	int i;
5485 
5486 	VERIFY(mleak_stat != NULL);
5487 #ifdef __LP64__
5488 	VERIFY(mleak_stat->ml_isaddr64);
5489 #else
5490 	VERIFY(!mleak_stat->ml_isaddr64);
5491 #endif /* !__LP64__ */
5492 	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
5493 
5494 	mleak_sort_traces();
5495 
5496 	mltr = &mleak_stat->ml_trace[0];
5497 	bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
5498 	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
5499 		int j;
5500 
5501 		if (mleak_top_trace[i] == NULL ||
5502 		    mleak_top_trace[i]->allocs == 0) {
5503 			continue;
5504 		}
5505 
5506 		mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
5507 		mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
5508 		mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
5509 		mltr->mltr_depth        = mleak_top_trace[i]->depth;
5510 
5511 		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
5512 		for (j = 0; j < mltr->mltr_depth; j++) {
5513 			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
5514 		}
5515 
5516 		mltr++;
5517 	}
5518 }
5519 
/*
 * Mapping of MT_* mbuf type codes to human-readable names, used by
 * mbuf_dump() below to label the per-type allocation counters.
 * The array is terminated by an entry with a NULL mt_name.
 */
static struct mbtypes {
	int             mt_type;
	const char      *mt_name;
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }
};
5541 
/*
 * Advance the mbuf_dump() output cursor: subtracts the byte count `k'
 * just produced by scnprintf() from the remaining length `clen', bails
 * out to the `done' label when the buffer is exhausted, and moves the
 * write pointer `c' forward.  Relies on those identifiers (and the
 * label) existing in the calling function.
 */
#define MBUF_DUMP_BUF_CHK() {   \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
5548 
/*
 * Render a human-readable summary of mbuf usage into the static
 * mbuf_dump_buf and return it: per-type mbuf counts, cluster pool
 * usage, worker/drain activity, the top mbuf-consuming process and
 * queues, and the leak-detection tables.  Output is truncated at
 * MBUF_DUMP_BUF_SIZE via MBUF_DUMP_BUF_CHK().
 */
static char *
mbuf_dump(void)
{
	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
	    totreturned = 0;
	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
	int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
	uint8_t seen[256];
	struct mbtypes *mp;
	mb_class_stat_t *sp;
	mleak_trace_stat_t *mltr;
	char *c = mbuf_dump_buf;
	int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
	struct mbuf_watchdog_defunct_args args = {};

	mbuf_dump_buf[0] = '\0';

	/* synchronize all statistics in the mbuf table */
	mbuf_stat_sync();
	mbuf_mtypes_sync();

	/* Gather per-class active/free counts and memory totals */
	sp = &mb_stat->mbs_class[0];
	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
		u_int32_t mem;

		if (m_class(i) == MC_MBUF) {
			m_mbufs = sp->mbcl_active;
		} else if (m_class(i) == MC_CL) {
			m_clfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_BIGCL) {
			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_16KCL) {
			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
			m_16kclusters = sp->mbcl_total;
		} else if (m_class(i) == MC_MBUF_CL) {
			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_MBUF_BIGCL) {
			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
		} else if (m_class(i) == MC_MBUF_16KCL) {
			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
		}

		mem = sp->mbcl_ctotal * sp->mbcl_size;
		totmem += mem;
		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
		    sp->mbcl_size;
		totreturned += sp->mbcl_release_cnt;
	}

	/* adjust free counts to include composite caches */
	m_clfree += m_mbufclfree;
	m_bigclfree += m_mbufbigclfree;
	m_16kclfree += m_mbuf16kclfree;

	/* Sum mbufs assigned to a type; clamp to the active count */
	totmbufs = 0;
	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
		totmbufs += mbstat.m_mtypes[mp->mt_type];
	}
	if (totmbufs > m_mbufs) {
		totmbufs = m_mbufs;
	}
	k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
	MBUF_DUMP_BUF_CHK();

	/* Per-type breakdown for the named types */
	bzero(&seen, sizeof(seen));
	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
		if (mbstat.m_mtypes[mp->mt_type] != 0) {
			seen[mp->mt_type] = 1;
			k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
			MBUF_DUMP_BUF_CHK();
		}
	}
	seen[MT_FREE] = 1;
	/* Report any nonzero counters whose type has no name above */
	for (i = 0; i < nmbtypes; i++) {
		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
			k = scnprintf(c, clen, "\t%u mbufs allocated to "
			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
			MBUF_DUMP_BUF_CHK();
		}
	}
	if ((m_mbufs - totmbufs) > 0) {
		k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
		    m_mbufs - totmbufs);
		MBUF_DUMP_BUF_CHK();
	}
	k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
	    "%u/%u mbuf 4KB clusters in use\n",
	    (unsigned int)(mbstat.m_clusters - m_clfree),
	    (unsigned int)mbstat.m_clusters,
	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
	    (unsigned int)mbstat.m_bigclusters);
	MBUF_DUMP_BUF_CHK();

	k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
	    m_16kclusters - m_16kclfree, m_16kclusters,
	    njclbytes / 1024);
	MBUF_DUMP_BUF_CHK();
	/* Overall usage percentage, guarding against overflow */
	totused = totmem - totfree;
	if (totmem == 0) {
		totpct = 0;
	} else if (totused < (ULONG_MAX / 100)) {
		totpct = (totused * 100) / totmem;
	} else {
		u_long totmem1 = totmem / 100;
		u_long totused1 = totused / 100;
		totpct = (totused1 * 100) / totmem1;
	}
	k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
	    "in use)\n", totmem / 1024, totpct);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "%lu KB returned to the system\n",
	    totreturned / 1024);
	MBUF_DUMP_BUF_CHK();

	net_update_uptime();

	k = scnprintf(c, clen,
	    "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
	    "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
	    mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
	    mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
	    mb_expand_16kcl_total);
	MBUF_DUMP_BUF_CHK();
	if (mbuf_worker_last_runtime != 0) {
		k = scnprintf(c, clen, "worker thread last run time: "
		    "%llu (%llu seconds ago)\n",
		    mbuf_worker_last_runtime,
		    net_uptime() - mbuf_worker_last_runtime);
		MBUF_DUMP_BUF_CHK();
	}
	if (mbuf_drain_last_runtime != 0) {
		k = scnprintf(c, clen, "drain routine last run time: "
		    "%llu (%llu seconds ago)\n",
		    mbuf_drain_last_runtime,
		    net_uptime() - mbuf_drain_last_runtime);
		MBUF_DUMP_BUF_CHK();
	}

	/*
	 * Log where the most mbufs have accumulated:
	 * - Process socket buffers
	 * - TCP reassembly queue
	 * - Interface AQM queue (output) and DLIL input queue
	 */
	args.non_blocking = true;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
	if (args.top_app != NULL) {
		k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
		    args.top_app_space_used,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_rele(args.top_app);
	}
	/*
	 * NOTE(review): if no top proc was found, `k' still holds the
	 * byte count of the previous scnprintf() here, so this check
	 * advances `c'/`clen' a second time by that stale amount --
	 * confirm whether this is intentional.
	 */
	MBUF_DUMP_BUF_CHK();

#if INET
	k = dump_tcp_reass_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* INET */

#if MPTCP
	k = dump_mptcp_reass_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* MPTCP */

#if NETWORKING
	k = dlil_dump_top_if_qlen(c, clen);
	MBUF_DUMP_BUF_CHK();
#endif /* NETWORKING */

	/* mbuf leak detection statistics */
	mleak_update_stats();

	k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
	    mleak_table.mleak_sample_factor);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
	    mleak_table.outstanding_allocs);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
	MBUF_DUMP_BUF_CHK();
	k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
	    mleak_table.total_conflicts);
	MBUF_DUMP_BUF_CHK();

	k = scnprintf(c, clen, "top %d outstanding traces:\n",
	    mleak_stat->ml_cnt);
	MBUF_DUMP_BUF_CHK();
	for (i = 0; i < mleak_stat->ml_cnt; i++) {
		mltr = &mleak_stat->ml_trace[i];
		k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
		    "%llu hit(s), %llu collision(s)\n", (i + 1),
		    mltr->mltr_allocs, mltr->mltr_hitcount,
		    mltr->mltr_collisions);
		MBUF_DUMP_BUF_CHK();
	}

	if (mleak_stat->ml_isaddr64) {
		k = scnprintf(c, clen, MB_LEAK_HDR_64);
	} else {
		k = scnprintf(c, clen, MB_LEAK_HDR_32);
	}
	MBUF_DUMP_BUF_CHK();

	/* Print the traces column by column, one frame depth per row */
	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
		k = scnprintf(c, clen, "%2d: ", (i + 1));
		MBUF_DUMP_BUF_CHK();
		for (j = 0; j < mleak_stat->ml_cnt; j++) {
			mltr = &mleak_stat->ml_trace[j];
			if (i < mltr->mltr_depth) {
				if (mleak_stat->ml_isaddr64) {
					k = scnprintf(c, clen, "0x%0llx  ",
					    (uint64_t)VM_KERNEL_UNSLIDE(
						    mltr->mltr_addr[i]));
				} else {
					k = scnprintf(c, clen,
					    "0x%08x  ",
					    (uint32_t)VM_KERNEL_UNSLIDE(
						    mltr->mltr_addr[i]));
				}
			} else {
				if (mleak_stat->ml_isaddr64) {
					k = scnprintf(c, clen,
					    MB_LEAK_SPACING_64);
				} else {
					k = scnprintf(c, clen,
					    MB_LEAK_SPACING_32);
				}
			}
			MBUF_DUMP_BUF_CHK();
		}
		k = scnprintf(c, clen, "\n");
		MBUF_DUMP_BUF_CHK();
	}

done:
	return mbuf_dump_buf;
}
5801 
5802 #undef MBUF_DUMP_BUF_CHK
5803 
5804 /*
5805  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
5806  * xnu that intend on utilizing the module-private area should directly
5807  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
5808  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
5809  * to handing it off to another module, respectively.
5810  */
5811 u_int32_t
5812 m_scratch_get(struct mbuf *m, u_int8_t **p)
5813 {
5814 	struct pkthdr *pkt = &m->m_pkthdr;
5815 
5816 	VERIFY(m->m_flags & M_PKTHDR);
5817 
5818 	/* See comments in <rdar://problem/14040693> */
5819 	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
5820 		panic_plain("Invalid attempt to access guarded module-private "
5821 		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
5822 		/* NOTREACHED */
5823 	}
5824 
5825 	if (mcltrace) {
5826 		mcache_audit_t *mca;
5827 
5828 		lck_mtx_lock(mbuf_mlock);
5829 		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5830 		if (mca->mca_uflags & MB_SCVALID) {
5831 			mcl_audit_scratch(mca);
5832 		}
5833 		lck_mtx_unlock(mbuf_mlock);
5834 	}
5835 
5836 	*p = (u_int8_t *)&pkt->pkt_mpriv;
5837 	return sizeof(pkt->pkt_mpriv);
5838 }
5839 
5840 /*
5841  * Simple routine to avoid taking the lock when we can't run the
5842  * mbuf drain.
5843  */
5844 static int
5845 mbuf_drain_checks(boolean_t ignore_waiters)
5846 {
5847 	if (mb_drain_maxint == 0) {
5848 		return 0;
5849 	}
5850 	if (!ignore_waiters && mb_waiters != 0) {
5851 		return 0;
5852 	}
5853 
5854 	return 1;
5855 }
5856 
5857 /*
5858  * Called by the VM when there's memory pressure or when we exhausted
5859  * the 4k/16k reserved space.
5860  */
5861 static void
5862 mbuf_drain_locked(boolean_t ignore_waiters)
5863 {
5864 	mbuf_class_t mc;
5865 	mcl_slab_t *sp, *sp_tmp, *nsp;
5866 	unsigned int num, k, interval, released = 0;
5867 	unsigned long total_mem = 0, use_mem = 0;
5868 	boolean_t ret, purge_caches = FALSE;
5869 	ppnum_t offset;
5870 	mcache_obj_t *obj;
5871 	unsigned long per;
5872 	static unsigned char scratch[32];
5873 	static ppnum_t scratch_pa = 0;
5874 
5875 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5876 	if (!mbuf_drain_checks(ignore_waiters)) {
5877 		return;
5878 	}
5879 	if (scratch_pa == 0) {
5880 		bzero(scratch, sizeof(scratch));
5881 		scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
5882 		VERIFY(scratch_pa);
5883 	} else if (mclverify) {
5884 		/*
5885 		 * Panic if a driver wrote to our scratch memory.
5886 		 */
5887 		for (k = 0; k < sizeof(scratch); k++) {
5888 			if (scratch[k]) {
5889 				panic("suspect DMA to freed address");
5890 			}
5891 		}
5892 	}
5893 	/*
5894 	 * Don't free memory too often as that could cause excessive
5895 	 * waiting times for mbufs.  Purge caches if we were asked to drain
5896 	 * in the last 5 minutes.
5897 	 */
5898 	if (mbuf_drain_last_runtime != 0) {
5899 		interval = net_uptime() - mbuf_drain_last_runtime;
5900 		if (interval <= mb_drain_maxint) {
5901 			return;
5902 		}
5903 		if (interval <= mb_drain_maxint * 5) {
5904 			purge_caches = TRUE;
5905 		}
5906 	}
5907 	mbuf_drain_last_runtime = net_uptime();
5908 	/*
5909 	 * Don't free any memory if we're using 60% or more.
5910 	 */
5911 	for (mc = 0; mc < MC_MAX; mc++) {
5912 		total_mem += m_total(mc) * m_maxsize(mc);
5913 		use_mem += m_active(mc) * m_maxsize(mc);
5914 	}
5915 	per = (use_mem * 100) / total_mem;
5916 	if (per >= 60) {
5917 		return;
5918 	}
5919 	/*
5920 	 * Purge all the caches.  This effectively disables
5921 	 * caching for a few seconds, but the mbuf worker thread will
5922 	 * re-enable them again.
5923 	 */
5924 	if (purge_caches == TRUE) {
5925 		for (mc = 0; mc < MC_MAX; mc++) {
5926 			if (m_total(mc) < m_avgtotal(mc)) {
5927 				continue;
5928 			}
5929 			lck_mtx_unlock(mbuf_mlock);
5930 			ret = mcache_purge_cache(m_cache(mc), FALSE);
5931 			lck_mtx_lock(mbuf_mlock);
5932 			if (ret == TRUE) {
5933 				m_purge_cnt(mc)++;
5934 			}
5935 		}
5936 	}
5937 	/*
5938 	 * Move the objects from the composite class freelist to
5939 	 * the rudimentary slabs list, but keep at least 10% of the average
5940 	 * total in the freelist.
5941 	 */
5942 	for (mc = 0; mc < MC_MAX; mc++) {
5943 		while (m_cobjlist(mc) &&
5944 		    m_total(mc) < m_avgtotal(mc) &&
5945 		    m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
5946 			obj = m_cobjlist(mc);
5947 			m_cobjlist(mc) = obj->obj_next;
5948 			obj->obj_next = NULL;
5949 			num = cslab_free(mc, obj, 1);
5950 			VERIFY(num == 1);
5951 			m_free_cnt(mc)++;
5952 			m_infree(mc)--;
5953 			/* cslab_free() handles m_total */
5954 		}
5955 	}
5956 	/*
5957 	 * Free the buffers present in the slab list up to 10% of the total
5958 	 * average per class.
5959 	 *
5960 	 * We walk the list backwards in an attempt to reduce fragmentation.
5961 	 */
5962 	for (mc = MC_MAX - 1; (int)mc >= 0; mc--) {
5963 		TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
5964 			/*
5965 			 * Process only unused slabs occupying memory.
5966 			 */
5967 			if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
5968 			    sp->sl_base == NULL) {
5969 				continue;
5970 			}
5971 			if (m_total(mc) < m_avgtotal(mc) ||
5972 			    m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
5973 				break;
5974 			}
5975 			slab_remove(sp, mc);
5976 			switch (mc) {
5977 			case MC_MBUF:
5978 				m_infree(mc) -= NMBPG;
5979 				m_total(mc) -= NMBPG;
5980 				if (mclaudit != NULL) {
5981 					mcl_audit_free(sp->sl_base, NMBPG);
5982 				}
5983 				break;
5984 			case MC_CL:
5985 				m_infree(mc) -= NCLPG;
5986 				m_total(mc) -= NCLPG;
5987 				if (mclaudit != NULL) {
5988 					mcl_audit_free(sp->sl_base, NMBPG);
5989 				}
5990 				break;
5991 			case MC_BIGCL:
5992 			{
5993 				m_infree(mc) -= NBCLPG;
5994 				m_total(mc) -= NBCLPG;
5995 				if (mclaudit != NULL) {
5996 					mcl_audit_free(sp->sl_base, NMBPG);
5997 				}
5998 				break;
5999 			}
6000 			case MC_16KCL:
6001 				m_infree(mc)--;
6002 				m_total(mc)--;
6003 				for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
6004 					nsp = nsp->sl_next;
6005 					VERIFY(nsp->sl_refcnt == 0 &&
6006 					    nsp->sl_base != NULL &&
6007 					    nsp->sl_len == 0);
6008 					slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
6009 					    0);
6010 					nsp->sl_flags = 0;
6011 				}
6012 				if (mclaudit != NULL) {
6013 					if (sp->sl_len == PAGE_SIZE) {
6014 						mcl_audit_free(sp->sl_base,
6015 						    NMBPG);
6016 					} else {
6017 						mcl_audit_free(sp->sl_base, 1);
6018 					}
6019 				}
6020 				break;
6021 			default:
6022 				/*
6023 				 * The composite classes have their own
6024 				 * freelist (m_cobjlist), so we only
6025 				 * process rudimentary classes here.
6026 				 */
6027 				VERIFY(0);
6028 			}
6029 			m_release_cnt(mc) += m_size(mc);
6030 			released += m_size(mc);
6031 			VERIFY(sp->sl_base != NULL &&
6032 			    sp->sl_len >= PAGE_SIZE);
6033 			offset = MTOPG(sp->sl_base);
6034 			/*
6035 			 * Make sure the IOMapper points to a valid, but
6036 			 * bogus, address.  This should prevent further DMA
6037 			 * accesses to freed memory.
6038 			 */
6039 			IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
6040 			mcl_paddr[offset] = 0;
6041 			kmem_free(mb_map, (vm_offset_t)sp->sl_base,
6042 			    sp->sl_len);
6043 			slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
6044 			sp->sl_flags = 0;
6045 		}
6046 	}
6047 	mbstat.m_drain++;
6048 	mbstat.m_bigclusters = m_total(MC_BIGCL);
6049 	mbstat.m_clusters = m_total(MC_CL);
6050 	mbstat.m_mbufs = m_total(MC_MBUF);
6051 	mbuf_stat_sync();
6052 	mbuf_mtypes_sync();
6053 }
6054 
6055 __private_extern__ void
6056 mbuf_drain(boolean_t ignore_waiters)
6057 {
6058 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
6059 	if (!mbuf_drain_checks(ignore_waiters)) {
6060 		return;
6061 	}
6062 	lck_mtx_lock(mbuf_mlock);
6063 	mbuf_drain_locked(ignore_waiters);
6064 	lck_mtx_unlock(mbuf_mlock);
6065 }
6066 
6067 
6068 static int
6069 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
6070 {
6071 #pragma unused(arg1, arg2)
6072 	int val = 0, err;
6073 
6074 	err = sysctl_handle_int(oidp, &val, 0, req);
6075 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
6076 		return err;
6077 	}
6078 	if (val) {
6079 		mbuf_drain(TRUE);
6080 	}
6081 
6082 	return err;
6083 }
6084 
6085 #if DEBUG || DEVELOPMENT
/*
 * Append a timestamped, formatted entry to the permanent mbuf-watchdog
 * log buffer (mbwdog_logging).  Entries are prefixed with the uptime,
 * pid, (permuted) thread pointer, and the calling function/line.
 * The caller must hold mbuf_mlock, which also serializes access to the
 * log buffer and its fill counter (mbwdog_logging_used).
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * Lazily allocate the log buffer on first use.
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Render the caller's message first, then wrap it with the header. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		/*
		 * Buffer full: discard the oldest half of the log by
		 * shifting the newer half down to the start of the buffer,
		 * then re-terminate it.
		 *
		 * NOTE(review): if `len` still exceeds the space freed by
		 * halving, strlcat below truncates while `_used` advances
		 * by the full `len`, letting the counter drift from the
		 * actual string length.  Presumably mbwdog_logging_size is
		 * much larger than sizeof(str) so this can't occur — TODO
		 * confirm against the buffer-size definition.
		 */
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}
6126 
6127 #endif // DEBUG || DEVELOPMENT
6128 
6129 static void
6130 mtracelarge_register(size_t size)
6131 {
6132 	int i;
6133 	struct mtracelarge *trace;
6134 	uintptr_t bt[MLEAK_STACK_DEPTH];
6135 	unsigned int depth;
6136 
6137 	depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
6138 	/* Check if this entry is already on the list. */
6139 	for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
6140 		trace = &mtracelarge_table[i];
6141 		if (trace->size == size && trace->depth == depth &&
6142 		    memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
6143 			return;
6144 		}
6145 	}
6146 	for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
6147 		trace = &mtracelarge_table[i];
6148 		if (size > trace->size) {
6149 			trace->depth = depth;
6150 			memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
6151 			trace->size = size;
6152 			break;
6153 		}
6154 	}
6155 }
6156 
6157 #if DEBUG || DEVELOPMENT
6158 
6159 static int
6160 mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
6161 {
6162 	char *str;
6163 
6164 	ifnet_head_lock_shared();
6165 	lck_mtx_lock(mbuf_mlock);
6166 
6167 	str = mbuf_dump();
6168 
6169 	lck_mtx_unlock(mbuf_mlock);
6170 	ifnet_head_done();
6171 
6172 	return sysctl_io_string(req, str, 0, 0, NULL);
6173 }
6174 
6175 #endif /* DEBUG || DEVELOPMENT */
6176 
/* sysctl knobs exported under kern.ipc for mbuf diagnostics and tuning. */
SYSCTL_DECL(_kern_ipc);
#if DEBUG || DEVELOPMENT
#if SKYWALK
/* Tunable: scale-down factor applied to the mbuf cache thresholds. */
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK */
/* Read-only: dump the mbuf watchdog state as a string. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif /* DEBUG || DEVELOPMENT */
/* Read-only: top allocation backtraces recorded by the mbuf leak tracker. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
/* Read-only: mbuf leak-detection statistics table. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
/* Tunable: sampling factor for the mbuf leak tracker. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
/* Tunable: enable/disable the mbuf watchdog. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
/* Write-only trigger: force an mbuf garbage-collection run (see handler). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
/* Tunable: minimum seconds between automatic drain runs. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
6208