xref: /xnu-12377.1.9/bsd/kern/uipc_mbuf.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
62  */
63 /*
64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65  * support for mandatory and extensible security protections.  This notice
66  * is included in support of clause 2.2 (b) of the Apple Public License,
67  * Version 2.0.
68  */
69 
70 #include <ptrauth.h>
71 
72 #include <stdint.h>
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <sys/syslog.h>
80 #include <sys/protosw.h>
81 #include <sys/domain.h>
82 #include <sys/queue.h>
83 #include <sys/proc.h>
84 #include <sys/filedesc.h>
85 #include <sys/file_internal.h>
86 
87 #include <vm/vm_kern_xnu.h>
88 
89 #include <dev/random/randomdev.h>
90 
91 #include <kern/kern_types.h>
92 #include <kern/simple_lock.h>
93 #include <kern/queue.h>
94 #include <kern/sched_prim.h>
95 #include <kern/backtrace.h>
96 #include <kern/percpu.h>
97 #include <kern/zalloc.h>
98 
99 #include <libkern/OSDebug.h>
100 #include <libkern/libkern.h>
101 
102 #include <os/log.h>
103 #include <os/ptrtools.h>
104 
105 #include <machine/limits.h>
106 #include <machine/machine_routines.h>
107 
108 #include <net/droptap.h>
109 #include <net/ntstat.h>
110 
111 #if INET
112 extern int tcp_reass_qlen_space(struct socket *);
113 #endif /* INET */
114 
115 /*
116  * MBUF IMPLEMENTATION NOTES (using zalloc).
117  *
118  * There are a total of 4 zones and 3 zcaches.
119  *
120  * MC_MBUF:
121  *	This is a zone of rudimentary objects of _MSIZE in size; each
122  *	object represents an mbuf structure.  This cache preserves only
123  *	the m_type field of the mbuf during its transactions.
124  *
125  * MC_CL:
126  *	This is a zone of rudimentary objects of MCLBYTES in size; each
127  *	object represents a mcluster structure.  This cache does not
128  *	preserve the contents of the objects during its transactions.
129  *
130  * MC_BIGCL:
131  *	This is a zone of rudimentary objects of MBIGCLBYTES in size; each
132  *	object represents a mbigcluster structure.  This cache does not
133  *	preserve the contents of the objects during its transaction.
134  *
135  * MC_16KCL:
136  *	This is a zone of rudimentary objects of M16KCLBYTES in size; each
137  *	object represents a m16kcluster structure.  This cache does not
138  *	preserve the contents of the objects during its transaction.
139  *
140  * MC_MBUF_CL:
141  *	This is a cache of mbufs each having a cluster attached to it.
142  *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
143  *	fields of the mbuf related to the external cluster are preserved
144  *	during transactions.
145  *
146  * MC_MBUF_BIGCL:
147  *	This is a cache of mbufs each having a big cluster attached to it.
148  *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
149  *	fields of the mbuf related to the external cluster are preserved
150  *	during transactions.
151  *
152  * MC_MBUF_16KCL:
 *	This is a cache of mbufs each having a 16KB cluster attached to it.
154  *	It is backed by MC_MBUF and MC_16KCL rudimentary caches.  Several
155  *	fields of the mbuf related to the external cluster are preserved
156  *	during transactions.
157  *
158  * OBJECT ALLOCATION:
159  *
160  * Allocation requests are handled first at the zalloc per-CPU layer
161  * before falling back to the zalloc depot.  Performance is optimal when
162  * the request is satisfied at the CPU layer. zalloc has an additional
163  * overflow layer called the depot, not pictured in the diagram below.
164  *
165  * Allocation paths are different depending on the class of objects:
166  *
167  * a. Rudimentary object:
168  *
169  *	{ m_get_common(), m_clattach(), m_mclget(),
170  *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
171  *	  composite object allocation }
172  *			|	^
173  *			|	|
174  *			|	+------- (done) --------+
175  *			v				|
176  *	      zalloc_flags/zalloc_n()	              KASAN
177  *			|				^
178  *			v				|
179  *      +----> [zalloc per-CPU cache] -----> (found?) --+
180  *	|		|				|
181  *	|		v				|
182  *	|  [zalloc recirculation layer] --> (found?) ---+
183  *	|		|
184  *	|		v
185  *	+--<<-- [zone backing store]
186  *
187  * b. Composite object:
188  *
189  *	{ m_getpackets_internal(), m_allocpacket_internal() }
190  *			|	^
191  *			|	|
192  *			|	+------	(done) ---------+
193  *			v				|
194  *              mz_composite_alloc()                  KASAN
195  *			|				^
196  *                      v                               |
197  *                zcache_alloc_n()                      |
198  *			|                               |
199  *			v                               |
200  *	     [zalloc per-CPU cache] --> mark_valid() ---+
201  *			|				|
202  *			v				|
203  *	  [zalloc recirculation layer] -> mark_valid() -+
204  *			|				|
205  *			v				|
206  *               mz_composite_build()                   |
207  *			|				|
208  *			v				|
209  *		(rudimentary objects)			|
210  *                   zalloc_id() ---------------->>-----+
211  *
212  * Auditing notes: If KASAN enabled, buffers will be subjected to
213  * integrity checks by the AddressSanitizer.
214  *
215  * OBJECT DEALLOCATION:
216  *
217  * Freeing an object simply involves placing it into the CPU cache; this
218  * pollutes the cache to benefit subsequent allocations.  The depot
219  * will only be entered if the object is to be purged out of the cache.
220  * Objects may be purged based on the overall memory pressure or
221  * during zone garbage collection.
222  * To improve performance, objects are not zero-filled when freed
 * as is customary for other zalloc zones.
224  *
225  * Deallocation paths are different depending on the class of objects:
226  *
227  * a. Rudimentary object:
228  *
229  *	{ m_free(), m_freem_list(), composite object deallocation }
230  *			|	^
231  *			|	|
232  *			|	+------	(done) ---------+
233  *			v				|
234  *	          zfree_nozero()                        |
235  *			|			        |
236  *                      v                               |
237  *                    KASAN                             |
238  *			|				|
239  *			v				|
240  *	     [zalloc per-CPU cache] -> (not purging?) --+
241  *			|				|
242  *			v				|
243  *	    [zalloc recirculation layer] --->>----------+
244  *
245  *
246  * b. Composite object:
247  *
248  *	{ m_free(), m_freem_list() }
249  *			|	^
250  *			|	|
251  *			|	+------	(done) ---------+
252  *			v				|
253  *	        mz_composite_free()	                |
254  *			|			        |
255  *			v				|
256  *                zcache_free_n()                       |
257  *                      |                               |
258  *			v				|
259  *                    KASAN                             |
260  *			|				|
261  *			v				|
262  *	     [zalloc per-CPU cache] -> mark_invalid() --+
263  *			|				|
264  *			v				|
265  *	        mz_composite_destroy()                  |
266  *			|				|
267  *			v				|
268  *		(rudimentary object)			|
269  *	           zfree_nozero() -------------->>------+
270  *
271  * Auditing notes: If KASAN enabled, buffers will be subjected to
272  * integrity checks by the AddressSanitizer.
273  *
274  * DEBUGGING:
275  *
276  * Debugging mbufs can be done by booting a KASAN enabled kernel.
277  */
278 
279 
280 /*
281  * Convention typedefs for local __single pointers.
282  */
283 typedef typeof(*((zone_t)0)) *__single zone_ref_t;
284 typedef void * __single any_ref_t;
285 
286 /* Global lock */
287 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
288 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
289 #if !CONFIG_MBUF_MCACHE
290 static
291 #endif
292 lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
293 
294 /* Globals */
295 #if !CONFIG_MBUF_MCACHE
296 static
297 #endif
298 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
299 int njcl;                       /* # of clusters for jumbo sizes */
300 int njclbytes;                  /* size of a jumbo cluster */
301 int max_linkhdr;                /* largest link-level header */
302 int max_protohdr;              /* largest protocol header */
303 int max_hdr;                    /* largest link+protocol header */
304 int max_datalen;                /* MHLEN - max_hdr */
305 
306 /* Lock to protect the completion callback table */
307 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
308 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
309 
310 #define m_stats(c)      mbuf_table[c].mtbl_stats
311 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
312 
313 #if !CONFIG_MBUF_MCACHE
314 /*
315  * Note: number of entries in mbuf_table must not exceed
316  * MB_STAT_MAX_MB_CLASSES
317  */
318 static mbuf_table_t mbuf_table[] = {
319 	{ .mtbl_class = MC_MBUF },
320 	{ .mtbl_class = MC_CL },
321 	{ .mtbl_class = MC_BIGCL },
322 	{ .mtbl_class = MC_16KCL },
323 	{ .mtbl_class = MC_MBUF_CL },
324 	{ .mtbl_class = MC_MBUF_BIGCL },
325 	{ .mtbl_class = MC_MBUF_16KCL },
326 };
327 #endif /* !CONFIG_MBUF_MCACHE */
328 
329 #if !CONFIG_MBUF_MCACHE
330 static
331 #endif /* !CONFIG_MBUF_MCACHE */
332 unsigned int mb_memory_pressure_percentage = 80;
333 
334 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
335 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
336 #if !CONFIG_MBUF_MCACHE
337 static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
338 static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
339 static struct mbuf *mz_alloc(zalloc_flags_t);
340 static void mz_free(struct mbuf *);
341 static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
342 static void mz_ref_free(struct ext_ref *);
343 static void * __bidi_indexable mz_cl_alloc(zone_id_t, zalloc_flags_t);
344 static void mz_cl_free(zone_id_t, void *);
345 static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
346 static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
347 static void mz_composite_free(mbuf_class_t, struct mbuf *);
348 static void mz_composite_free_n(mbuf_class_t, zstack_t);
349 static void *mz_composite_build(zone_id_t, zalloc_flags_t);
350 static void *mz_composite_mark_valid(zone_id_t, void *);
351 static void *mz_composite_mark_invalid(zone_id_t, void *);
352 static void  mz_composite_destroy(zone_id_t, void *);
353 
354 ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
355     ZC_CACHING | ZC_KASAN_NOQUARANTINE);
356 ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
357     ZC_CACHING | ZC_KASAN_NOQUARANTINE);
358 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
359     ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
360 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
361     ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
362 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
363     ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
364 static_assert(sizeof(union mcluster) == MCLBYTES);
365 static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
366 static_assert(sizeof(union m16kcluster) == M16KCLBYTES);
367 
368 static const struct zone_cache_ops mz_composite_ops = {
369 	.zc_op_alloc        = mz_composite_build,
370 	.zc_op_mark_valid   = mz_composite_mark_valid,
371 	.zc_op_mark_invalid = mz_composite_mark_invalid,
372 	.zc_op_free         = mz_composite_destroy,
373 };
374 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
375     sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
376     &mz_composite_ops);
377 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
378     sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
379     &mz_composite_ops);
380 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
381     sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
382     &mz_composite_ops);
383 static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
384 static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
385 static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
386 static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
387 static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
388 static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
389 static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
390 
/* Converts an mbuf class to a zalloc zone ID. */
__attribute__((always_inline))
static inline zone_id_t
m_class_to_zid(mbuf_class_t class)
{
	/*
	 * Relies on the static_asserts above: zone IDs are assigned
	 * contiguously in the same order as the mbuf class constants.
	 */
	return ZONE_ID_MBUF + class - MC_MBUF;
}
398 
/* Inverse of m_class_to_zid(): converts a zalloc zone ID to an mbuf class. */
__attribute__((always_inline))
static inline mbuf_class_t
m_class_from_zid(zone_id_t zid)
{
	return MC_MBUF + zid - ZONE_ID_MBUF;
}
405 
406 static thread_call_t mbuf_defunct_tcall;
407 static thread_call_t mbuf_drain_tcall;
408 #endif /* !CONFIG_MBUF_MCACHE */
409 
410 static int m_copyback0(struct mbuf **, int, int len, const void * __sized_by_or_null(len), int, int);
411 static struct mbuf *m_split0(struct mbuf *, int, int, int);
412 
413 /* flags for m_copyback0 */
414 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
415 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
416 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
417 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
418 
419 /*
420  * The structure that holds all mbuf class statistics exportable via sysctl.
421  * Similar to mbstat structure, the mb_stat structure is protected by the
422  * global mbuf lock.  It contains additional information about the classes
423  * that allows for a more accurate view of the state of the allocator.
424  */
425 struct mb_stat *mb_stat;
426 struct omb_stat *omb_stat;      /* For backwards compatibility */
427 
428 #define MB_STAT_SIZE(n) \
429 	__builtin_offsetof(mb_stat_t, mbs_class[n])
430 
431 #define OMB_STAT_SIZE(n) \
432 	__builtin_offsetof(struct omb_stat, mbs_class[n])
433 
434 /*
435  * The legacy structure holding all of the mbuf allocation statistics.
436  * The actual statistics used by the kernel are stored in the mbuf_table
437  * instead, and are updated atomically while the global mbuf lock is held.
438  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
439  * Unlike before, the kernel no longer relies on the contents of mbstat for
440  * its operations (e.g. cluster expansion) because the structure is exposed
441  * to outside and could possibly be modified, therefore making it unsafe.
442  * With the exception of the mbstat.m_mtypes array (see below), all of the
443  * statistics are updated as they change.
444  */
445 struct mbstat mbstat;
446 
447 #define MBSTAT_MTYPES_MAX \
448 	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
449 
450 #if !CONFIG_MBUF_MCACHE
451 static
452 #endif
453 mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
454 
/* Returns the external ref/flags descriptor attached to an mbuf (may be NULL). */
__private_extern__ inline struct ext_ref *
m_get_rfa(struct mbuf *m)
{
	return m->m_ext.ext_refflags;
}
460 
/*
 * Returns the external-buffer free routine of an mbuf, or NULL if none
 * was set.
 */
__private_extern__ inline m_ext_free_func_t
m_get_ext_free(struct mbuf *m)
{
	if (m->m_ext.ext_free == NULL) {
		return NULL;
	}

	/* Convert the stored pointer to the caller-visible function type. */
	return ptrauth_nop_cast(m_ext_free_func_t, m->m_ext.ext_free);
}
470 
#if !CONFIG_MBUF_MCACHE
static
#endif
caddr_t
m_get_ext_arg(struct mbuf *m)
{
	/* Opaque argument that will be passed to the ext_free callback. */
	return (caddr_t)m->m_ext.ext_arg;
}
479 
480 #if !CONFIG_MBUF_MCACHE
481 static
482 #endif
483 void
m_set_ext(struct mbuf * m,struct ext_ref * rfa,m_ext_free_func_t ext_free,caddr_t ext_arg)484 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
485     caddr_t ext_arg)
486 {
487 	VERIFY(m->m_flags & M_EXT);
488 	if (rfa != NULL) {
489 		m->m_ext.ext_refflags = rfa;
490 		if (ext_free != NULL) {
491 			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free);
492 			m->m_ext.ext_arg = ext_arg;
493 		} else {
494 			m->m_ext.ext_free = NULL;
495 			m->m_ext.ext_arg = NULL;
496 		}
497 	} else {
498 		if (ext_free != NULL) {
499 			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free);
500 			m->m_ext.ext_arg = ext_arg;
501 		} else {
502 			m->m_ext.ext_free = NULL;
503 			m->m_ext.ext_arg = NULL;
504 		}
505 		m->m_ext.ext_refflags = NULL;
506 	}
507 }
508 
#if !CONFIG_MBUF_MCACHE
static
#endif
void
mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size,
    m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa,
    u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag,
    u_int32_t priv, struct mbuf *pm)
{
	/*
	 * Attach external storage `buf' of `size' bytes to `m' and
	 * initialize all of the MEXT_* reference/flag fields.
	 */
	m->m_ext.ext_buf = buf;
	m->m_ext.ext_size = size;
	/* Data pointer starts at the beginning of the external buffer. */
	m->m_data = (uintptr_t)m->m_ext.ext_buf;
	m->m_len = 0;
	m->m_flags |= M_EXT;
	/* M_EXT must be set first: m_set_ext() VERIFYs it. */
	m_set_ext(m, rfa, free, free_arg);
	MEXT_MINREF(m) = min;
	MEXT_REF(m) = ref;
	MEXT_PREF(m) = pref;
	MEXT_FLAGS(m) = flag;
	MEXT_PRIV(m) = priv;
	MEXT_PMBUF(m) = pm;
}
531 
#if !CONFIG_MBUF_MCACHE
static
#endif
void
mbuf_mtypes_sync(void)
{
	mbuf_mtypes_t mtc;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Aggregate the per-CPU mbuf-type counters into a single
	 * snapshot: start from the master CPU's copy and fold in
	 * every secondary CPU.
	 */
	mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
	percpu_foreach_secondary(mtype, mbuf_mtypes) {
		for (int n = 0; n < MT_MAX; n++) {
			mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
		}
	}

	/* Mirror the aggregated counts into the legacy mbstat structure. */
	for (int n = 0; n < MT_MAX; n++) {
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	}
}
553 
554 #if !CONFIG_MBUF_MCACHE
555 static void
mbuf_stat_sync(void)556 mbuf_stat_sync(void)
557 {
558 	mb_class_stat_t *sp;
559 	int k;
560 	uint64_t drops = 0;
561 
562 
563 	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
564 
565 	for (k = 0; k < MC_MAX; k++) {
566 		const zone_id_t zid = m_class_to_zid(m_class(k));
567 		const zone_ref_t zone = zone_by_id(zid);
568 		struct zone_basic_stats stats = {};
569 
570 		sp = m_stats(k);
571 		zone_get_stats(zone, &stats);
572 		drops += stats.zbs_alloc_fail;
573 		sp->mbcl_total = stats.zbs_avail;
574 		sp->mbcl_active = stats.zbs_alloc;
575 		/*
576 		 * infree is what mcache considers the freelist (uncached)
577 		 * free_cnt contains all the cached/uncached elements
578 		 * in a zone.
579 		 */
580 		sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
581 		sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
582 		sp->mbcl_ctotal = sp->mbcl_total;
583 
584 		/* These stats are not available in zalloc. */
585 		sp->mbcl_alloc_cnt = 0;
586 		sp->mbcl_free_cnt = 0;
587 		sp->mbcl_notified = 0;
588 		sp->mbcl_purge_cnt = 0;
589 		sp->mbcl_slab_cnt = 0;
590 		sp->mbcl_release_cnt = 0;
591 
592 		/* zalloc caches are always on. */
593 		sp->mbcl_mc_state = MCS_ONLINE;
594 		sp->mbcl_mc_cached = stats.zbs_cached;
595 		/* These stats are not collected by zalloc. */
596 		sp->mbcl_mc_waiter_cnt = 0;
597 		sp->mbcl_mc_wretry_cnt = 0;
598 		sp->mbcl_mc_nwretry_cnt = 0;
599 	}
600 	/* Deduct clusters used in composite cache */
601 	m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
602 	    m_total(MC_MBUF_BIGCL) -
603 	    m_total(MC_MBUF_16KCL));
604 	m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
605 	m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
606 	m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
607 
608 	/* Update mbstat. */
609 	mbstat.m_mbufs = m_total(MC_MBUF);
610 	mbstat.m_clusters = m_total(MC_CL);
611 	mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
612 	mbstat.m_drops = drops;
613 	mbstat.m_bigclusters = m_total(MC_BIGCL);
614 	mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
615 }
616 #endif /* !CONFIG_MBUF_MCACHE */
617 
/*
 * sysctl handler exporting the legacy `struct mbstat' snapshot
 * (consumed e.g. by netstat).  Refreshes the statistics under the
 * global mbuf lock before copying them out.
 */
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();
	mbuf_mtypes_sync();
	lck_mtx_unlock(mbuf_mlock);

	/* Copy the (global) snapshot out after dropping the lock. */
	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
}
630 
/*
 * sysctl handler exporting the detailed per-class statistics
 * (`mb_stat').  32-bit callers receive the legacy `omb_stat' layout,
 * converted field by field from the native structure.
 */
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	any_ref_t statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		/* 32-bit caller: convert into the old structure layout. */
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(MC_MAX);
	} else {
		/* 64-bit caller: export the native structure directly. */
		statp = mb_stat;
		statsz = MB_STAT_SIZE(MC_MAX);
	}

	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, statp, statsz);
}
680 
#if !CONFIG_MBUF_MCACHE
/*
 * Sanity check on an mbuf about to be (re)initialized: it must be
 * MT_FREE, unless it is part of a paired mbuf; panics otherwise.
 */
static void
mbuf_mcheck(struct mbuf *m)
{
	if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) {
		panic("MCHECK: m_type=%d m=%p",
		    (u_int16_t)(m)->m_type, m);
	}
}
#endif /* !CONFIG_MBUF_MCACHE */
691 
/*
 * Clears the module-private scratch area of a packet-header mbuf.
 * Panics if the scratch area is guarded (PKTF_PRIV_GUARDED), since
 * it must not be modified in that state.
 */
static void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
708 
709 
/*
 * Resets every packet-header field of `m' to its pristine state,
 * including the classifier metadata, tag chain and scratch area.
 * Caller must have set M_PKTHDR.
 */
static void
mbuf_init_pkthdr(struct mbuf *m)
{
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.pkt_hdr = NULL;
	m->m_pkthdr.len = 0;
	m->m_pkthdr.csum_flags = 0;
	m->m_pkthdr.csum_data = 0;
	m->m_pkthdr.vlan_tag = 0;
	m->m_pkthdr.comp_gencnt = 0;
	m->m_pkthdr.pkt_crumbs = 0;
	m_classifier_init(m, 0);
	m_tag_init(m, 1);
	m_scratch_init(m);
}
725 
726 #if !CONFIG_MBUF_MCACHE
727 static
728 #endif
729 void
mbuf_init(struct mbuf * m,int pkthdr,int type)730 mbuf_init(struct mbuf *m, int pkthdr, int type)
731 {
732 	mbuf_mcheck(m);
733 	m->m_next = m->m_nextpkt = NULL;
734 	m->m_len = 0;
735 	m->m_type = type;
736 	if (pkthdr == 0) {
737 		m->m_data = (uintptr_t)m->m_dat;
738 		m->m_flags = 0;
739 	} else {
740 		m->m_data = (uintptr_t)m->m_pktdat;
741 		m->m_flags = M_PKTHDR;
742 		mbuf_init_pkthdr(m);
743 	}
744 }
745 
746 
747 #if !CONFIG_MBUF_MCACHE
748 /*
749  * The following functions are wrappers around mbuf
750  * allocation for zalloc.  They all have the prefix "mz"
751  * which was chosen to avoid conflicts with the mbuf KPIs.
752  *
753  * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
754  * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
755  * fail immediately if it has to take a mutex and that
756  * may cause packets to be dropped more frequently.
757  * In general, the mbuf subsystem can sustain grabbing a mutex
758  * during "non-blocking" allocation and that's the reason
759  * why Z_NOPAGEWAIT was chosen.
760  *
 * mbufs are elided (all pointers are removed) before they are
762  * returned to the cache. The exception are composite mbufs which
763  * are re-initialized on allocation.
764  */
__attribute__((always_inline))
static inline void
m_elide(struct mbuf *m)
{
	/* Scrub all pointer-bearing fields before caching the mbuf. */
	m->m_next = m->m_nextpkt = NULL;
	m->m_data = 0;
	memset(&m->m_ext, 0, sizeof(m->m_ext));
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.pkt_hdr = NULL;
	/*
	 * Temporarily claim M_PKTHDR so the pkthdr helpers below can run
	 * (m_scratch_init() VERIFYs the flag); it is dropped again once
	 * the tag chain and scratch area have been reset.
	 */
	m->m_flags |= M_PKTHDR;
	m_tag_init(m, 1);
	m->m_pkthdr.pkt_flags = 0;
	m_scratch_init(m);
	m->m_flags &= ~M_PKTHDR;
}
780 
__attribute__((always_inline))
static inline struct mbuf *
mz_alloc(zalloc_flags_t flags)
{
	/*
	 * Z_NOWAIT -> Z_NOPAGEWAIT: the XOR clears Z_NOWAIT while setting
	 * Z_NOPAGEWAIT (see the block comment above for the rationale).
	 * Fully blocking requests are marked Z_NOFAIL so they cannot
	 * return NULL.
	 */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
}
792 
/* Batched variant of mz_alloc(): returns up to `count' raw mbufs. */
__attribute__((always_inline))
static inline zstack_t
mz_alloc_n(uint32_t count, zalloc_flags_t flags)
{
	/* Same Z_NOWAIT -> Z_NOPAGEWAIT / Z_NOFAIL policy as mz_alloc(). */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
}
804 
__attribute__((always_inline))
static inline void
mz_free(struct mbuf *m)
{
#if KASAN
	/* Catch frees of pointers that do not belong to the mbuf zone. */
	zone_require(zone_by_id(ZONE_ID_MBUF), m);
#endif
	/* Scrub pointers instead of zero-filling the whole element. */
	m_elide(m);
	zfree_nozero(ZONE_ID_MBUF, m);
}
815 
/* Batched variant of mz_free(). */
__attribute__((always_inline))
static inline void
mz_free_n(zstack_t list)
{
	/* Callers of this function have already elided the mbuf. */
	zfree_nozero_n(ZONE_ID_MBUF, list);
}
823 
/* Allocates an external-storage refcount descriptor (struct ext_ref). */
__attribute__((always_inline))
static inline struct ext_ref *
mz_ref_alloc(zalloc_flags_t flags)
{
	/* Z_NOWAIT -> Z_NOPAGEWAIT, as in mz_alloc(). */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	}
	return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
}
833 
__attribute__((always_inline))
static inline void
mz_ref_free(struct ext_ref *rfa)
{
	/* Only fully released descriptors may be freed. */
	VERIFY(rfa->minref == rfa->refcnt);
#if KASAN
	/* Catch frees of pointers that do not belong to the ref zone. */
	zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
#endif
	zfree_nozero(ZONE_ID_MBUF_REF, rfa);
}
844 
/* Allocates a raw cluster from the given cluster zone (2K/4K/16K). */
__attribute__((always_inline))
static inline void * __bidi_indexable
mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
{
	void * p __unsafe_indexable;
	/* Same Z_NOWAIT -> Z_NOPAGEWAIT / Z_NOFAIL policy as mz_alloc(). */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	flags |= Z_NOZZC;

	/*
	 * N.B. Invoking `(zalloc_id)' directly, vs. via `zalloc_id' macro.
	 */
	p = (zalloc_id)(zid, flags);
	/* Re-attach the element's bounds for bounds-checked callers. */
	return __unsafe_forge_bidi_indexable(void *, p, zone_get_elem_size(zone_by_id(zid)));
}
863 
/* Returns a raw cluster to its zone. */
__attribute__((always_inline))
static inline void
mz_cl_free(zone_id_t zid, void *cl)
{
#if KASAN
	/* Catch frees of pointers that do not belong to `zid'. */
	zone_require(zone_by_id(zid), cl);
#endif
	zfree_nozero(zid, cl);
}
873 
/* Allocates `n' composite mbufs of the given class from its zcache. */
__attribute__((always_inline))
static inline zstack_t
mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
{
	/* Z_NOWAIT -> Z_NOPAGEWAIT, as in mz_alloc(). */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	}
	return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
	       &mz_composite_ops);
}
884 
885 __attribute__((always_inline))
886 static inline struct mbuf *
mz_composite_alloc(mbuf_class_t class,zalloc_flags_t flags)887 mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
888 {
889 	zstack_t list = {};
890 	list = mz_composite_alloc_n(class, 1, flags);
891 	if (!zstack_empty(list)) {
892 		return zstack_pop(&list);
893 	} else {
894 		return NULL;
895 	}
896 }
897 
/* Returns a batch of composite mbufs to their class's zcache. */
__attribute__((always_inline))
static inline void
mz_composite_free_n(mbuf_class_t class, zstack_t list)
{
	(zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
}
904 
905 __attribute__((always_inline))
906 static inline void
mz_composite_free(mbuf_class_t class,struct mbuf * m)907 mz_composite_free(mbuf_class_t class, struct mbuf *m)
908 {
909 	zstack_t list = {};
910 	zstack_push(&list, m);
911 	(zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
912 }
913 
914 /* Converts composite zone ID to the cluster zone ID. */
915 __attribute__((always_inline))
916 static inline zone_id_t
mz_cl_zid(zone_id_t zid)917 mz_cl_zid(zone_id_t zid)
918 {
919 	return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
920 }
921 
/*
 * zcache callback: builds one composite element for the composite zone
 * `zid' by allocating a raw cluster, an ext_ref and an mbuf, then
 * wiring them together with EXTF_COMPOSITE set.  Returns NULL (after
 * unwinding partial allocations) if any of the three pieces fails.
 */
static void *
mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct mbuf *m = NULL;
	struct ext_ref *rfa = NULL;
	void *cl = NULL;

	/* Allocate the three pieces; unwind in reverse order on failure. */
	cl = mz_cl_alloc(cl_zid, flags);
	if (__improbable(cl == NULL)) {
		goto out;
	}
	rfa = mz_ref_alloc(flags);
	if (__improbable(rfa == NULL)) {
		goto out_free_cl;
	}
	m = mz_alloc(flags);
	if (__improbable(m == NULL)) {
		goto out_free_rfa;
	}
	mbuf_init(m, 0, MT_FREE);
	/* Pick the init variant matching this zone's cluster size. */
	if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
		MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
		MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else {
		MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	}
	VERIFY(m->m_flags == M_EXT);
	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

	return m;
out_free_rfa:
	mz_ref_free(rfa);
out_free_cl:
	mz_cl_free(cl_zid, cl);
out:
	return NULL;
}
961 
/*
 * zcache callback: marks a cached composite mbuf as live again before
 * it is handed out.  Under KASAN the constituent cluster and ext_ref
 * pointers may be re-tagged by zcache_mark_valid(), so every cached
 * copy of them inside the mbuf is refreshed.
 */
static void *
mz_composite_mark_valid(zone_id_t zid, void *p)
{
	mbuf_ref_t m = p;

	m = zcache_mark_valid_single(zone_by_id(ZONE_ID_MBUF), m);
#if KASAN
	struct ext_ref *rfa __single = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = __unsafe_forge_bidi_indexable(void *,
	    zcache_mark_valid(zone_by_id(cl_zid), cl),
	    zone_get_elem_size(zone_by_id(cl_zid)));
	rfa = __unsafe_forge_single(struct ext_ref *,
	    zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa));
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	/*
	 * NOTE(review): self-assignment looks like a no-op; presumably it
	 * re-establishes the ext_buf/ext_size pairing that -fbounds-safety
	 * requires when ext_buf is reassigned — confirm before removing.
	 */
	m->m_ext.ext_size = m->m_ext.ext_size;
	m->m_ext.ext_refflags = rfa;
#else
#pragma unused(zid)
#endif
	VERIFY(MBUF_IS_COMPOSITE(m));

	return m;
}
989 
/*
 * zcache callback: marks a composite mbuf as quarantined when it is
 * returned to the cache.  Only fully released composites (refcnt at
 * its minimum) may be cached.  Mirror image of
 * mz_composite_mark_valid() above.
 */
static void *
mz_composite_mark_invalid(zone_id_t zid, void *p)
{
	mbuf_ref_t m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
#if KASAN
	/* Refresh the re-tagged cluster and ext_ref pointers, cf. mark_valid. */
	struct ext_ref *rfa __single = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = __unsafe_forge_bidi_indexable(void *,
	    zcache_mark_invalid(zone_by_id(cl_zid), cl),
	    zone_get_elem_size(zone_by_id(cl_zid)));
	rfa = __unsafe_forge_single(struct ext_ref *,
	    zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa));
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	/*
	 * NOTE(review): no-op self-assignment; presumably required by the
	 * -fbounds-safety ext_buf/ext_size pairing — confirm.
	 */
	m->m_ext.ext_size = m->m_ext.ext_size;
	m->m_ext.ext_refflags = rfa;
#else
#pragma unused(zid)
#endif

	return zcache_mark_invalid_single(zone_by_id(ZONE_ID_MBUF), m);
}
1017 
/*
 * zcache callback: tears a composite mbuf apart, returning the raw
 * cluster, the ext_ref and the mbuf itself to their backing zones.
 */
static void
mz_composite_destroy(zone_id_t zid, void *p)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct ext_ref *rfa = NULL;
	mbuf_ref_t m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));

	/* Clear all ext_ref bookkeeping before releasing it. */
	MEXT_MINREF(m) = 0;
	MEXT_REF(m) = 0;
	MEXT_PREF(m) = 0;
	MEXT_FLAGS(m) = 0;
	MEXT_PRIV(m) = 0;
	MEXT_PMBUF(m) = NULL;

	/* Detach the ext_ref from the mbuf, keeping a local copy to free. */
	rfa = m_get_rfa(m);
	m_set_ext(m, NULL, NULL, NULL);

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	/* Release cluster first, then clear the buf/size pair in the mbuf. */
	mz_cl_free(cl_zid, m->m_ext.ext_buf);
	m->m_ext.ext_size = 0;
	m->m_ext.ext_buf = NULL;
	mz_ref_free(rfa);
	mz_free(m);
}
1047 #endif /* !CONFIG_MBUF_MCACHE */
1048 
#if !CONFIG_MBUF_MCACHE
static
#endif
void
m_incref(struct mbuf *m)
{
	/* Atomically take an additional reference on the external cluster. */
	uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);

	/* A result of 0 would mean the 16-bit refcount wrapped. */
	VERIFY(new != 0);
	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
	}
}
1067 
#if !CONFIG_MBUF_MCACHE
static
#endif
uint16_t
m_decref(struct mbuf *m)
{
	/*
	 * Drop one reference on the external cluster and return the new
	 * count; acq_rel ordering publishes prior accesses before a
	 * caller that observes the minimum count proceeds to free.
	 */
	VERIFY(MEXT_REF(m) != 0);

	return os_atomic_dec(&MEXT_REF(m), acq_rel);
}
1078 
/*
 * By default, mbuf_limit is enabled.  Except when serverperfmode is set;
 * it can also be overridden via the "mbuf_limit" boot-arg (see mbinit()).
 * When disabled, per-class max limits become INT_MAX in mbuf_table_init().
 */
static int mbuf_limit = 1;
1081 
#if !CONFIG_MBUF_MCACHE
static
#endif
/*
 * Carves the nmbclusters pool into jumbo (njcl) and 2KB/4KB (nclusters)
 * regions, then fills in per-class min/max limits, element sizes and
 * names in mbuf_table[], plus the legacy mbstat.  Called from mbinit().
 */
void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	omb_stat = zalloc_permanent(OMB_STAT_SIZE(MC_MAX),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(MC_MAX),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = MC_MAX;
	for (m = 0; m < MC_MAX; m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo
	 * clusters; we do this only on platforms where jumbo
	 * cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);

	/*
	 * Each jumbo cluster takes 8 2KB clusters, so make
	 * sure that the pool size is evenly divisible by 8;
	 * njcl is in 2KB unit, hence treated as such.
	 */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

	/* Update nclusters with rounded down value of njcl */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size.  On these platforms, the remaining is used for 2KB
	 * and 4KB clusters.  On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
	 * cluster can either be splitted into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG);  /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	if (mbuf_limit) {
		m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
	} else {
		m_maxlimit(MC_CL) = INT_MAX;
	}
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	if (mbuf_limit) {
		m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
	} else {
		m_maxlimit(MC_BIGCL) = INT_MAX;
	}
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF) = s * NMBPCL;       /* in mbuf unit */
	} else {
		m_maxlimit(MC_MBUF) = INT_MAX;
	}
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
	snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	} else {
		m_maxlimit(MC_MBUF_CL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	} else {
		m_maxlimit(MC_MBUF_BIGCL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
	} else {
		m_maxlimit(MC_16KCL) = INT_MAX;
	}
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	} else {
		m_maxlimit(MC_MBUF_16KCL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1239 
1240 #if !CONFIG_MBUF_MCACHE
1241 static
1242 #endif
1243 int
mbuf_get_class(struct mbuf * m)1244 mbuf_get_class(struct mbuf *m)
1245 {
1246 	if (m->m_flags & M_EXT) {
1247 		uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
1248 		m_ext_free_func_t m_free_func = m_get_ext_free(m);
1249 
1250 		if (m_free_func == NULL) {
1251 			if (composite) {
1252 				return MC_MBUF_CL;
1253 			} else {
1254 				return MC_CL;
1255 			}
1256 		} else if (m_free_func == m_bigfree) {
1257 			if (composite) {
1258 				return MC_MBUF_BIGCL;
1259 			} else {
1260 				return MC_BIGCL;
1261 			}
1262 		} else if (m_free_func == m_16kfree) {
1263 			if (composite) {
1264 				return MC_MBUF_16KCL;
1265 			} else {
1266 				return MC_16KCL;
1267 			}
1268 		}
1269 	}
1270 
1271 	return MC_MBUF;
1272 }
1273 
#if !CONFIG_MBUF_MCACHE
/*
 * Returns true when the zone backing `m''s class has reached
 * mb_memory_pressure_percentage percent of the class max limit.
 * Always false when mbuf_limit is disabled (limits are INT_MAX then).
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	struct zone_basic_stats stats = {};
	zone_ref_t zone;
	zone_id_t zid;
	int mclass;

	if (mbuf_limit == 0) {
		return false;
	}

	mclass = mbuf_get_class(m);

	/*
	 * Grab the statistics from zalloc.
	 * We can't call mbuf_stat_sync() since that requires a lock.
	 */
	zid = m_class_to_zid(m_class(mclass));
	zone = zone_by_id(zid);

	zone_get_stats(zone, &stats);
	/* In-use elements = available minus free. */
	if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		os_log(OS_LOG_DEFAULT,
		    "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
		    __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
		return true;
	}

	return false;
}
#endif /* !CONFIG_MBUF_MCACHE */
1307 
#if defined(__LP64__)
/*
 * Table mapping physical memory size to the default mbuf pool size
 * (in bytes); scanned by mbuf_default_ncl() below.
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /*  1 GB */, (64 << MBSHIFT) /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (96 << MBSHIFT) /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }        /* sentinel */
};
#endif /* __LP64__ */
1323 
1324 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)1325 mbuf_default_ncl(uint64_t mem)
1326 {
1327 #if !defined(__LP64__)
1328 	unsigned int n;
1329 	/*
1330 	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1331 	 */
1332 	if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
1333 		n = 32768;
1334 	}
1335 #else
1336 	unsigned int n, i;
1337 	/*
1338 	 * 64-bit kernel (mbuf pool size based on table).
1339 	 */
1340 	n = ncl_table[0].nt_mbpool;
1341 	for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
1342 		if (mem < ncl_table[i].nt_maxmem) {
1343 			break;
1344 		}
1345 		n = ncl_table[i].nt_mbpool;
1346 	}
1347 	n >>= MCLSHIFT;
1348 #endif /* !__LP64__ */
1349 	return n;
1350 }
1351 
#if !CONFIG_MBUF_MCACHE
/*
 * One-time mbuf subsystem initialization: sanity-checks the exported
 * MBUF_* constants against their kernel-private counterparts, sizes
 * the pools via mbuf_table_init(), configures the backing zones
 * (exhaustion limits and reserves), clamps sb_max to the pool size,
 * and allocates the watchdog thread calls.
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	static_assert(MBUF_EXT == M_EXT);
	static_assert(MBUF_PKTHDR == M_PKTHDR);
	static_assert(MBUF_EOR == M_EOR);
	static_assert(MBUF_LOOP == M_LOOP);
	static_assert(MBUF_BCAST == M_BCAST);
	static_assert(MBUF_MCAST == M_MCAST);
	static_assert(MBUF_FRAG == M_FRAG);
	static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	static_assert(MBUF_LASTFRAG == M_LASTFRAG);
	static_assert(MBUF_PROMISC == M_PROMISC);
	static_assert(MBUF_HASFCS == M_HASFCS);

	static_assert(MBUF_TYPE_FREE == MT_FREE);
	static_assert(MBUF_TYPE_DATA == MT_DATA);
	static_assert(MBUF_TYPE_HEADER == MT_HEADER);
	static_assert(MBUF_TYPE_SOCKET == MT_SOCKET);
	static_assert(MBUF_TYPE_PCB == MT_PCB);
	static_assert(MBUF_TYPE_RTABLE == MT_RTABLE);
	static_assert(MBUF_TYPE_HTABLE == MT_HTABLE);
	static_assert(MBUF_TYPE_ATABLE == MT_ATABLE);
	static_assert(MBUF_TYPE_SONAME == MT_SONAME);
	static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	static_assert(MBUF_TYPE_FTABLE == MT_FTABLE);
	static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	static_assert(MBUF_TYPE_IFADDR == MT_IFADDR);
	static_assert(MBUF_TYPE_CONTROL == MT_CONTROL);
	static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	static_assert(MBUF_CSUM_REQ_IP == CSUM_IP);
	static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	static_assert(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	static_assert(MBUF_WAITOK == M_WAIT);
	static_assert(MBUF_DONTWAIT == M_DONTWAIT);
	static_assert(MBUF_COPYALL == M_COPYALL);

	static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t)));

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* "mbuf_limit" boot-arg overrides the default; serverperfmode disables it. */
	PE_parse_boot_argn("mbuf_limit", &mbuf_limit, sizeof(mbuf_limit));
	if (serverperfmode) {
		mbuf_limit = 0;
	}

	/* Setup the mbuf table */
	mbuf_table_init();

	static_assert(sizeof(struct mbuf) == _MSIZE);

	/*
	 * We have yet to create the non composite zones
	 * and thus we haven't asked zalloc to allocate
	 * anything yet, which means that at this point
	 * m_total() is zero.  Once we create the zones and
	 * raise the reserve, m_total() will be calculated,
	 * but until then just assume that we will have
	 * at least the minium limit allocated.
	 */
	m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
	m_total(MC_CL) = m_minlimit(MC_CL);

	for (m = 0; m < MC_MAX; m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	/* Create the cache for each class */
	for (m = 0; m < MC_MAX; m++) {
		if (!MBUF_CLASS_COMPOSITE(m)) {
			zone_ref_t zone = zone_by_id(m_class_to_zid(m));

			if (mbuf_limit) {
				zone_set_exhaustible(zone, m_maxlimit(m), false);
			}
			zone_raise_reserve(zone, m_minlimit(m));
			/*
			 * Pretend that we have allocated m_total() items
			 * at this point.  zalloc will eventually do that
			 * but it's an async operation.
			 */
			m_total(m) = m_minlimit(m);
		}
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have atleast 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* One-shot watchdog thread calls (defunct and composite-drain). */
	mbuf_defunct_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_defunct,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	mbuf_drain_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
1517 
1518 static inline struct mbuf *
m_get_common(int wait,short type,int hdr)1519 m_get_common(int wait, short type, int hdr)
1520 {
1521 	struct mbuf *m;
1522 
1523 	m = mz_alloc(wait);
1524 	if (m != NULL) {
1525 		mbuf_init(m, hdr, type);
1526 		mtype_stat_inc(type);
1527 		mtype_stat_dec(MT_FREE);
1528 	}
1529 	return m;
1530 }
1531 #endif /* !CONFIG_MBUF_MCACHE */
1532 
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
/* The retry variants are plain aliases of the allocators above. */
#define _M_RETRY(wait, type)    _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type)     ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
1543 
/* Allocates a plain (non-pkthdr) mbuf of the given type. */
struct mbuf *
m_get(int wait, int type)
{
	return _M_GET(wait, type);
}
1549 
/* Allocates an mbuf of the given type with a packet header. */
struct mbuf *
m_gethdr(int wait, int type)
{
	return _M_GETHDR(wait, type);
}
1555 
/* Historical retry entry point; alias of m_get() (see _M_RETRY). */
struct mbuf *
m_retry(int wait, int type)
{
	return _M_RETRY(wait, type);
}
1561 
/* Historical retry entry point; alias of m_gethdr() (see _M_RETRYHDR). */
struct mbuf *
m_retryhdr(int wait, int type)
{
	return _M_RETRYHDR(wait, type);
}
1567 
1568 struct mbuf *
m_getclr(int wait,int type)1569 m_getclr(int wait, int type)
1570 {
1571 	struct mbuf *m;
1572 
1573 	_MGET(m, wait, type);
1574 	if (m != NULL) {
1575 		bzero(mtod(m, caddr_t), MLEN);
1576 	}
1577 	return m;
1578 }
1579 
#if !CONFIG_MBUF_MCACHE
static
#endif
/*
 * Drops the paired reference held by a paired mbuf + cluster.  Returns
 * nonzero while the pairing remains in effect (the caller must not
 * free the cluster); returns 0 once the unpair has occurred, telling
 * the caller to drop the cluster reference held for the paired mbuf.
 */
int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	os_atomic_thread_fence(seq_cst);
	if (MEXT_PMBUF(m) == m) {
		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
		if (prefcnt > 1) {
			/* Still paired and referenced elsewhere. */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired reference: invoke the ext free routine. */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable.  The release store also
			 * publishes the updates above.
			 */
			os_atomic_store(&MEXT_PMBUF(m), (mbuf_ref_t)0, release);

			/*
			 * Rebind the ext free routine to the built-in one
			 * matching the cluster size, dropping the paired arg.
			 */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
1649 
#if !CONFIG_MBUF_MCACHE
/*
 * Frees mbuf `m', releasing its external storage when this was the
 * last reference, and returns the next mbuf in the chain.  Panics on
 * double free (MT_FREE).
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* A paired mbuf stays alive while the pairing is in effect. */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference to a non-composite cluster: free it. */
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mz_ref_free(m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
			return n;
		}
	}

	mtype_stat_dec(m->m_type);
	mtype_stat_inc(MT_FREE);

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mz_free(m);

	return n;
}
1736 
/*
 * Attaches external buffer `extbuf' (length `extsize', freed via
 * `extfree'/`extarg') to mbuf `m', allocating a fresh pkthdr mbuf when
 * `m' is NULL.  Any storage the mbuf previously referenced is released
 * first.  With `pair' nonzero, the mbuf and cluster are bound together
 * as a paired pair (EXTF_PAIRED).  Returns NULL on allocation failure
 * or when `m' is already paired.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize),
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster.  Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference to a non-composite cluster: free it. */
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	if (rfa == NULL &&
	    (rfa = mz_ref_alloc(wait)) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		mext_init(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		/* Paired: the mbuf itself becomes the ext arg and MEXT_PMBUF. */
		mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
1833 
/*
 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
 * Returns a fully initialized mbuf + 2KB cluster of the given type, with
 * a pkthdr when M_PKTHDR is set in `flags'; NULL on failure.
 */
struct mbuf *
m_getcl(int wait, int type, int flags)
{
	struct mbuf *m = NULL;
	int hdr = (flags & M_PKTHDR);

	m = mz_composite_alloc(MC_MBUF_CL, wait);
	if (m != NULL) {
		u_int16_t flag;
		struct ext_ref *rfa;
		void *cl;

		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		/* Save the composite's pieces before re-initializing the mbuf. */
		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);

		flag = MEXT_FLAGS(m);

		mbuf_init(m, hdr, type);
		MBUF_CL_INIT(m, cl, rfa, 1, flag);

		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
	}
	return m;
}
1867 
/*
 * m_mclget() add an mbuf cluster to a normal mbuf.
 * Returns `m' in all cases; on allocation failure the mbuf is returned
 * without a cluster (callers must check M_EXT / ext_buf).
 */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;
	char *bytes = NULL;

	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}

	if ((bytes = m_mclalloc(wait)) != NULL) {
		m->m_ext.ext_size = MCLBYTES;
		m->m_ext.ext_buf = bytes;
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* Keep the ext buf/size pair consistent, then drop the ref. */
		m->m_ext.ext_size = 0;
		m->m_ext.ext_buf = NULL;
		mz_ref_free(rfa);
	}

	return m;
}
1891 
/* Allocate an mbuf cluster (2KB); returns NULL on failure. */
char *
__sized_by_or_null(MCLBYTES)
m_mclalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
}
1899 
/* Free an mbuf cluster (2KB) back to its zone. */
void
m_mclfree(caddr_t p)
{
	mz_cl_free(ZONE_ID_CLUSTER_2K, p);
}
1906 #endif /* !CONFIG_MBUF_MCACHE */
1907 
1908 /*
1909  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
1910  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
1911  */
1912 int
m_mclhasreference(struct mbuf * m)1913 m_mclhasreference(struct mbuf *m)
1914 {
1915 	if (!(m->m_flags & M_EXT)) {
1916 		return 0;
1917 	}
1918 
1919 	ASSERT(m_get_rfa(m) != NULL);
1920 
1921 	return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
1922 }
1923 
1924 #if !CONFIG_MBUF_MCACHE
/* Allocate a 4KB mbuf cluster; returns NULL on failure */
__private_extern__ char *
__sized_by_or_null(MBIGCLBYTES)
m_bigalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
}
1931 
/*
 * Free a 4KB mbuf cluster back to its zone.
 * size/arg are part of the m_ext_free_func_t signature but unused here.
 */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mz_cl_free(ZONE_ID_CLUSTER_4K, p);
}
1937 
1938 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
1939 __private_extern__ struct mbuf *
m_mbigget(struct mbuf * m,int wait)1940 m_mbigget(struct mbuf *m, int wait)
1941 {
1942 	struct ext_ref *rfa = NULL;
1943 	void * bytes = NULL;
1944 
1945 	if ((rfa = mz_ref_alloc(wait)) == NULL) {
1946 		return m;
1947 	}
1948 
1949 	if ((bytes = m_bigalloc(wait)) != NULL) {
1950 		m->m_ext.ext_size = MBIGCLBYTES;
1951 		m->m_ext.ext_buf = bytes;
1952 		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
1953 	} else {
1954 		m->m_ext.ext_size = 0;
1955 		m->m_ext.ext_buf = NULL;
1956 		mz_ref_free(rfa);
1957 	}
1958 
1959 	return m;
1960 }
1961 
/* Allocate a 16KB (jumbo) mbuf cluster; returns NULL on failure */
__private_extern__ char *
__sized_by_or_null(M16KCLBYTES)
m_16kalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
}
1968 
/*
 * Free a 16KB mbuf cluster back to its zone.
 * size/arg are part of the m_ext_free_func_t signature but unused here.
 */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mz_cl_free(ZONE_ID_CLUSTER_16K, p);
}
1974 
1975 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
1976 __private_extern__ struct mbuf *
m_m16kget(struct mbuf * m,int wait)1977 m_m16kget(struct mbuf *m, int wait)
1978 {
1979 	struct ext_ref *rfa = NULL;
1980 	void *bytes = NULL;
1981 
1982 	if ((rfa = mz_ref_alloc(wait)) == NULL) {
1983 		return m;
1984 	}
1985 
1986 	if ((bytes = m_16kalloc(wait)) != NULL) {
1987 		m->m_ext.ext_size = M16KCLBYTES;
1988 		m->m_ext.ext_buf = bytes;
1989 		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
1990 	} else {
1991 		m->m_ext.ext_size = 0;
1992 		m->m_ext.ext_buf = NULL;
1993 		mz_ref_free(rfa);
1994 	}
1995 
1996 	return m;
1997 }
1998 #endif /* !CONFIG_MBUF_MCACHE */
1999 
/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * Ownership of the tag chain transfers to "to" via the struct copy, so
 * "from"'s tag/classifier/scratch state is re-initialized (not freed)
 * afterwards.  "from" keeps its M_PKTHDR flag.
 */
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
	VERIFY(from->m_flags & M_PKTHDR);

	if (to->m_flags & M_PKTHDR) {
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to);
	}
	to->m_pkthdr = from->m_pkthdr;          /* especially tags */
	m_classifier_init(from, 0);             /* purge classifier info */
	m_tag_init(from, 1);                    /* purge all tags from src */
	m_scratch_init(from);                   /* clear src scratch area */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* no external storage: point data at the internal buffer */
		to->m_data = (uintptr_t)to->m_pktdat;
	}
}
2022 
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 * Returns the result of m_tag_copy_chain() (the deep tag copy may fail
 * under "how" allocation semantics).
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	VERIFY(from->m_flags & M_PKTHDR);

	if (to->m_flags & M_PKTHDR) {
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to);
	}
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* no external storage: point data at the internal buffer */
		to->m_data = (uintptr_t)to->m_pktdat;
	}
	to->m_pkthdr = from->m_pkthdr;
	/* clear TX completion flag so the callback is not called in the copy */
	to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
	m_tag_init(to, 0);                      /* preserve dst static tags */
	return m_tag_copy_chain(to, from, how);
}
2047 
2048 void
m_copy_pftag(struct mbuf * to,struct mbuf * from)2049 m_copy_pftag(struct mbuf *to, struct mbuf *from)
2050 {
2051 	memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
2052 #if PF_ECN
2053 	m_pftag(to)->pftag_hdr = NULL;
2054 	m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
2055 #endif /* PF_ECN */
2056 }
2057 
2058 void
m_copy_necptag(struct mbuf * to,struct mbuf * from)2059 m_copy_necptag(struct mbuf *to, struct mbuf *from)
2060 {
2061 	memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
2062 }
2063 
/*
 * Reset the classifier metadata in an mbuf's pkthdr.  Only the pkt_flags
 * bits in the caller-supplied pktf_mask survive; everything else is
 * cleared, except for state guarded by flags that remain set after
 * masking (loopback service class, interface info, timestamp).
 */
void
m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_proto = 0;
	m->m_pkthdr.pkt_flowsrc = 0;
	m->m_pkthdr.pkt_flowid = 0;
	m->m_pkthdr.pkt_ext_flags = 0;
	m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
	/* preserve service class and interface info for loopback packets */
	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		(void) m_set_service_class(m, MBUF_SC_BE);
	}
	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
		m->m_pkthdr.pkt_ifainfo = 0;
	}
	/*
	 * Preserve timestamp if requested
	 */
	if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
		m->m_pkthdr.pkt_timestamp = 0;
	}
}
2088 
/*
 * Copy the classifier metadata (protocol, flow id/source, pkt flags,
 * service class, interface info) from one pkthdr mbuf to another.
 */
void
m_copy_classifier(struct mbuf *to, struct mbuf *from)
{
	VERIFY(to->m_flags & M_PKTHDR);
	VERIFY(from->m_flags & M_PKTHDR);

	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
	/* go through the setter so the service class is validated */
	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
}
2104 
2105 #if !CONFIG_MBUF_MCACHE
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 * On success *num_needed is updated with the number actually produced.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
	mbuf_ref_t m = NULL;
	mbuf_ref_t *np, top;
	unsigned int pnum, needed = *num_needed;
	zstack_t mp_list = {};
	mbuf_class_t class = MC_MBUF_CL;
	u_int16_t flag;
	struct ext_ref *rfa;
	void *cl;

	/* Only the three supported cluster sizes are accepted */
	ASSERT(bufsize == m_maxsize(MC_CL) ||
	    bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	top = NULL;
	np = &top;
	pnum = 0;

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (wait & Z_NOWAIT)) {
		wait &= ~Z_NOWAIT;
		wait |= Z_NOPAGEWAIT;
	}

	/* Allocate the composite mbuf + cluster elements from the cache */
	if (bufsize == m_maxsize(MC_CL)) {
		class = MC_MBUF_CL;
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		class = MC_MBUF_BIGCL;
	} else {
		class = MC_MBUF_16KCL;
	}
	mp_list = mz_composite_alloc_n(class, needed, wait);
	needed = zstack_count(mp_list);

	/* Re-initialize each cached composite and link it into the chain */
	for (pnum = 0; pnum < needed; pnum++) {
		m = zstack_pop(&mp_list);

		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m));

		flag = MEXT_FLAGS(m);

		mbuf_init(m, num_with_pkthdrs, MT_DATA);
		if (bufsize == m_maxsize(MC_16KCL)) {
			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
		} else if (bufsize == m_maxsize(MC_BIGCL)) {
			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
		} else {
			MBUF_CL_INIT(m, cl, rfa, 1, flag);
		}

		if (num_with_pkthdrs > 0) {
			--num_with_pkthdrs;
		}

		/* pkthdr mbufs chain via m_nextpkt, the rest via m_next */
		*np = m;
		if (num_with_pkthdrs > 0) {
			np = &m->m_nextpkt;
		} else {
			np = &m->m_next;
		}
	}
	ASSERT(pnum != *num_needed || zstack_empty(mp_list));
	if (!zstack_empty(mp_list)) {
		mz_composite_free_n(class, mp_list);
	}
	if (pnum > 0) {
		mtype_stat_add(MT_DATA, pnum);
		mtype_stat_sub(MT_FREE, pnum);
	}

	/* All-or-nothing request that came up short: free and fail */
	if (wantall && (pnum != *num_needed)) {
		if (top != NULL) {
			m_freem_list(top);
		}
		return NULL;
	}

	if (pnum > *num_needed) {
		printf("%s: File a radar related to <rdar://10146739>. \
			needed = %u, pnum = %u, num_needed = %u \n",
		    __func__, needed, pnum, *num_needed);
	}
	*num_needed = pnum;

	return top;
}
2214 
2215 /*
2216  * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
2217  * wantall is not set, return whatever number were available.  The size of
2218  * each mbuf in the list is controlled by the parameter packetlen.  Each
2219  * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
2220  * in the chain is called a segment.  If maxsegments is not null and the
2221  * value pointed to is not null, this specify the maximum number of segments
2222  * for a chain of mbufs.  If maxsegments is zero or the value pointed to
2223  * is zero the caller does not have any restriction on the number of segments.
2224  * The actual  number of segments of a mbuf chain is return in the value
2225  * pointed to by maxsegments.
2226  */
2227 __private_extern__ struct mbuf *
m_allocpacket_internal(unsigned int * numlist,size_t packetlen,unsigned int * maxsegments,int wait,int wantall,size_t wantsize)2228 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
2229     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
2230 {
2231 	mbuf_ref_t *np, top, first = NULL;
2232 	size_t bufsize, r_bufsize;
2233 	unsigned int num = 0;
2234 	unsigned int nsegs = 0;
2235 	unsigned int needed = 0, resid;
2236 	zstack_t mp_list = {}, rmp_list = {};
2237 	mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
2238 
2239 	if (*numlist == 0) {
2240 		os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
2241 		return NULL;
2242 	}
2243 
2244 	top = NULL;
2245 	np = &top;
2246 
2247 	if (wantsize == 0) {
2248 		if (packetlen <= MINCLSIZE) {
2249 			bufsize = packetlen;
2250 		} else if (packetlen > m_maxsize(MC_CL)) {
2251 			/* Use 4KB if jumbo cluster pool isn't available */
2252 			if (packetlen <= m_maxsize(MC_BIGCL)) {
2253 				bufsize = m_maxsize(MC_BIGCL);
2254 			} else {
2255 				bufsize = m_maxsize(MC_16KCL);
2256 			}
2257 		} else {
2258 			bufsize = m_maxsize(MC_CL);
2259 		}
2260 	} else if (wantsize == m_maxsize(MC_CL) ||
2261 	    wantsize == m_maxsize(MC_BIGCL) ||
2262 	    (wantsize == m_maxsize(MC_16KCL))) {
2263 		bufsize = wantsize;
2264 	} else {
2265 		*numlist = 0;
2266 		os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
2267 		return NULL;
2268 	}
2269 
2270 	if (bufsize <= MHLEN) {
2271 		nsegs = 1;
2272 	} else if (bufsize <= MINCLSIZE) {
2273 		if (maxsegments != NULL && *maxsegments == 1) {
2274 			bufsize = m_maxsize(MC_CL);
2275 			nsegs = 1;
2276 		} else {
2277 			nsegs = 2;
2278 		}
2279 	} else if (bufsize == m_maxsize(MC_16KCL)) {
2280 		nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
2281 	} else if (bufsize == m_maxsize(MC_BIGCL)) {
2282 		nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
2283 	} else {
2284 		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
2285 	}
2286 	if (maxsegments != NULL) {
2287 		if (*maxsegments && nsegs > *maxsegments) {
2288 			*maxsegments = nsegs;
2289 			*numlist = 0;
2290 			os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
2291 			return NULL;
2292 		}
2293 		*maxsegments = nsegs;
2294 	}
2295 
2296 	/*
2297 	 * The caller doesn't want all the requested buffers; only some.
2298 	 * Try hard to get what we can, but don't block.  This effectively
2299 	 * overrides MCR_SLEEP, since this thread will not go to sleep
2300 	 * if we can't get all the buffers.
2301 	 */
2302 	if (!wantall || (wait & Z_NOWAIT)) {
2303 		wait &= ~Z_NOWAIT;
2304 		wait |= Z_NOPAGEWAIT;
2305 	}
2306 
2307 	/*
2308 	 * Simple case where all elements in the lists/chains are mbufs.
2309 	 * Unless bufsize is greater than MHLEN, each segment chain is made
2310 	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
2311 	 * of 2 mbufs; the second one is used for the residual data, i.e.
2312 	 * the remaining data that cannot fit into the first mbuf.
2313 	 */
2314 	if (bufsize <= MINCLSIZE) {
2315 		/* Allocate the elements in one shot from the mbuf cache */
2316 		ASSERT(bufsize <= MHLEN || nsegs == 2);
2317 		class = MC_MBUF;
2318 		mp_list = mz_alloc_n((*numlist) * nsegs, wait);
2319 		needed = zstack_count(mp_list);
2320 
2321 		/*
2322 		 * The number of elements must be even if we are to use an
2323 		 * mbuf (instead of a cluster) to store the residual data.
2324 		 * If we couldn't allocate the requested number of mbufs,
2325 		 * trim the number down (if it's odd) in order to avoid
2326 		 * creating a partial segment chain.
2327 		 */
2328 		if (bufsize > MHLEN && (needed & 0x1)) {
2329 			needed--;
2330 		}
2331 
2332 		while (num < needed) {
2333 			mbuf_ref_t m = NULL;
2334 
2335 			m = zstack_pop(&mp_list);
2336 			ASSERT(m != NULL);
2337 
2338 			mbuf_init(m, 1, MT_DATA);
2339 			num++;
2340 			if (bufsize > MHLEN) {
2341 				/* A second mbuf for this segment chain */
2342 				m->m_next = zstack_pop(&mp_list);
2343 
2344 				ASSERT(m->m_next != NULL);
2345 
2346 				mbuf_init(m->m_next, 0, MT_DATA);
2347 				num++;
2348 			}
2349 			*np = m;
2350 			np = &m->m_nextpkt;
2351 		}
2352 		ASSERT(num != *numlist || zstack_empty(mp_list));
2353 
2354 		if (num > 0) {
2355 			mtype_stat_add(MT_DATA, num);
2356 			mtype_stat_sub(MT_FREE, num);
2357 		}
2358 		num /= nsegs;
2359 
2360 		/* We've got them all; return to caller */
2361 		if (num == *numlist) {
2362 			return top;
2363 		}
2364 
2365 		goto fail;
2366 	}
2367 
2368 	/*
2369 	 * Complex cases where elements are made up of one or more composite
2370 	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
2371 	 * be illustrated as follows:
2372 	 *
2373 	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
2374 	 *
2375 	 * Every composite mbuf + cluster element comes from the intermediate
2376 	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
2377 	 * the last composite element will come from the MC_MBUF_CL cache,
2378 	 * unless the residual data is larger than 2KB where we use the
2379 	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
2380 	 * data is defined as extra data beyond the first element that cannot
2381 	 * fit into the previous element, i.e. there is no residual data if
2382 	 * the chain only has 1 segment.
2383 	 */
2384 	r_bufsize = bufsize;
2385 	resid = packetlen > bufsize ? packetlen % bufsize : 0;
2386 	if (resid > 0) {
2387 		/* There is residual data; figure out the cluster size */
2388 		if (wantsize == 0 && packetlen > MINCLSIZE) {
2389 			/*
2390 			 * Caller didn't request that all of the segments
2391 			 * in the chain use the same cluster size; use the
2392 			 * smaller of the cluster sizes.
2393 			 */
2394 			if (resid > m_maxsize(MC_BIGCL)) {
2395 				r_bufsize = m_maxsize(MC_16KCL);
2396 			} else if (resid > m_maxsize(MC_CL)) {
2397 				r_bufsize = m_maxsize(MC_BIGCL);
2398 			} else {
2399 				r_bufsize = m_maxsize(MC_CL);
2400 			}
2401 		} else {
2402 			/* Use the same cluster size as the other segments */
2403 			resid = 0;
2404 		}
2405 	}
2406 
2407 	needed = *numlist;
2408 	if (resid > 0) {
2409 		/*
2410 		 * Attempt to allocate composite mbuf + cluster elements for
2411 		 * the residual data in each chain; record the number of such
2412 		 * elements that can be allocated so that we know how many
2413 		 * segment chains we can afford to create.
2414 		 */
2415 		if (r_bufsize <= m_maxsize(MC_CL)) {
2416 			rclass = MC_MBUF_CL;
2417 		} else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
2418 			rclass = MC_MBUF_BIGCL;
2419 		} else {
2420 			rclass = MC_MBUF_16KCL;
2421 		}
2422 		rmp_list = mz_composite_alloc_n(rclass, *numlist, wait);
2423 		needed = zstack_count(rmp_list);
2424 		if (needed == 0) {
2425 			goto fail;
2426 		}
2427 
2428 		/* This is temporarily reduced for calculation */
2429 		ASSERT(nsegs > 1);
2430 		nsegs--;
2431 	}
2432 
2433 	/*
2434 	 * Attempt to allocate the rest of the composite mbuf + cluster
2435 	 * elements for the number of segment chains that we need.
2436 	 */
2437 	if (bufsize <= m_maxsize(MC_CL)) {
2438 		class = MC_MBUF_CL;
2439 	} else if (bufsize <= m_maxsize(MC_BIGCL)) {
2440 		class = MC_MBUF_BIGCL;
2441 	} else {
2442 		class = MC_MBUF_16KCL;
2443 	}
2444 	mp_list = mz_composite_alloc_n(class, needed * nsegs, wait);
2445 	needed = zstack_count(mp_list);
2446 
2447 	/* Round it down to avoid creating a partial segment chain */
2448 	needed = (needed / nsegs) * nsegs;
2449 	if (needed == 0) {
2450 		goto fail;
2451 	}
2452 
2453 	if (resid > 0) {
2454 		/*
2455 		 * We're about to construct the chain(s); take into account
2456 		 * the number of segments we have created above to hold the
2457 		 * residual data for each chain, as well as restore the
2458 		 * original count of segments per chain.
2459 		 */
2460 		ASSERT(nsegs > 0);
2461 		needed += needed / nsegs;
2462 		nsegs++;
2463 	}
2464 
2465 	for (;;) {
2466 		mbuf_ref_t m = NULL;
2467 		u_int16_t flag;
2468 		struct ext_ref *rfa;
2469 		void *cl;
2470 		int pkthdr;
2471 		m_ext_free_func_t m_free_func;
2472 
2473 		++num;
2474 
2475 		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
2476 			m = zstack_pop(&mp_list);
2477 		} else {
2478 			m = zstack_pop(&rmp_list);
2479 		}
2480 		m_free_func = m_get_ext_free(m);
2481 		ASSERT(m != NULL);
2482 		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
2483 		VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
2484 		    m_free_func == m_16kfree);
2485 
2486 		cl = m->m_ext.ext_buf;
2487 		rfa = m_get_rfa(m);
2488 
2489 		ASSERT(cl != NULL && rfa != NULL);
2490 		VERIFY(MBUF_IS_COMPOSITE(m));
2491 
2492 		flag = MEXT_FLAGS(m);
2493 
2494 		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
2495 		if (pkthdr) {
2496 			first = m;
2497 		}
2498 		mbuf_init(m, pkthdr, MT_DATA);
2499 		if (m_free_func == m_16kfree) {
2500 			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
2501 		} else if (m_free_func == m_bigfree) {
2502 			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
2503 		} else {
2504 			MBUF_CL_INIT(m, cl, rfa, 1, flag);
2505 		}
2506 
2507 		*np = m;
2508 		if ((num % nsegs) == 0) {
2509 			np = &first->m_nextpkt;
2510 		} else {
2511 			np = &m->m_next;
2512 		}
2513 
2514 		if (num == needed) {
2515 			break;
2516 		}
2517 	}
2518 
2519 	if (num > 0) {
2520 		mtype_stat_add(MT_DATA, num);
2521 		mtype_stat_sub(MT_FREE, num);
2522 	}
2523 
2524 	num /= nsegs;
2525 
2526 	/* We've got them all; return to caller */
2527 	if (num == *numlist) {
2528 		ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
2529 		return top;
2530 	}
2531 
2532 fail:
2533 	/* Free up what's left of the above */
2534 	if (!zstack_empty(mp_list)) {
2535 		if (class == MC_MBUF) {
2536 			/* No need to elide, these mbufs came from the cache. */
2537 			mz_free_n(mp_list);
2538 		} else {
2539 			mz_composite_free_n(class, mp_list);
2540 		}
2541 	}
2542 	if (!zstack_empty(rmp_list)) {
2543 		mz_composite_free_n(rclass, rmp_list);
2544 	}
2545 	if (wantall && top != NULL) {
2546 		m_freem_list(top);
2547 		*numlist = 0;
2548 		return NULL;
2549 	}
2550 	*numlist = num;
2551 	return top;
2552 }
2553 #endif /* !CONFIG_MBUF_MCACHE */
2554 
2555 /*
2556  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
2557  * packets on receive ring.
2558  */
2559 __private_extern__ struct mbuf *
m_getpacket_how(int wait)2560 m_getpacket_how(int wait)
2561 {
2562 	unsigned int num_needed = 1;
2563 
2564 	return m_getpackets_internal(&num_needed, 1, wait, 1,
2565 	           m_maxsize(MC_CL));
2566 }
2567 
2568 /*
2569  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
2570  * packets on receive ring.
2571  */
2572 struct mbuf *
m_getpacket(void)2573 m_getpacket(void)
2574 {
2575 	unsigned int num_needed = 1;
2576 
2577 	return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
2578 	           m_maxsize(MC_CL));
2579 }
2580 
2581 /*
2582  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
2583  * if this can't be met, return whatever number were available.  Set up the
2584  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
2585  * are chained on the m_nextpkt field.  Any packets requested beyond this are
2586  * chained onto the last packet header's m_next field.
2587  */
2588 struct mbuf *
m_getpackets(int num_needed,int num_with_pkthdrs,int how)2589 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
2590 {
2591 	unsigned int n = num_needed;
2592 
2593 	return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
2594 	           m_maxsize(MC_CL));
2595 }
2596 
2597 /*
2598  * Return a list of mbuf hdrs set up as packet hdrs chained together
2599  * on the m_nextpkt field
2600  */
2601 struct mbuf *
m_getpackethdrs(int num_needed,int how)2602 m_getpackethdrs(int num_needed, int how)
2603 {
2604 	mbuf_ref_t m, *np, top;
2605 
2606 	top = NULL;
2607 	np = &top;
2608 
2609 	while (num_needed--) {
2610 		m = _M_RETRYHDR(how, MT_DATA);
2611 		if (m == NULL) {
2612 			break;
2613 		}
2614 
2615 		*np = m;
2616 		np = &m->m_nextpkt;
2617 	}
2618 
2619 	return top;
2620 }
2621 
2622 #if !CONFIG_MBUF_MCACHE
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * for mbufs packets freed.  Used by the drivers.
 *
 * Freed elements are batched onto per-kind stacks (plain mbufs, 2K/4K/16K
 * clusters, composite mbuf+cluster pairs, ext_ref structures) and returned
 * to their zones in bulk at the end, so the zone/atomic work is amortized
 * across the whole list.
 */
int
m_freem_list(struct mbuf *m)
{
	struct mbuf *nextpkt;
	zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
	    m16k_list = {}, m_mcl_list = {},
	    m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
	int pktcount = 0;
	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

	/* Outer loop walks packets (m_nextpkt) */
	while (m != NULL) {
		pktcount++;

		nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/* Inner loop walks segments of one packet (m_next) */
		while (m != NULL) {
			struct mbuf *next = m->m_next;
			void *cl = NULL;
			if (m->m_type == MT_FREE) {
				panic("m_free: freeing an already freed mbuf");
			}

			if (m->m_flags & M_PKTHDR) {
				/* Free the aux data and tags if there is any */
				m_tag_delete_chain(m);
				m_do_tx_compl_callback(m, NULL);
			}

			if (!(m->m_flags & M_EXT)) {
				mt_free++;
				goto simple_free;
			}

			/* Paired mbufs may be handled by their peer's free */
			if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
				m = next;
				continue;
			}

			mt_free++;

			cl = m->m_ext.ext_buf;
			/*
			 * Make sure that we don't touch any ext_ref
			 * member after we decrement the reference count
			 * since that may lead to use-after-free
			 * when we do not hold the last reference.
			 */
			const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
			const m_ext_free_func_t m_free_func = m_get_ext_free(m);
			const uint16_t minref = MEXT_MINREF(m);
			const uint16_t refcnt = m_decref(m);
			if (refcnt == minref && !composite) {
				/* Last ref on a non-composite: free cluster + ref */
				if (m_free_func == NULL) {
					zstack_push(&mcl_list, cl);
				} else if (m_free_func == m_bigfree) {
					zstack_push(&mbc_list, cl);
				} else if (m_free_func == m_16kfree) {
					zstack_push(&m16k_list, cl);
				} else {
					/* Custom destructor runs inline */
					(*(m_free_func))((caddr_t)cl,
					    m->m_ext.ext_size,
					    m_get_ext_arg(m));
				}
				zstack_push(&ref_list, m_get_rfa(m));
				m_set_ext(m, NULL, NULL, NULL);
			} else if (refcnt == minref && composite) {
				VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
				/*
				 * Amortize the costs of atomic operations
				 * by doing them at the end, if possible.
				 */
				if (m->m_type == MT_DATA) {
					mt_data++;
				} else if (m->m_type == MT_HEADER) {
					mt_header++;
				} else if (m->m_type == MT_SONAME) {
					mt_soname++;
				} else if (m->m_type == MT_TAG) {
					mt_tag++;
				} else {
					mtype_stat_dec(m->m_type);
				}

				/* Reset to the cached-composite state */
				m->m_type = MT_FREE;
				m->m_flags = M_EXT;
				m->m_len = 0;
				m->m_next = m->m_nextpkt = NULL;

				/*
				 * MEXT_FLAGS is safe to access here
				 * since we are now sure that we held
				 * the last reference to ext_ref.
				 */
				MEXT_FLAGS(m) &= ~EXTF_READONLY;

				/* "Free" into the intermediate cache */
				if (m_free_func == NULL) {
					zstack_push(&m_mcl_list, m);
				} else if (m_free_func == m_bigfree) {
					zstack_push(&m_mbc_list, m);
				} else {
					VERIFY(m_free_func == m_16kfree);
					zstack_push(&m_m16k_list, m);
				}
				m = next;
				continue;
			}
simple_free:
			/*
			 * Amortize the costs of atomic operations
			 * by doing them at the end, if possible.
			 */
			if (m->m_type == MT_DATA) {
				mt_data++;
			} else if (m->m_type == MT_HEADER) {
				mt_header++;
			} else if (m->m_type == MT_SONAME) {
				mt_soname++;
			} else if (m->m_type == MT_TAG) {
				mt_tag++;
			} else if (m->m_type != MT_FREE) {
				mtype_stat_dec(m->m_type);
			}

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			m_elide(m);
			zstack_push(&mp_list, m);

			m = next;
		}

		m = nextpkt;
	}

	/* Batched statistics updates */
	if (mt_free > 0) {
		mtype_stat_add(MT_FREE, mt_free);
	}
	if (mt_data > 0) {
		mtype_stat_sub(MT_DATA, mt_data);
	}
	if (mt_header > 0) {
		mtype_stat_sub(MT_HEADER, mt_header);
	}
	if (mt_soname > 0) {
		mtype_stat_sub(MT_SONAME, mt_soname);
	}
	if (mt_tag > 0) {
		mtype_stat_sub(MT_TAG, mt_tag);
	}
	/* Bulk-return every batch to its zone/cache */
	if (!zstack_empty(mp_list)) {
		/* mbufs elided above. */
		mz_free_n(mp_list);
	}
	if (!zstack_empty(mcl_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
	}
	if (!zstack_empty(mbc_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
	}
	if (!zstack_empty(m16k_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
	}
	if (!zstack_empty(m_mcl_list)) {
		mz_composite_free_n(MC_MBUF_CL, m_mcl_list);
	}
	if (!zstack_empty(m_mbc_list)) {
		mz_composite_free_n(MC_MBUF_BIGCL, m_mbc_list);
	}
	if (!zstack_empty(m_m16k_list)) {
		mz_composite_free_n(MC_MBUF_16KCL, m_m16k_list);
	}
	if (!zstack_empty(ref_list)) {
		zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
	}

	return pktcount;
}
2808 #endif /* !CONFIG_MBUF_MCACHE */
2809 
/*
 * Wrapper around m_freem_list which captures the packet that's going to be
 * dropped. If funcname is NULL, that means we do not want to store both
 * function name and line number, and only the drop reason will be saved.
 * Make sure to pass the direction flag (DROPTAP_FLAG_DIR_OUT,
 * DROPTAP_FLAG_DIR_IN), or the packet will not be captured.
 * The chain is always freed, captured or not.
 */
void
m_drop_list(mbuf_t m_head, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
    uint16_t linenum)
{
	struct mbuf *m = m_head;
	struct mbuf *nextpkt;

	if (m_head == NULL) {
		return;
	}

	/* Fast path: no drop taps attached, free without capturing */
	if (__probable(droptap_total_tap_count == 0)) {
		m_freem_list(m_head);
		return;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		/* Report each packet in the chain to the output drop tap */
		while (m != NULL) {
			uint16_t tmp_flags = flags;

			nextpkt = m->m_nextpkt;
			if (m->m_pkthdr.pkt_hdr == NULL) {
				tmp_flags |= DROPTAP_FLAG_L2_MISSING;
			}
			droptap_output_mbuf(m, reason, funcname, linenum, tmp_flags,
			    ifp);
			m = nextpkt;
		}
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		/* Report each packet in the chain to the input drop tap */
		while (m != NULL) {
			char *frame_header __single;
			uint16_t tmp_flags = flags;

			nextpkt = m->m_nextpkt;

			/* Only pass a frame header when L2 info is present */
			if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
			    m->m_pkthdr.pkt_hdr != NULL) {
				frame_header = m->m_pkthdr.pkt_hdr;
			} else {
				frame_header = NULL;
				tmp_flags |= DROPTAP_FLAG_L2_MISSING;
			}

			droptap_input_mbuf(m, reason, funcname, linenum, tmp_flags,
			    m->m_pkthdr.rcvif, frame_header);
			m = nextpkt;
		}
	}
	m_freem_list(m_head);
}
2867 
2868 void
m_freem(struct mbuf * m)2869 m_freem(struct mbuf *m)
2870 {
2871 	while (m != NULL) {
2872 		m = m_free(m);
2873 	}
2874 }
2875 
/*
 * Wrapper around m_freem which captures the packet that's going to be dropped.
 * If funcname is NULL, that means we do not want to store both function name
 * and line number, and only the drop reason will be saved. Make sure to pass the
 * direction flag (DROPTAP_FLAG_DIR_OUT, DROPTAP_FLAG_DIR_IN), or the packet will
 * not be captured.
 * The chain is freed whether or not it was captured.
 */
static void
m_drop_common(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
    uint16_t linenum)
{
	if (flags & DROPTAP_FLAG_DIR_OUT) {
		droptap_output_mbuf(m, reason, funcname, linenum, flags, ifp);
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		char *frame_header __single;

		/* Hand the L2 frame header to the tap only when present */
		if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
		    m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
		} else {
			frame_header = NULL;
			flags |= DROPTAP_FLAG_L2_MISSING;
		}

		droptap_input_mbuf(m, reason, funcname, linenum, flags, ifp,
		    frame_header);
	}
	m_freem(m);
}
2905 
2906 void
m_drop(mbuf_t m,uint16_t flags,uint32_t reason,const char * funcname,uint16_t linenum)2907 m_drop(mbuf_t m, uint16_t flags, uint32_t reason, const char *funcname,
2908     uint16_t linenum)
2909 {
2910 	if (m == NULL) {
2911 		return;
2912 	}
2913 
2914 	if (__probable(droptap_total_tap_count == 0)) {
2915 		m_freem(m);
2916 		return;
2917 	}
2918 
2919 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2920 		m_drop_common(m, NULL, flags, reason, funcname, linenum);
2921 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2922 		m_drop_common(m, m->m_pkthdr.rcvif, flags, reason, funcname, linenum);
2923 	}
2924 }
2925 
2926 void
m_drop_if(mbuf_t m,struct ifnet * ifp,uint16_t flags,uint32_t reason,const char * funcname,uint16_t linenum)2927 m_drop_if(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
2928     uint16_t linenum)
2929 {
2930 	if (m == NULL) {
2931 		return;
2932 	}
2933 
2934 	if (__probable(droptap_total_tap_count == 0)) {
2935 		m_freem(m);
2936 		return;
2937 	}
2938 
2939 	m_drop_common(m, ifp, flags, reason, funcname, linenum);
2940 }
2941 
2942 void
m_drop_extended(mbuf_t m,struct ifnet * ifp,char * frame_header,uint16_t flags,uint32_t reason,const char * funcname,uint16_t linenum)2943 m_drop_extended(mbuf_t m, struct ifnet *ifp, char *frame_header,
2944     uint16_t flags, uint32_t reason, const char *funcname, uint16_t linenum)
2945 {
2946 	if (m == NULL) {
2947 		return;
2948 	}
2949 
2950 	if (__probable(droptap_total_tap_count == 0)) {
2951 		m_freem(m);
2952 		return;
2953 	}
2954 
2955 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2956 		droptap_output_mbuf(m, reason, funcname, linenum, flags,
2957 		    ifp);
2958 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2959 		droptap_input_mbuf(m, reason, funcname, linenum, flags,
2960 		    m->m_pkthdr.rcvif, frame_header);
2961 	}
2962 	m_freem(m);
2963 }
2964 
2965 /*
2966  * Mbuffer utility routines.
2967  */
2968 /*
2969  * Set the m_data pointer of a newly allocated mbuf to place an object of the
2970  * specified size at the end of the mbuf, longword aligned.
2971  *
2972  * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
2973  * separate macros, each asserting that it was called at the proper moment.
2974  * This required callers to themselves test the storage type and call the
2975  * right one.  Rather than require callers to be aware of those layout
2976  * decisions, we centralize here.
2977  */
2978 void
m_align(struct mbuf * m,int len)2979 m_align(struct mbuf *m, int len)
2980 {
2981 	int adjust = 0;
2982 
2983 	/* At this point data must point to start */
2984 	VERIFY(m->m_data == (uintptr_t)M_START(m));
2985 	VERIFY(len >= 0);
2986 	VERIFY(len <= M_SIZE(m));
2987 	adjust = M_SIZE(m) - len;
2988 	m->m_data += adjust & ~(sizeof(long) - 1);
2989 }
2990 
2991 /*
2992  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
2993  * copy junk along.  Does not adjust packet header length.
2994  */
2995 struct mbuf *
m_prepend(struct mbuf * m,int len,int how)2996 m_prepend(struct mbuf *m, int len, int how)
2997 {
2998 	struct mbuf *mn;
2999 
3000 	_MGET(mn, how, m->m_type);
3001 	if (mn == NULL) {
3002 		m_freem(m);
3003 		return NULL;
3004 	}
3005 	if (m->m_flags & M_PKTHDR) {
3006 		M_COPY_PKTHDR(mn, m);
3007 		m->m_flags &= ~M_PKTHDR;
3008 	}
3009 	mn->m_next = m;
3010 	m = mn;
3011 	if (m->m_flags & M_PKTHDR) {
3012 		VERIFY(len <= MHLEN);
3013 		MH_ALIGN(m, len);
3014 	} else {
3015 		VERIFY(len <= MLEN);
3016 		M_ALIGN(m, len);
3017 	}
3018 	m->m_len = len;
3019 	return m;
3020 }
3021 
3022 /*
3023  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3024  * chain, copy junk along, and adjust length.
3025  */
3026 struct mbuf *
m_prepend_2(struct mbuf * m,int len,int how,int align)3027 m_prepend_2(struct mbuf *m, int len, int how, int align)
3028 {
3029 	if (M_LEADINGSPACE(m) >= len &&
3030 	    (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
3031 		m->m_data -= len;
3032 		m->m_len += len;
3033 	} else {
3034 		m = m_prepend(m, len, how);
3035 	}
3036 	if ((m) && (m->m_flags & M_PKTHDR)) {
3037 		m->m_pkthdr.len += len;
3038 	}
3039 	return m;
3040 }
3041 
3042 /*
3043  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3044  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
3045  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3046  *
3047  * The last mbuf and offset accessed are passed in and adjusted on return to
3048  * avoid having to iterate over the entire mbuf chain each time.
3049  */
struct mbuf *
m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	mbuf_ref_t n, mhdr = NULL, *np, top;
	int off = off0, len = len0;
	int copyhdr = 0;

	if (off < 0 || len < 0) {
		panic("m_copym: invalid offset %d or len %d", off, len);
	}

	/* The MUST_* modes require a packet header on the source chain. */
	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));

	/*
	 * Decide up front whether the pkthdr is carried over: either the
	 * copy starts at offset 0 of a header mbuf, or the caller forces it.
	 */
	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
		mhdr = m;
		copyhdr = 1;
	}

	/*
	 * Fast-forward using the cached last-mbuf/offset from a previous
	 * call, so repeated copies need not rescan the whole chain.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Skip whole mbufs that lie entirely before the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;

	while (len > 0) {
		if (m == NULL) {
			/* Running off the end is only legal for M_COPYALL. */
			if (len != M_COPYALL) {
				panic("m_copym: len != M_COPYALL");
			}
			break;
		}

		if (copyhdr) {
			n = _M_RETRYHDR(wait, m->m_type);
		} else {
			n = _M_RETRY(wait, m->m_type);
		}
		*np = n;

		if (n == NULL) {
			goto nospace;
		}

		if (copyhdr != 0) {
			/* Move or duplicate the pkthdr per the requested mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, mhdr);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, mhdr, wait) == 0) {
					goto nospace;
				}
			}
			if (len == M_COPYALL) {
				n->m_pkthdr.len -= off0;
			} else {
				n->m_pkthdr.len = len;
			}
			copyhdr = 0;
			/*
			 * There is data to copy from the packet header mbuf
			 * if it is empty or it is before the starting offset
			 */
			if (mhdr != m) {
				np = &n->m_next;
				continue;
			}
		}
		n->m_len = MIN(len, (m->m_len - off));
		if (m->m_flags & M_EXT) {
			/* Share the external storage; bump the refcount. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/*
			 * Limit to the capacity of the destination
			 */
			n->m_len = MIN(n->m_len, M_SIZE(n));

			if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL) {
			len -= n->m_len;
		}

		if (len == 0) {
			/* Done: remember where we stopped for the next call. */
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	return top;
nospace:
	m_freem(top);

	return NULL;
}
3170 
3171 
3172 struct mbuf *
m_copym(struct mbuf * m,int off0,int len,int wait)3173 m_copym(struct mbuf *m, int off0, int len, int wait)
3174 {
3175 	return m_copym_mode(m, off0, len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
3176 }
3177 
3178 #if !CONFIG_MBUF_MCACHE
3179 /*
3180  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3181  * within this routine also.
3182  *
3183  * The last mbuf and offset accessed are passed in and adjusted on return to
3184  * avoid having to iterate over the entire mbuf chain each time.
3185  */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	mbuf_ref_t m = m0, n, *np = NULL, top = NULL;
	int off = off0, len = len0;
	zstack_t list = {};
	int copyhdr = 0;
	int type = 0;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/*
	 * Fast-forward using the cached last-mbuf/offset from a previous
	 * call, so repeated copies need not rescan the whole chain.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Skip whole mbufs that lie entirely before the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}

	/*
	 * First pass: count how many mbufs the copy will need, plus one
	 * extra for the leading header mbuf, then allocate them in a batch.
	 */
	n = m;
	while (len > 0) {
		needed++;
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;
	len = len0;

	list = mz_alloc_n(needed, wait);
	if (zstack_count(list) != needed) {
		goto nospace;
	}

	/* Second pass: build the copy; "needed" now counts non-header mbufs. */
	needed = 0;
	while (len > 0) {
		n = zstack_pop(&list);
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		mbuf_init(n, (top == NULL), type);

		if (top == NULL) {
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			/* Move or duplicate the pkthdr per the requested mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
					m_elide(n);
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the external storage; bump the refcount. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Done: remember where we stopped for the next call. */
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	/* Account for the header mbuf plus "needed" data mbufs. */
	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

	ASSERT(zstack_empty(list));

	return top;

nospace:
	if (!zstack_empty(list)) {
		/* No need to elide, these mbufs came from the cache. */
		mz_free_n(list);
	}
	if (top != NULL) {
		m_freem(top);
	}
	return NULL;
}
3306 #endif /* !CONFIG_MBUF_MCACHE */
3307 
3308 /*
3309  * Copy data from an mbuf chain starting "off" bytes from the beginning,
3310  * continuing for "len" bytes, into the indicated buffer.
3311  */
3312 void
m_copydata(struct mbuf * m,int off,int len0,void * vp __sized_by (len0))3313 m_copydata(struct mbuf *m, int off, int len0, void *vp __sized_by(len0))
3314 {
3315 	int off0 = off, len = len0;
3316 	struct mbuf *m0 = m;
3317 	unsigned count;
3318 	char *cp = vp;
3319 
3320 	if (__improbable(off < 0 || len < 0)) {
3321 		panic("%s: invalid offset %d or len %d", __func__, off, len);
3322 		/* NOTREACHED */
3323 	}
3324 
3325 	while (off > 0) {
3326 		if (__improbable(m == NULL)) {
3327 			panic("%s: invalid mbuf chain %p [off %d, len %d]",
3328 			    __func__, m0, off0, len0);
3329 			/* NOTREACHED */
3330 		}
3331 		if (off < m->m_len) {
3332 			break;
3333 		}
3334 		off -= m->m_len;
3335 		m = m->m_next;
3336 	}
3337 	while (len > 0) {
3338 		if (__improbable(m == NULL)) {
3339 			panic("%s: invalid mbuf chain %p [off %d, len %d]",
3340 			    __func__, m0, off0, len0);
3341 			/* NOTREACHED */
3342 		}
3343 		count = MIN(m->m_len - off, len);
3344 		bcopy(mtod(m, caddr_t) + off, cp, count);
3345 		len -= count;
3346 		cp += count;
3347 		off = 0;
3348 		m = m->m_next;
3349 	}
3350 }
3351 
3352 /*
3353  * Concatenate mbuf chain n to m.  Both chains must be of the same type
3354  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
3355  */
3356 void
m_cat(struct mbuf * m,struct mbuf * n)3357 m_cat(struct mbuf *m, struct mbuf *n)
3358 {
3359 	while (m->m_next) {
3360 		m = m->m_next;
3361 	}
3362 	while (n) {
3363 		if ((m->m_flags & M_EXT) ||
3364 		    m->m_data + m->m_len + n->m_len >= (uintptr_t)&m->m_dat[MLEN]) {
3365 			/* just join the two chains */
3366 			m->m_next = n;
3367 			return;
3368 		}
3369 		/* splat the data from one into the other */
3370 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
3371 		    (u_int)n->m_len);
3372 		m->m_len += n->m_len;
3373 		n = m_free(n);
3374 	}
3375 }
3376 
/*
 * Trim req_len bytes of data from the chain: a positive req_len trims
 * from the head, a negative one trims -req_len bytes from the tail.
 * m_pkthdr.len is kept in sync when the chain carries a packet header.
 */
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL) {
		return;
	}
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				/* Whole mbuf consumed; keep it but empty it. */
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		/* "len" holds any residue that could not be trimmed. */
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len -= (req_len - len);
		}
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL) {
				break;
			}
			m = m->m_next;
		}
		if (m->m_len >= len) {
			/* Trim fits entirely within the last mbuf. */
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len -= len;
			}
			return;
		}
		count -= len;
		if (count < 0) {
			count = 0;
		}
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len = count;
		}
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		/* Zero out the mbufs that lie entirely past the new end. */
		while ((m = m->m_next)) {
			m->m_len = 0;
		}
	}
}
3456 
3457 /*
3458  * Rearange an mbuf chain so that len bytes are contiguous
3459  * and in the data area of an mbuf (so that mtod
3460  * will work for a structure of size len).  Returns the resulting
3461  * mbuf chain on success, frees it and returns null on failure.
3462  * If there is room, it will add up to max_protohdr-len extra bytes to the
3463  * contiguous region in an attempt to avoid being called next time.
3464  */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/* check invalid arguments */
	if (n == NULL) {
		panic("%s: n == NULL", __func__);
	}
	if (len < 0) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
		    __func__, len);
		goto bad;
	}
	if (len > MLEN) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
		    __func__, len);
		goto bad;
	}
	if ((n->m_flags & M_EXT) == 0 &&
	    m_mtod_current(n) >= m_mtod_upper_bound(n)) {
		os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
		    __func__);
		goto bad;
	}

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    len < m_mtod_upper_bound(n) - m_mtod_current(n) && n->m_next != NULL) {
		if (n->m_len >= len) {
			/* Already contiguous; nothing to do. */
			return n;
		}
		/* Pull the remaining bytes into "m" (the current head). */
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN) {
			goto bad;
		}
		_MGET(m, M_DONTWAIT, n->m_type);
		if (m == 0) {
			goto bad;
		}
		m->m_len = 0;
		/* The new head takes over the packet header, if any. */
		if (n->m_flags & M_PKTHDR) {
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	/*
	 * Copy up to max_protohdr extra bytes (space permitting) so the
	 * next pullup of a nearby header is likely already satisfied.
	 */
	space = m_mtod_upper_bound(m) - m_mtod_end(m);
	do {
		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len != 0) {
			n->m_data += count;
		} else {
			/* Source mbuf fully drained; free it and advance. */
			n = m_free(n);
		}
	} while (len > 0 && n != NULL);
	if (len > 0) {
		/* Chain was shorter than "len"; give up and free everything. */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	return 0;
}
3545 
3546 /*
3547  * Like m_pullup(), except a new mbuf is always allocated, and we allow
3548  * the amount of empty space before the data in the new mbuf to be specified
3549  * (in the event that the caller expects to prepend later).
3550  */
__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	VERIFY(len >= 0 && dstoff >= 0);

	/* Requested bytes plus the leading gap must fit one header mbuf. */
	if (len > (MHLEN - dstoff)) {
		goto bad;
	}
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL) {
		goto bad;
	}
	m->m_len = 0;
	/* The new head takes over the packet header, if any. */
	if (n->m_flags & M_PKTHDR) {
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	/* Leave "dstoff" bytes of empty space in front for later prepends. */
	m->m_data += dstoff;
	space = m_mtod_upper_bound(m) - m_mtod_end(m);
	do {
		/* Pull extra bytes (up to max_protohdr) to amortize calls. */
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len) {
			n->m_data += count;
		} else {
			/* Source mbuf fully drained; free it and advance. */
			n = m_free(n);
		}
	} while (len > 0 && n);
	if (len > 0) {
		/* Chain was shorter than "len"; give up and free everything. */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);

	return NULL;
}
3598 
3599 /*
3600  * Partition an mbuf chain in two pieces, returning the tail --
3601  * all but the first len0 bytes.  In case of failure, it returns NULL and
3602  * attempts to restore the chain to its original state.
3603  */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	/* Public entry point: always give the tail its own packet header. */
	return m_split0(m0, len0, wait, 1);
}
3609 
/*
 * Split the chain at byte offset len0.  The head (m0) keeps the first
 * len0 bytes; the tail is returned.  With copyhdr != 0 and a packet
 * header present, the tail gets its own header with the remaining length.
 * Returns NULL on allocation failure, leaving the chain intact.
 */
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			/* Recurse to split the oversized remainder. */
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* Clean break on an mbuf boundary: no new mbuf needed. */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster; both halves reference the same storage. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
3713 
3714 
3715 /*
3716  * Return the number of bytes in the mbuf chain, m.
3717  */
3718 unsigned int
m_length(struct mbuf * m)3719 m_length(struct mbuf *m)
3720 {
3721 	struct mbuf *m0;
3722 	unsigned int pktlen;
3723 
3724 	if (m->m_flags & M_PKTHDR) {
3725 		return m->m_pkthdr.len;
3726 	}
3727 
3728 	pktlen = 0;
3729 	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
3730 		pktlen += m0->m_len;
3731 	}
3732 	return pktlen;
3733 }
3734 
3735 int
m_chain_capacity(const struct mbuf * m)3736 m_chain_capacity(const struct mbuf *m)
3737 {
3738 	int rawlen = 0;
3739 	while (m) {
3740 		rawlen += m_capacity(m);
3741 		m = m->m_next;
3742 	}
3743 
3744 	return rawlen;
3745 }
3746 
3747 
3748 /*
3749  * Copy data from a buffer back into the indicated mbuf chain,
3750  * starting "off" bytes from the beginning, extending the mbuf
3751  * chain if necessary.
3752  */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp __sized_by(len))
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL) {
		return;
	}

	/*
	 * EXTEND mode may grow the chain but never replaces its head,
	 * so failures (checked only under DEBUG) cannot strand the caller.
	 */
#if DEBUG
	error =
#endif /* DEBUG */
	m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	if (error != 0 || (m0 != NULL && origm != m0)) {
		panic("m_copyback");
	}
#endif /* DEBUG */
}
3777 
3778 struct mbuf *
m_copyback_cow(struct mbuf * m0,int off,int len,const void * cp __sized_by (len),int how)3779 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp __sized_by(len), int how)
3780 {
3781 	int error;
3782 
3783 	/* don't support chain expansion */
3784 	VERIFY(off + len <= m_length(m0));
3785 
3786 	error = m_copyback0(&m0, off, len, cp,
3787 	    M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
3788 	if (error) {
3789 		/*
3790 		 * no way to recover from partial success.
3791 		 * just free the chain.
3792 		 */
3793 		m_freem(m0);
3794 		return NULL;
3795 	}
3796 	return m0;
3797 }
3798 
3799 /*
3800  * m_makewritable: ensure the specified range writable.
3801  */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

	/* PRESERVE keeps existing data while COW replaces read-only mbufs. */
	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	/* Sanity: the operation must not change the chain's total length. */
	reslen = 0;
	for (n = *mp; n; n = n->m_next) {
		reslen += n->m_len;
	}
	if (origlen != reslen) {
		panic("m_makewritable: length changed");
	}
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
		panic("m_makewritable: inconsist");
	}
#endif /* DEBUG */

	return error;
}
3831 
/*
 * Workhorse behind m_copyback(), m_copyback_cow() and m_makewritable().
 * Depending on "flags": COPYBACK writes cp[0..len0) into the chain at
 * "off"; EXTEND grows the chain to cover the range; COW replaces
 * read-only (shared-cluster) mbufs in the range; PRESERVE keeps the
 * existing bytes when substituting mbufs.  Returns 0 or ENOBUFS.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len0, const void *vp __sized_by_or_null(len0), int flags,
    int how)
{
	int mlen, len = len0, totlen = 0;
	mbuf_ref_t m, n, *mp;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* Walk to the mbuf containing offset "off", extending as needed. */
	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* Zero-fill the gap up to the write offset. */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf.  copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* Consume read-only mbufs the new one replaces. */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		/* Writable region: perform the actual copy (if requested). */
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	/* If the chain grew (EXTEND), update the cached pkthdr length. */
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
4031 
4032 #if !CONFIG_MBUF_MCACHE
/*
 * Translate a cluster's kernel virtual address to a physical address.
 */
uint64_t
mcl_to_paddr(char *addr)
{
	extern addr64_t kvtophys(vm_offset_t va);

	return kvtophys((vm_offset_t)addr);
}
4040 #endif /* !CONFIG_MBUF_MCACHE */
4041 
4042 /*
4043  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
4044  * And really copy the thing.  That way, we don't "precompute" checksums
4045  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
4046  * small packets, don't dup into a cluster.  That way received  packets
4047  * don't take up too much room in the sockbuf (cf. sbspace()).
4048  */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	mbuf_ref_t n, top, *np;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR) {
		copyhdr = 1;
	}

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 *  mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
					return NULL;
				}
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(mtod(m, caddr_t), mtod(n, caddr_t), m->m_len);
				return n;
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL) {
				return NULL;
			}
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t), m->m_len);
			n->m_len = m->m_len;
			return n;
		}
	}
	/* General case: deep-copy every mbuf of the chain. */
	while (m != NULL) {
#if BLUE_DEBUG
		printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr) {
			n = _M_GETHDR(how, m->m_type);
		} else {
			n = _M_GET(how, m->m_type);
		}
		if (n == NULL) {
			goto nospace;
		}
		if (m->m_flags & M_EXT) {
			/* Pick the smallest cluster size that fits the data. */
			if (m->m_len <= m_maxsize(MC_CL)) {
				MCLGET(n, how);
			} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
				n = m_mbigget(n, how);
			} else if (m->m_len <= m_maxsize(MC_16KCL)) {
				n = m_m16kget(n, how);
			}
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		} else {
			VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
			    (copyhdr == 0 && m->m_len <= MLEN));
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT)) {
				n->m_data = (uintptr_t)n->m_pktdat;
			}
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(mtod(m, caddr_t), mtod(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	return top;

nospace:
	m_freem(top);
	return NULL;
}
4145 
/*
 * TRUE if the mbuf's (external) data spans more than one page: either
 * it starts page-aligned and is longer than a page, or it starts
 * unaligned and runs past the next page boundary.
 */
#define MBUF_MULTIPAGES(m)                                              \
	(((m)->m_flags & M_EXT) &&                                      \
	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)                          \
	&& (m)->m_len > PAGE_SIZE) ||                                   \
	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                       \
	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4152 
/*
 * Split a single mbuf whose data spans multiple pages (MBUF_MULTIPAGES)
 * into a chain of mbufs each referencing at most one page.  The cluster
 * is not copied: every new mbuf shares the original m_ext with an extra
 * reference (m_incref).  On success returns the head of the chain and
 * stores the tail in *last; on allocation failure frees the partial
 * chain and returns NULL with *last also set to NULL.
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	mbuf_ref_t top = NULL, *nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		/* Clamp this segment at the enclosing page boundary. */
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = data;
		m->m_len = len;

		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		/* Share the original cluster with the new mbuf. */
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
4208 
4209 struct mbuf *
m_normalize(struct mbuf * m)4210 m_normalize(struct mbuf *m)
4211 {
4212 	mbuf_ref_t top = NULL, *nm = &top;
4213 	boolean_t expanded = FALSE;
4214 
4215 	while (m != NULL) {
4216 		mbuf_ref_t n;
4217 
4218 		n = m->m_next;
4219 		m->m_next = NULL;
4220 
4221 		/* Does the data cross one or more page boundaries? */
4222 		if (MBUF_MULTIPAGES(m)) {
4223 			mbuf_ref_t last;
4224 			if ((m = m_expand(m, &last)) == NULL) {
4225 				m_freem(n);
4226 				m_freem(top);
4227 				top = NULL;
4228 				break;
4229 			}
4230 			*nm = m;
4231 			nm = &last->m_next;
4232 			expanded = TRUE;
4233 		} else {
4234 			*nm = m;
4235 			nm = &m->m_next;
4236 		}
4237 		m = n;
4238 	}
4239 	return top;
4240 }
4241 
4242 /*
4243  * Append the specified data to the indicated mbuf chain,
4244  * Extend the mbuf chain if the new data does not fit in
4245  * existing space.
4246  *
4247  * Return 1 if able to complete the job; otherwise 0.
4248  */
4249 int
m_append(struct mbuf * m0,int len0,caddr_t cp0 __sized_by (len0))4250 m_append(struct mbuf *m0, int len0, caddr_t cp0 __sized_by(len0))
4251 {
4252 	struct mbuf *m, *n;
4253 	int remainder, space, len = len0;
4254 	caddr_t cp = cp0;
4255 
4256 	for (m = m0; m->m_next != NULL; m = m->m_next) {
4257 		;
4258 	}
4259 	remainder = len;
4260 	space = M_TRAILINGSPACE(m);
4261 	if (space > 0) {
4262 		/*
4263 		 * Copy into available space.
4264 		 */
4265 		if (space > remainder) {
4266 			space = remainder;
4267 		}
4268 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
4269 		m->m_len += space;
4270 		cp += space;
4271 		remainder -= space;
4272 	}
4273 	while (remainder > 0) {
4274 		/*
4275 		 * Allocate a new mbuf; could check space
4276 		 * and allocate a cluster instead.
4277 		 */
4278 		n = m_get(M_WAITOK, m->m_type);
4279 		if (n == NULL) {
4280 			break;
4281 		}
4282 		n->m_len = min(MLEN, remainder);
4283 		bcopy(cp, mtod(n, caddr_t), n->m_len);
4284 		cp += n->m_len;
4285 		remainder -= n->m_len;
4286 		m->m_next = n;
4287 		m = n;
4288 	}
4289 	if (m0->m_flags & M_PKTHDR) {
4290 		m0->m_pkthdr.len += len - remainder;
4291 	}
4292 	return remainder == 0;
4293 }
4294 
4295 struct mbuf *
m_last(struct mbuf * m)4296 m_last(struct mbuf *m)
4297 {
4298 	while (m->m_next != NULL) {
4299 		m = m->m_next;
4300 	}
4301 	return m;
4302 }
4303 
4304 unsigned int
m_fixhdr(struct mbuf * m0)4305 m_fixhdr(struct mbuf *m0)
4306 {
4307 	u_int len;
4308 
4309 	VERIFY(m0->m_flags & M_PKTHDR);
4310 
4311 	len = m_length2(m0, NULL);
4312 	m0->m_pkthdr.len = len;
4313 	return len;
4314 }
4315 
4316 unsigned int
m_length2(struct mbuf * m0,struct mbuf ** last)4317 m_length2(struct mbuf *m0, struct mbuf **last)
4318 {
4319 	struct mbuf *m;
4320 	u_int len;
4321 
4322 	len = 0;
4323 	for (m = m0; m != NULL; m = m->m_next) {
4324 		len += m->m_len;
4325 		if (m->m_next == NULL) {
4326 			break;
4327 		}
4328 	}
4329 	if (last != NULL) {
4330 		*last = m;
4331 	}
4332 	return len;
4333 }
4334 
/*
 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
 * and clusters.  If allocation fails and this cannot be completed, NULL will
 * be returned, but the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain will be returned.
 *
 * If a non-packet header is passed in, the original mbuf (chain?) will
 * be returned unharmed.
 *
 * If offset is specfied, the first mbuf in the chain will have a leading
 * space of the amount stated by the "off" parameter.
 *
 * This routine requires that the m_pkthdr.header field of the original
 * mbuf chain is cleared by the caller.
 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	if (!(m0->m_flags & M_PKTHDR)) {
		return m0;
	}

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	/* Pick a cluster-backed or plain header mbuf for the first link. */
	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN) {
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	} else {
		m_final = m_gethdr(how, MT_DATA);
	}

	if (m_final == NULL) {
		goto nospace;
	}

	if (off > 0) {
		/* Reserve the requested leading space in the first mbuf. */
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0) {
		goto nospace;
	}

	m_new = m_final;

	/* Copy the payload into the new chain, MCLBYTES at a time. */
	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES) {
			length = MCLBYTES;
		}
		/* The first mbuf's capacity is reduced by the leading space. */
		length -= ((m_new == m_final) ? off : 0);
		if (length < 0) {
			goto nospace;
		}

		if (m_new == NULL) {
			/* Allocate a follow-on mbuf, clustered if needed. */
			if (length > MLEN) {
				m_new = m_getcl(how, MT_DATA, 0);
			} else {
				m_new = m_get(how, MT_DATA);
			}
			if (m_new == NULL) {
				goto nospace;
			}
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final) {
			m_cat(m_final, m_new);
		}
		m_new = NULL;
	}
	/* Success: free the original chain and return the compacted one. */
	m_freem(m0);
	m0 = m_final;
	return m0;
nospace:
	if (m_final) {
		m_freem(m_final);
	}
	return NULL;
}
4430 
/*
 * Defragment an mbuf chain with no leading space; see m_defrag_offset().
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
4436 
4437 void
m_mchtype(struct mbuf * m,int t)4438 m_mchtype(struct mbuf *m, int t)
4439 {
4440 	mtype_stat_inc(t);
4441 	mtype_stat_dec(m->m_type);
4442 	(m)->m_type = t;
4443 }
4444 
/*
 * Return the mbuf's current data pointer (wrapper around
 * m_mtod_current()).
 */
void *__unsafe_indexable
m_mtod(struct mbuf *m)
{
	return m_mtod_current(m);
}
4450 
4451 /*
4452  * Return a pointer to mbuf/offset of location in mbuf chain.
4453  */
4454 struct mbuf *
m_getptr(struct mbuf * m,int loc,int * off)4455 m_getptr(struct mbuf *m, int loc, int *off)
4456 {
4457 	while (loc >= 0) {
4458 		/* Normal end of search. */
4459 		if (m->m_len > loc) {
4460 			*off = loc;
4461 			return m;
4462 		} else {
4463 			loc -= m->m_len;
4464 			if (m->m_next == NULL) {
4465 				if (loc == 0) {
4466 					/* Point at the end of valid data. */
4467 					*off = m->m_len;
4468 					return m;
4469 				}
4470 				return NULL;
4471 			}
4472 			m = m->m_next;
4473 		}
4474 	}
4475 	return NULL;
4476 }
4477 
/*
 * Estimate the mbuf space consumed by a socket: the send and receive
 * buffer mbuf byte counts, plus (for TCP over IPv4/IPv6) the space
 * held by the reassembly queue.  Returns 0 for a NULL socket.
 */
static uint32_t
mbuf_watchdog_socket_space(struct socket *so)
{
	uint32_t space = 0;

	if (so == NULL) {
		return 0;
	}

	space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;

#if INET
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    SOCK_PROTO(so) == IPPROTO_TCP) {
		space += tcp_reass_qlen_space(so);
	}
#endif /* INET */

	return space;
}
4498 
/*
 * Accumulator passed to mbuf_watchdog_defunct_iterate() to track the
 * process consuming the most socket mbuf space.
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;           /* current top consumer (proc ref held) */
	uint32_t top_app_space_used;    /* its estimated mbuf space usage */
	bool non_blocking;              /* use proc_fd_trylock() instead of blocking */
};
4504 
/*
 * Attempt to take the process's file-descriptor table lock without
 * blocking; returns true if the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
4510 
#if !CONFIG_MBUF_MCACHE
static
#endif
/*
 * proc_iterate() callback: sum the mbuf space used by every socket of
 * process `p' and, if it exceeds the current maximum, remember `p' as
 * the top consumer in args->top_app (keeping a proc reference on it
 * via PROC_CLAIMED and releasing the previous holder).
 */
int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		socket_ref_t so = NULL;

		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* New top consumer: drop the ref on the previous one. */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
4564 
4565 extern char *proc_name_address(void *p);
4566 
#if !CONFIG_MBUF_MCACHE
/*
 * Thread call fired when mbuf classes remain exhausted: find the
 * process using the most socket mbuf space, defunct all of its
 * sockets, then drain the mbuf/cluster caches and zones to recover
 * memory.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets we cannot lock without blocking. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Release the ref taken by the iterator. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
		/* Give freed memory back: caches first, then the zones. */
		zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
		zone_drain(zone_by_id(ZONE_ID_MBUF));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
		zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
	}
}
4620 
/* Serializes updates to mbuf_exhausted_mask and the watchdog timers. */
static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
/* Bitmask of currently-exhausted mbuf classes (1u << MC_*). */
static uint32_t mbuf_exhausted_mask;

/* Classes whose exhaustion schedules the composite-cache drain call. */
#define MBUF_EXHAUSTED_DRAIN_MASK  (\
	(1u << MC_MBUF) | \
	(1u << MC_CL) | \
	(1u << MC_BIGCL) | \
	(1u << MC_16KCL))

/* Classes whose exhaustion schedules the socket-defunct call. */
#define MBUF_EXHAUSTED_DEFUNCT_MASK  (\
	(1u << MC_MBUF) | \
	(1u << MC_MBUF_CL) | \
	(1u << MC_MBUF_BIGCL) | \
	(1u << MC_MBUF_16KCL))
4636 
/*
 * Thread call that drains the cached composite (mbuf + cluster)
 * allocations back to their underlying zones.
 */
static void
mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
	zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
	zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
}
4645 
/*
 * Record that the class identified by `bit' became exhausted and, on
 * the first exhausted class of each group, schedule the corresponding
 * delayed watchdog call (drain and/or defunct).  Called with
 * mbuf_exhausted_lock held (see mbuf_zone_exhausted()).
 */
static void
mbuf_zone_exhausted_start(uint32_t bit)
{
	uint64_t deadline;
	uint32_t mask;

	mask = mbuf_exhausted_mask;
	mbuf_exhausted_mask = mask | bit;

	/* Arm the drain timer at 1/10th of the watchdog interval. */
	if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_drain_tcall, deadline);
	}

	/* Arm the defunct timer at half of the watchdog interval. */
	if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
	}
}
4669 
/*
 * Record that the class identified by `bit' is no longer exhausted;
 * when the last class of a group clears, cancel that group's pending
 * watchdog call.  Called with mbuf_exhausted_lock held.
 */
static void
mbuf_zone_exhausted_end(uint32_t bit)
{
	uint32_t mask;

	mask = (mbuf_exhausted_mask &= ~bit);

	if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
		thread_call_cancel(mbuf_drain_tcall);
	}

	if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
		thread_call_cancel(mbuf_defunct_tcall);
	}
}
4687 
/*
 * ZONE_EXHAUSTED event handler: map the zone id back to its mbuf
 * class and start or stop the exhaustion watchdog state accordingly.
 * Zone ids outside the mbuf class range are ignored.
 */
static void
mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
{
	uint32_t bit;

	if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
	    zid > m_class_to_zid(MBUF_CLASS_MAX)) {
		return;
	}

	bit = 1u << m_class_from_zid(zid);

	lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);

	if (exhausted) {
		mbuf_zone_exhausted_start(bit);
	} else {
		mbuf_zone_exhausted_end(bit);
	}

	lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
}
4710 EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
4711 #endif /* !CONFIG_MBUF_MCACHE */
4712 
/*
 * Convert between a regular and a packet header mbuf.  Caller is responsible
 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
 * Returns 0 on success, or EBUSY when converting to a header mbuf would
 * overwrite user data already stored in the mbuf.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != (uintptr_t)m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
			    "m_data %llx (expected %llx), "
			    "m_len %d (expected 0)\n",
			    __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat);
			m->m_flags |= M_PKTHDR;
			mbuf_init_pkthdr(m);
		}
	} else {
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);
		m_do_tx_compl_callback(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return ret;
}
4755 
/*
 * Atomically compare-and-swap the external cluster's private property
 * word from `o' to `n'.  Returns the result of the compare-exchange
 * (non-zero when the swap took place).
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
}
4762 
/*
 * Return the external cluster's private property word.
 */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
4769 
/*
 * For a paired mbuf, report whether it is currently active, i.e. its
 * pairing refcount is above the minimum.  Non-paired mbufs are always
 * considered active.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
4775 
/*
 * Reactivate a paired mbuf whose references have all dropped back to
 * the minimum: reinitialize the mbuf and re-attach its paired external
 * buffer via mext_init().
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Save everything mbuf_init() would otherwise wipe. */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time.  Similar to
	 * what's done in m_clattach() for the cluster.  Bump
	 * up MEXT_PREF to indicate activation.
	 */
	mbuf_init(m, hdr, type);
	mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
4808 
#if !CONFIG_MBUF_MCACHE
/*
 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
 * xnu that intend on utilizing the module-private area should directly
 * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
 * to handing it off to another module, respectively.
 *
 * Points *p at the pkt_mpriv scratch area and returns its size in
 * bytes; panics if the area is currently guarded (PKTF_PRIV_GUARDED).
 */
uint32_t
m_scratch_get(struct mbuf *m, uint8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	*p = (uint8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
#endif /* !CONFIG_MBUF_MCACHE */
4835 
/*
 * Record a debugging crumb bit in the packet header's pkt_crumbs.
 * The mbuf must have M_PKTHDR set.
 */
void
m_add_crumb(struct mbuf *m, uint16_t crumb)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_crumbs |= crumb;
}
4843 
4844 void
m_add_hdr_crumb(struct mbuf * m,uint64_t crumb,uint64_t flag)4845 m_add_hdr_crumb(struct mbuf *m, uint64_t crumb, uint64_t flag)
4846 {
4847 #if defined(__arm64__)
4848 	while (m != NULL) {
4849 		m->m_mhdrcommon_crumbs &= ~flag;
4850 		m->m_mhdrcommon_crumbs |= (crumb & flag);
4851 		m = m->m_next;
4852 	}
4853 #else
4854 #pragma unused(m, crumb, flag)
4855 #endif /*__arm64__*/
4856 }
4857 
4858 void
m_add_hdr_crumb_chain(struct mbuf * head,uint64_t crumb,uint64_t flag)4859 m_add_hdr_crumb_chain(struct mbuf *head, uint64_t crumb, uint64_t flag)
4860 {
4861 #if defined(__arm64__)
4862 	while (head) {
4863 		/* This assumes that we might have a chain of mbuf chains */
4864 		m_add_hdr_crumb(head, crumb, flag);
4865 		head = head->m_nextpkt;
4866 	}
4867 #else
4868 #pragma unused(head, crumb, flag)
4869 #endif /*__arm64__*/
4870 }
4871 
4872 SYSCTL_DECL(_kern_ipc);
4873 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
4874     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
4875     0, 0, mbstat_sysctl, "S,mbstat", "");
4876 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
4877     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
4878     0, 0, mb_stat_sysctl, "S,mb_stat", "");
4879 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
4880     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
4881     "Percentage of when we trigger memory-pressure for an mbuf-class");
4882