1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <stdint.h>
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/kernel.h>
78 #include <sys/sysctl.h>
79 #include <sys/syslog.h>
80 #include <sys/protosw.h>
81 #include <sys/domain.h>
82 #include <sys/queue.h>
83 #include <sys/proc.h>
84 #include <sys/filedesc.h>
85 #include <sys/file_internal.h>
86
87 #include <vm/vm_kern_xnu.h>
88
89 #include <dev/random/randomdev.h>
90
91 #include <kern/kern_types.h>
92 #include <kern/simple_lock.h>
93 #include <kern/queue.h>
94 #include <kern/sched_prim.h>
95 #include <kern/backtrace.h>
96 #include <kern/percpu.h>
97 #include <kern/zalloc.h>
98
99 #include <libkern/OSDebug.h>
100 #include <libkern/libkern.h>
101
102 #include <os/log.h>
103 #include <os/ptrtools.h>
104
105 #include <machine/limits.h>
106 #include <machine/machine_routines.h>
107
108 #include <net/droptap.h>
109 #include <net/ntstat.h>
110
111 #if INET
112 extern int tcp_reass_qlen_space(struct socket *);
113 #endif /* INET */
114
115 /*
116 * MBUF IMPLEMENTATION NOTES (using zalloc).
117 *
118 * There are a total of 4 zones and 3 zcaches.
119 *
120 * MC_MBUF:
121 * This is a zone of rudimentary objects of _MSIZE in size; each
122 * object represents an mbuf structure. This cache preserves only
123 * the m_type field of the mbuf during its transactions.
124 *
125 * MC_CL:
126 * This is a zone of rudimentary objects of MCLBYTES in size; each
127 * object represents a mcluster structure. This cache does not
128 * preserve the contents of the objects during its transactions.
129 *
130 * MC_BIGCL:
131 * This is a zone of rudimentary objects of MBIGCLBYTES in size; each
132 * object represents a mbigcluster structure. This cache does not
133 * preserve the contents of the objects during its transaction.
134 *
135 * MC_16KCL:
136 * This is a zone of rudimentary objects of M16KCLBYTES in size; each
137 * object represents a m16kcluster structure. This cache does not
138 * preserve the contents of the objects during its transaction.
139 *
140 * MC_MBUF_CL:
141 * This is a cache of mbufs each having a cluster attached to it.
142 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
143 * fields of the mbuf related to the external cluster are preserved
144 * during transactions.
145 *
146 * MC_MBUF_BIGCL:
147 * This is a cache of mbufs each having a big cluster attached to it.
148 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
149 * fields of the mbuf related to the external cluster are preserved
150 * during transactions.
151 *
152 * MC_MBUF_16KCL:
 * This is a cache of mbufs each having a 16KB cluster attached to it.
154 * It is backed by MC_MBUF and MC_16KCL rudimentary caches. Several
155 * fields of the mbuf related to the external cluster are preserved
156 * during transactions.
157 *
158 * OBJECT ALLOCATION:
159 *
160 * Allocation requests are handled first at the zalloc per-CPU layer
161 * before falling back to the zalloc depot. Performance is optimal when
162 * the request is satisfied at the CPU layer. zalloc has an additional
163 * overflow layer called the depot, not pictured in the diagram below.
164 *
165 * Allocation paths are different depending on the class of objects:
166 *
167 * a. Rudimentary object:
168 *
169 * { m_get_common(), m_clattach(), m_mclget(),
170 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
171 * composite object allocation }
172 * | ^
173 * | |
174 * | +------- (done) --------+
175 * v |
176 * zalloc_flags/zalloc_n() KASAN
177 * | ^
178 * v |
179 * +----> [zalloc per-CPU cache] -----> (found?) --+
180 * | | |
181 * | v |
182 * | [zalloc recirculation layer] --> (found?) ---+
183 * | |
184 * | v
185 * +--<<-- [zone backing store]
186 *
187 * b. Composite object:
188 *
189 * { m_getpackets_internal(), m_allocpacket_internal() }
190 * | ^
191 * | |
192 * | +------ (done) ---------+
193 * v |
194 * mz_composite_alloc() KASAN
195 * | ^
196 * v |
197 * zcache_alloc_n() |
198 * | |
199 * v |
200 * [zalloc per-CPU cache] --> mark_valid() ---+
201 * | |
202 * v |
203 * [zalloc recirculation layer] -> mark_valid() -+
204 * | |
205 * v |
206 * mz_composite_build() |
207 * | |
208 * v |
209 * (rudimentary objects) |
210 * zalloc_id() ---------------->>-----+
211 *
212 * Auditing notes: If KASAN enabled, buffers will be subjected to
213 * integrity checks by the AddressSanitizer.
214 *
215 * OBJECT DEALLOCATION:
216 *
217 * Freeing an object simply involves placing it into the CPU cache; this
218 * pollutes the cache to benefit subsequent allocations. The depot
219 * will only be entered if the object is to be purged out of the cache.
220 * Objects may be purged based on the overall memory pressure or
221 * during zone garbage collection.
 * To improve performance, objects are not zero-filled when freed,
 * as is customary for other zalloc zones.
224 *
225 * Deallocation paths are different depending on the class of objects:
226 *
227 * a. Rudimentary object:
228 *
229 * { m_free(), m_freem_list(), composite object deallocation }
230 * | ^
231 * | |
232 * | +------ (done) ---------+
233 * v |
234 * zfree_nozero() |
235 * | |
236 * v |
237 * KASAN |
238 * | |
239 * v |
240 * [zalloc per-CPU cache] -> (not purging?) --+
241 * | |
242 * v |
243 * [zalloc recirculation layer] --->>----------+
244 *
245 *
246 * b. Composite object:
247 *
248 * { m_free(), m_freem_list() }
249 * | ^
250 * | |
251 * | +------ (done) ---------+
252 * v |
253 * mz_composite_free() |
254 * | |
255 * v |
256 * zcache_free_n() |
257 * | |
258 * v |
259 * KASAN |
260 * | |
261 * v |
262 * [zalloc per-CPU cache] -> mark_invalid() --+
263 * | |
264 * v |
265 * mz_composite_destroy() |
266 * | |
267 * v |
268 * (rudimentary object) |
269 * zfree_nozero() -------------->>------+
270 *
271 * Auditing notes: If KASAN enabled, buffers will be subjected to
272 * integrity checks by the AddressSanitizer.
273 *
274 * DEBUGGING:
275 *
276 * Debugging mbufs can be done by booting a KASAN enabled kernel.
277 */
278
279
280 /*
281 * Convention typedefs for local __single pointers.
282 */
/* Reference to a zone object (what a `zone_t` points to, __single pointer). */
typedef typeof(*((zone_t)0)) *__single zone_ref_t;
/* Untyped single-object pointer (used to stage sysctl output below). */
typedef void * __single any_ref_t;

/* Global lock */
static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
#if !CONFIG_MBUF_MCACHE
static
#endif
lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
293
/* Globals */
#if !CONFIG_MBUF_MCACHE
static
#endif
int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
int njcl; /* # of clusters for jumbo sizes */
int njclbytes; /* size of a jumbo cluster */
int max_linkhdr; /* largest link-level header */
int max_protohdr; /* largest protocol header */
int max_hdr; /* largest link+protocol header */
int max_datalen; /* MHLEN - max_hdr */

/* Lock to protect the completion callback table */
static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);

/* Shorthand accessors for the per-class statistics kept in mbuf_table. */
#define m_stats(c) mbuf_table[c].mtbl_stats
#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
312
#if !CONFIG_MBUF_MCACHE
/*
 * Note: number of entries in mbuf_table must not exceed
 * MB_STAT_MAX_MB_CLASSES
 */
static mbuf_table_t mbuf_table[] = {
	{ .mtbl_class = MC_MBUF },
	{ .mtbl_class = MC_CL },
	{ .mtbl_class = MC_BIGCL },
	{ .mtbl_class = MC_16KCL },
	{ .mtbl_class = MC_MBUF_CL },
	{ .mtbl_class = MC_MBUF_BIGCL },
	{ .mtbl_class = MC_MBUF_16KCL },
};
#endif /* !CONFIG_MBUF_MCACHE */

#if !CONFIG_MBUF_MCACHE
static
#endif /* !CONFIG_MBUF_MCACHE */
/* Pressure threshold tunable; presumably a percentage — consumers are outside this chunk. */
unsigned int mb_memory_pressure_percentage = 80;
333
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
#if !CONFIG_MBUF_MCACHE
/* Watchdog thread-call handlers (definitions appear later in this file). */
static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
/* zalloc-backed allocators for the rudimentary mbuf objects. */
static struct mbuf *mz_alloc(zalloc_flags_t);
static void mz_free(struct mbuf *);
static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
static void mz_ref_free(struct ext_ref *);
static void * __bidi_indexable mz_cl_alloc(zone_id_t, zalloc_flags_t);
static void mz_cl_free(zone_id_t, void *);
/* Composite (mbuf + ext_ref + cluster) zcache operations. */
static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
static void mz_composite_free(mbuf_class_t, struct mbuf *);
static void mz_composite_free_n(mbuf_class_t, zstack_t);
static void *mz_composite_build(zone_id_t, zalloc_flags_t);
static void *mz_composite_mark_valid(zone_id_t, void *);
static void *mz_composite_mark_invalid(zone_id_t, void *);
static void mz_composite_destroy(zone_id_t, void *);

/*
 * Rudimentary zones (see the MBUF IMPLEMENTATION NOTES above). Elements
 * are released with zfree_nozero(), i.e. they are not zero-filled on free.
 */
ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
    ZC_CACHING | ZC_KASAN_NOQUARANTINE);
ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
    ZC_CACHING | ZC_KASAN_NOQUARANTINE);
ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
    ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
    ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
    ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA);
/* The cluster unions must match the sizes the mbuf classes advertise. */
static_assert(sizeof(union mcluster) == MCLBYTES);
static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
static_assert(sizeof(union m16kcluster) == M16KCLBYTES);

/* Callbacks used by the composite zcaches to build/validate/destroy elements. */
static const struct zone_cache_ops mz_composite_ops = {
	.zc_op_alloc = mz_composite_build,
	.zc_op_mark_valid = mz_composite_mark_valid,
	.zc_op_mark_invalid = mz_composite_mark_invalid,
	.zc_op_free = mz_composite_destroy,
};
ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
    sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
    &mz_composite_ops);
ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
    sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
    &mz_composite_ops);
ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
    sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
    &mz_composite_ops);
/*
 * m_class_to_zid()/m_class_from_zid() below rely on the zone IDs being
 * laid out in exactly the same order as the mbuf_class_t values.
 */
static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
391 /* Converts a an mbuf class to a zalloc zone ID. */
392 __attribute__((always_inline))
393 static inline zone_id_t
m_class_to_zid(mbuf_class_t class)394 m_class_to_zid(mbuf_class_t class)
395 {
396 return ZONE_ID_MBUF + class - MC_MBUF;
397 }
398
399 __attribute__((always_inline))
400 static inline mbuf_class_t
m_class_from_zid(zone_id_t zid)401 m_class_from_zid(zone_id_t zid)
402 {
403 return MC_MBUF + zid - ZONE_ID_MBUF;
404 }
405
/* Thread calls backing the mbuf watchdog (defunct sweep / composite drain). */
static thread_call_t mbuf_defunct_tcall;
static thread_call_t mbuf_drain_tcall;
#endif /* !CONFIG_MBUF_MCACHE */

static int m_copyback0(struct mbuf **, int, int len, const void * __sized_by_or_null(len), int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);

/* flags for m_copyback0 */
#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */

/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock. It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat; /* For backwards compatibility */

/* Export sizes for a stats table truncated after `n` class records. */
#define MB_STAT_SIZE(n) \
	__builtin_offsetof(mb_stat_t, mbs_class[n])

#define OMB_STAT_SIZE(n) \
	__builtin_offsetof(struct omb_stat, mbs_class[n])

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

/* Number of entries in the legacy mbstat.m_mtypes counter array. */
#define MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

#if !CONFIG_MBUF_MCACHE
static
#endif
/* Per-CPU mbuf-type counters, folded into mbstat by mbuf_mtypes_sync(). */
mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
454
455 __private_extern__ inline struct ext_ref *
m_get_rfa(struct mbuf * m)456 m_get_rfa(struct mbuf *m)
457 {
458 return m->m_ext.ext_refflags;
459 }
460
461 __private_extern__ inline m_ext_free_func_t
m_get_ext_free(struct mbuf * m)462 m_get_ext_free(struct mbuf *m)
463 {
464 if (m->m_ext.ext_free == NULL) {
465 return NULL;
466 }
467
468 return ptrauth_nop_cast(m_ext_free_func_t, m->m_ext.ext_free);
469 }
470
#if !CONFIG_MBUF_MCACHE
static
#endif
caddr_t
m_get_ext_arg(struct mbuf *m)
{
	/* Opaque argument stored alongside the external free routine. */
	return (caddr_t)m->m_ext.ext_arg;
}
479
480 #if !CONFIG_MBUF_MCACHE
481 static
482 #endif
483 void
m_set_ext(struct mbuf * m,struct ext_ref * rfa,m_ext_free_func_t ext_free,caddr_t ext_arg)484 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
485 caddr_t ext_arg)
486 {
487 VERIFY(m->m_flags & M_EXT);
488 if (rfa != NULL) {
489 m->m_ext.ext_refflags = rfa;
490 if (ext_free != NULL) {
491 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free);
492 m->m_ext.ext_arg = ext_arg;
493 } else {
494 m->m_ext.ext_free = NULL;
495 m->m_ext.ext_arg = NULL;
496 }
497 } else {
498 if (ext_free != NULL) {
499 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free);
500 m->m_ext.ext_arg = ext_arg;
501 } else {
502 m->m_ext.ext_free = NULL;
503 m->m_ext.ext_arg = NULL;
504 }
505 m->m_ext.ext_refflags = NULL;
506 }
507 }
508
#if !CONFIG_MBUF_MCACHE
static
#endif
void
mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size,
    m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa,
    u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag,
    u_int32_t priv, struct mbuf *pm)
{
	/*
	 * Attach external storage `buf` of `size` bytes to `m` and seed
	 * the ext_ref block with the supplied reference counts, flags,
	 * private word and paired mbuf pointer.
	 */
	m->m_ext.ext_buf = buf;
	m->m_ext.ext_size = size;
	m->m_data = (uintptr_t)m->m_ext.ext_buf; /* data starts at buffer head */
	m->m_len = 0;
	m->m_flags |= M_EXT; /* must be set before m_set_ext() (VERIFYed there) */
	m_set_ext(m, rfa, free, free_arg);
	MEXT_MINREF(m) = min;
	MEXT_REF(m) = ref;
	MEXT_PREF(m) = pref;
	MEXT_FLAGS(m) = flag;
	MEXT_PRIV(m) = priv;
	MEXT_PMBUF(m) = pm;
}
531
#if !CONFIG_MBUF_MCACHE
static
#endif
void
mbuf_mtypes_sync(void)
{
	/*
	 * Fold the per-CPU mbuf type counters into the legacy
	 * mbstat.m_mtypes array. Caller must hold mbuf_mlock.
	 */
	mbuf_mtypes_t mtc;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Start with the master CPU's counters, then add each secondary. */
	mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
	percpu_foreach_secondary(mtype, mbuf_mtypes) {
		for (int n = 0; n < MT_MAX; n++) {
			mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
		}
	}

	/* Publish the folded totals into the exported structure. */
	for (int n = 0; n < MT_MAX; n++) {
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	}
}
553
554 #if !CONFIG_MBUF_MCACHE
555 static void
mbuf_stat_sync(void)556 mbuf_stat_sync(void)
557 {
558 mb_class_stat_t *sp;
559 int k;
560 uint64_t drops = 0;
561
562
563 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
564
565 for (k = 0; k < MC_MAX; k++) {
566 const zone_id_t zid = m_class_to_zid(m_class(k));
567 const zone_ref_t zone = zone_by_id(zid);
568 struct zone_basic_stats stats = {};
569
570 sp = m_stats(k);
571 zone_get_stats(zone, &stats);
572 drops += stats.zbs_alloc_fail;
573 sp->mbcl_total = stats.zbs_avail;
574 sp->mbcl_active = stats.zbs_alloc;
575 /*
576 * infree is what mcache considers the freelist (uncached)
577 * free_cnt contains all the cached/uncached elements
578 * in a zone.
579 */
580 sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
581 sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
582 sp->mbcl_ctotal = sp->mbcl_total;
583
584 /* These stats are not available in zalloc. */
585 sp->mbcl_alloc_cnt = 0;
586 sp->mbcl_free_cnt = 0;
587 sp->mbcl_notified = 0;
588 sp->mbcl_purge_cnt = 0;
589 sp->mbcl_slab_cnt = 0;
590 sp->mbcl_release_cnt = 0;
591
592 /* zalloc caches are always on. */
593 sp->mbcl_mc_state = MCS_ONLINE;
594 sp->mbcl_mc_cached = stats.zbs_cached;
595 /* These stats are not collected by zalloc. */
596 sp->mbcl_mc_waiter_cnt = 0;
597 sp->mbcl_mc_wretry_cnt = 0;
598 sp->mbcl_mc_nwretry_cnt = 0;
599 }
600 /* Deduct clusters used in composite cache */
601 m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
602 m_total(MC_MBUF_BIGCL) -
603 m_total(MC_MBUF_16KCL));
604 m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
605 m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
606 m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
607
608 /* Update mbstat. */
609 mbstat.m_mbufs = m_total(MC_MBUF);
610 mbstat.m_clusters = m_total(MC_CL);
611 mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
612 mbstat.m_drops = drops;
613 mbstat.m_bigclusters = m_total(MC_BIGCL);
614 mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
615 }
616 #endif /* !CONFIG_MBUF_MCACHE */
617
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	/* Refresh the legacy mbstat snapshot under the lock, then copy it out. */
	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();
	mbuf_mtypes_sync();
	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
}
630
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	any_ref_t statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		/*
		 * 32-bit caller: translate the native mb_stat records into
		 * the legacy omb_stat layout, field by field.
		 */
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(MC_MAX);
	} else {
		/* 64-bit caller: export the native structure directly. */
		statp = mb_stat;
		statsz = MB_STAT_SIZE(MC_MAX);
	}

	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, statp, statsz);
}
680
681 #if !CONFIG_MBUF_MCACHE
682 static void
mbuf_mcheck(struct mbuf * m)683 mbuf_mcheck(struct mbuf *m)
684 {
685 if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) {
686 panic("MCHECK: m_type=%d m=%p",
687 (u_int16_t)(m)->m_type, m);
688 }
689 }
690 #endif /* !CONFIG_MBUF_MCACHE */
691
/*
 * Zero the module-private scratch area of a packet header, panicking
 * if a module has marked the area as guarded (PKTF_PRIV_GUARDED).
 */
static void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
708
709
/* Reset every packet-header field of `m` to its pristine state. */
static void
mbuf_init_pkthdr(struct mbuf *m)
{
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.pkt_hdr = NULL;
	m->m_pkthdr.len = 0;
	m->m_pkthdr.csum_flags = 0;
	m->m_pkthdr.csum_data = 0;
	m->m_pkthdr.vlan_tag = 0;
	m->m_pkthdr.comp_gencnt = 0;
	m->m_pkthdr.pkt_crumbs = 0;
	m_classifier_init(m, 0);	/* reset classifier state */
	m_tag_init(m, 1);		/* initialize the tag chain */
	m_scratch_init(m);		/* zero the module-private area */
}
725
726 #if !CONFIG_MBUF_MCACHE
727 static
728 #endif
729 void
mbuf_init(struct mbuf * m,int pkthdr,int type)730 mbuf_init(struct mbuf *m, int pkthdr, int type)
731 {
732 mbuf_mcheck(m);
733 m->m_next = m->m_nextpkt = NULL;
734 m->m_len = 0;
735 m->m_type = type;
736 if (pkthdr == 0) {
737 m->m_data = (uintptr_t)m->m_dat;
738 m->m_flags = 0;
739 } else {
740 m->m_data = (uintptr_t)m->m_pktdat;
741 m->m_flags = M_PKTHDR;
742 mbuf_init_pkthdr(m);
743 }
744 }
745
746
747 #if !CONFIG_MBUF_MCACHE
748 /*
749 * The following functions are wrappers around mbuf
750 * allocation for zalloc. They all have the prefix "mz"
751 * which was chosen to avoid conflicts with the mbuf KPIs.
752 *
753 * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
754 * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
755 * fail immediately if it has to take a mutex and that
756 * may cause packets to be dropped more frequently.
757 * In general, the mbuf subsystem can sustain grabbing a mutex
758 * during "non-blocking" allocation and that's the reason
759 * why Z_NOPAGEWAIT was chosen.
760 *
761 * mbufs are elided (removed all pointers) before they are
762 * returned to the cache. The exception are composite mbufs which
763 * are re-initialized on allocation.
764 */
__attribute__((always_inline))
static inline void
m_elide(struct mbuf *m)
{
	/*
	 * Scrub all pointers from an mbuf before it returns to its zone
	 * (the zones are not zero-filled on free). M_PKTHDR is set
	 * temporarily so the pkthdr helpers may run, then restored.
	 */
	m->m_next = m->m_nextpkt = NULL;
	m->m_data = 0;
	memset(&m->m_ext, 0, sizeof(m->m_ext));
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.pkt_hdr = NULL;
	m->m_flags |= M_PKTHDR;		/* temporary: allow pkthdr helpers */
	m_tag_init(m, 1);
	m->m_pkthdr.pkt_flags = 0;	/* must clear before m_scratch_init() checks PKTF_PRIV_GUARDED */
	m_scratch_init(m);
	m->m_flags &= ~M_PKTHDR;	/* restore */
}
780
781 __attribute__((always_inline))
782 static inline struct mbuf *
mz_alloc(zalloc_flags_t flags)783 mz_alloc(zalloc_flags_t flags)
784 {
785 if (flags & Z_NOWAIT) {
786 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
787 } else if (!(flags & Z_NOPAGEWAIT)) {
788 flags |= Z_NOFAIL;
789 }
790 return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
791 }
792
793 __attribute__((always_inline))
794 static inline zstack_t
mz_alloc_n(uint32_t count,zalloc_flags_t flags)795 mz_alloc_n(uint32_t count, zalloc_flags_t flags)
796 {
797 if (flags & Z_NOWAIT) {
798 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
799 } else if (!(flags & Z_NOPAGEWAIT)) {
800 flags |= Z_NOFAIL;
801 }
802 return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
803 }
804
__attribute__((always_inline))
static inline void
mz_free(struct mbuf *m)
{
	/* Scrub all pointers, then return the mbuf to its zone unzeroed. */
#if KASAN
	/* Trap frees of elements that did not come from the mbuf zone. */
	zone_require(zone_by_id(ZONE_ID_MBUF), m);
#endif
	m_elide(m);
	zfree_nozero(ZONE_ID_MBUF, m);
}

__attribute__((always_inline))
static inline void
mz_free_n(zstack_t list)
{
	/* Callers of this function have already elided the mbuf. */
	zfree_nozero_n(ZONE_ID_MBUF, list);
}
823
824 __attribute__((always_inline))
825 static inline struct ext_ref *
mz_ref_alloc(zalloc_flags_t flags)826 mz_ref_alloc(zalloc_flags_t flags)
827 {
828 if (flags & Z_NOWAIT) {
829 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
830 }
831 return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
832 }
833
__attribute__((always_inline))
static inline void
mz_ref_free(struct ext_ref *rfa)
{
	/* The reference block must be fully released before it is freed. */
	VERIFY(rfa->minref == rfa->refcnt);
#if KASAN
	zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
#endif
	zfree_nozero(ZONE_ID_MBUF_REF, rfa);
}

__attribute__((always_inline))
static inline void * __bidi_indexable
mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
{
	/*
	 * Allocate one cluster from zone `zid`, with the same waiting-policy
	 * translation as mz_alloc() (Z_NOWAIT -> Z_NOPAGEWAIT, blocking ->
	 * Z_NOFAIL).
	 */
	void * p __unsafe_indexable;
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	flags |= Z_NOZZC;

	/*
	 * N.B. Invoking `(zalloc_id)' directly, vs. via `zalloc_id' macro.
	 */
	p = (zalloc_id)(zid, flags);
	/* Re-attach the element's bounds for -fbounds-safety consumers. */
	return __unsafe_forge_bidi_indexable(void *, p, zone_get_elem_size(zone_by_id(zid)));
}

__attribute__((always_inline))
static inline void
mz_cl_free(zone_id_t zid, void *cl)
{
	/* Return a cluster to its zone, unzeroed. */
#if KASAN
	zone_require(zone_by_id(zid), cl);
#endif
	zfree_nozero(zid, cl);
}
873
__attribute__((always_inline))
static inline zstack_t
mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
{
	/*
	 * Allocate `n` composite mbufs from the class's zcache.
	 * Z_NOWAIT is swapped for Z_NOPAGEWAIT (see block comment above).
	 */
	if (flags & Z_NOWAIT) {
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	}
	return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
	           &mz_composite_ops);
}
884
885 __attribute__((always_inline))
886 static inline struct mbuf *
mz_composite_alloc(mbuf_class_t class,zalloc_flags_t flags)887 mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
888 {
889 zstack_t list = {};
890 list = mz_composite_alloc_n(class, 1, flags);
891 if (!zstack_empty(list)) {
892 return zstack_pop(&list);
893 } else {
894 return NULL;
895 }
896 }
897
__attribute__((always_inline))
static inline void
mz_composite_free_n(mbuf_class_t class, zstack_t list)
{
	/* Return a stack of composite mbufs to the class's zcache. */
	(zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
}
904
905 __attribute__((always_inline))
906 static inline void
mz_composite_free(mbuf_class_t class,struct mbuf * m)907 mz_composite_free(mbuf_class_t class, struct mbuf *m)
908 {
909 zstack_t list = {};
910 zstack_push(&list, m);
911 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
912 }
913
914 /* Converts composite zone ID to the cluster zone ID. */
915 __attribute__((always_inline))
916 static inline zone_id_t
mz_cl_zid(zone_id_t zid)917 mz_cl_zid(zone_id_t zid)
918 {
919 return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
920 }
921
/*
 * zcache callback: assemble a composite element for cache `zid` from its
 * three rudimentary parts (cluster, ext_ref, mbuf). On failure, the parts
 * already allocated are released in reverse order and NULL is returned.
 */
static void *
mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct mbuf *m = NULL;
	struct ext_ref *rfa = NULL;
	void *cl = NULL;

	cl = mz_cl_alloc(cl_zid, flags);
	if (__improbable(cl == NULL)) {
		goto out;
	}
	rfa = mz_ref_alloc(flags);
	if (__improbable(rfa == NULL)) {
		goto out_free_cl;
	}
	m = mz_alloc(flags);
	if (__improbable(m == NULL)) {
		goto out_free_rfa;
	}
	mbuf_init(m, 0, MT_FREE);
	/* Attach the cluster according to the composite class size. */
	if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
		MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
		MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else {
		MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	}
	VERIFY(m->m_flags == M_EXT);
	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

	return m;
out_free_rfa:
	mz_ref_free(rfa);
out_free_cl:
	mz_cl_free(cl_zid, cl);
out:
	return NULL;
}
961
/*
 * Mark a composite element as valid when it leaves the zone cache.
 * Under KASAN the constituent mbuf, cluster and ext_ref each carry
 * their own shadow state, so every piece must be individually
 * revalidated and the (possibly rewritten) pointers stored back.
 */
static void *
mz_composite_mark_valid(zone_id_t zid, void *p)
{
	mbuf_ref_t m = p;

	m = zcache_mark_valid_single(zone_by_id(ZONE_ID_MBUF), m);
#if KASAN
	struct ext_ref *rfa __single = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = __unsafe_forge_bidi_indexable(void *,
	    zcache_mark_valid(zone_by_id(cl_zid), cl),
	    zone_get_elem_size(zone_by_id(cl_zid)));
	rfa = __unsafe_forge_single(struct ext_ref *,
	    zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa));
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	/*
	 * NOTE(review): self-assignment of ext_size looks redundant;
	 * presumably it satisfies -fbounds-safety, which requires the
	 * size of a __sized_by buffer to be re-assigned together with
	 * the pointer — TODO confirm before removing.
	 */
	m->m_ext.ext_size = m->m_ext.ext_size;
	m->m_ext.ext_refflags = rfa;
#else
#pragma unused(zid)
#endif
	VERIFY(MBUF_IS_COMPOSITE(m));

	return m;
}
989
/*
 * Mark a composite element as invalid when it is cached back into the
 * zone cache.  The element must hold only its minimum reference count
 * (i.e. no outside user still references the cluster).  Under KASAN,
 * each constituent piece is invalidated individually (mirror image of
 * mz_composite_mark_valid()).
 */
static void *
mz_composite_mark_invalid(zone_id_t zid, void *p)
{
	mbuf_ref_t m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
#if KASAN
	struct ext_ref *rfa __single = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = __unsafe_forge_bidi_indexable(void *,
	    zcache_mark_invalid(zone_by_id(cl_zid), cl),
	    zone_get_elem_size(zone_by_id(cl_zid)));
	rfa = __unsafe_forge_single(struct ext_ref *,
	    zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa));
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	/*
	 * NOTE(review): redundant-looking self-assignment; presumably
	 * required by -fbounds-safety when re-assigning the __sized_by
	 * buffer pointer above — TODO confirm before removing.
	 */
	m->m_ext.ext_size = m->m_ext.ext_size;
	m->m_ext.ext_refflags = rfa;
#else
#pragma unused(zid)
#endif

	return zcache_mark_invalid_single(zone_by_id(ZONE_ID_MBUF), m);
}
1017
/*
 * Tear a composite element apart and return its three constituents
 * (cluster, ext_ref, mbuf) to their respective zones.  Inverse of
 * mz_composite_build().
 */
static void
mz_composite_destroy(zone_id_t zid, void *p)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct ext_ref *rfa = NULL;
	mbuf_ref_t m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));

	/* Clear all external-ref accounting before dismantling. */
	MEXT_MINREF(m) = 0;
	MEXT_REF(m) = 0;
	MEXT_PREF(m) = 0;
	MEXT_FLAGS(m) = 0;
	MEXT_PRIV(m) = 0;
	MEXT_PMBUF(m) = NULL;

	/* Detach the ref structure; keep the pointer so it can be freed. */
	rfa = m_get_rfa(m);
	m_set_ext(m, NULL, NULL, NULL);

	/* Scrub the mbuf header back to a pristine free state. */
	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	/* Free cluster first (ext_buf is still valid here), then rfa, then mbuf. */
	mz_cl_free(cl_zid, m->m_ext.ext_buf);
	m->m_ext.ext_size = 0;
	m->m_ext.ext_buf = NULL;
	mz_ref_free(rfa);
	mz_free(m);
}
#endif /* !CONFIG_MBUF_MCACHE */
1048
#if !CONFIG_MBUF_MCACHE
static
#endif
void
m_incref(struct mbuf *m)
{
	/* Atomically take one more reference on the external cluster. */
	uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);

	VERIFY(new != 0);
	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
	}
}
1067
#if !CONFIG_MBUF_MCACHE
static
#endif
uint16_t
m_decref(struct mbuf *m)
{
	VERIFY(MEXT_REF(m) != 0);

	/*
	 * Drop one reference and return the new count; acq_rel ordering
	 * so the caller's subsequent frees are properly ordered against
	 * other threads' accesses to the cluster.
	 */
	return os_atomic_dec(&MEXT_REF(m), acq_rel);
}
1078
/*
 * By default, mbuf_limit is enabled (per-class maximums are enforced).
 * Except when serverperfmode is set; it may also be overridden with the
 * "mbuf_limit" boot-arg (see mbinit()).
 */
static int mbuf_limit = 1;
1081
#if !CONFIG_MBUF_MCACHE
static
#endif
/*
 * Partition the global mbuf cluster map (nmbclusters, in 2KB units)
 * among the mbuf classes and initialize per-class statistics, sizes,
 * names and min/max limits, plus the legacy mbstat structure.
 */
void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	omb_stat = zalloc_permanent(OMB_STAT_SIZE(MC_MAX),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(MC_MAX),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = MC_MAX;
	for (m = 0; m < MC_MAX; m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo
	 * clusters; we do this only on platforms where jumbo
	 * cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);

	/*
	 * Each jumbo cluster takes 8 2KB clusters, so make
	 * sure that the pool size is evenly divisible by 8;
	 * njcl is in 2KB unit, hence treated as such.
	 */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

	/* Update nclusters with rounded down value of njcl */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size. On these platforms, the remaining is used for 2KB
	 * and 4KB clusters. On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
	 * cluster can either be split into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 * +---+---+------------ ... -----------+------- ... -------+
	 * | c | b |                      s     |        njcl       |
	 * +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	if (mbuf_limit) {
		m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
	} else {
		m_maxlimit(MC_CL) = INT_MAX;
	}
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	if (mbuf_limit) {
		m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
	} else {
		m_maxlimit(MC_BIGCL) = INT_MAX;
	}
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF) = s * NMBPCL; /* in mbuf unit */
	} else {
		m_maxlimit(MC_MBUF) = INT_MAX;
	}
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
	snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	} else {
		m_maxlimit(MC_MBUF_CL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	} else {
		m_maxlimit(MC_MBUF_BIGCL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
	} else {
		m_maxlimit(MC_16KCL) = INT_MAX;
	}
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	if (mbuf_limit) {
		m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	} else {
		m_maxlimit(MC_MBUF_16KCL) = INT_MAX;
	}
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1239
1240 #if !CONFIG_MBUF_MCACHE
1241 static
1242 #endif
1243 int
mbuf_get_class(struct mbuf * m)1244 mbuf_get_class(struct mbuf *m)
1245 {
1246 if (m->m_flags & M_EXT) {
1247 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
1248 m_ext_free_func_t m_free_func = m_get_ext_free(m);
1249
1250 if (m_free_func == NULL) {
1251 if (composite) {
1252 return MC_MBUF_CL;
1253 } else {
1254 return MC_CL;
1255 }
1256 } else if (m_free_func == m_bigfree) {
1257 if (composite) {
1258 return MC_MBUF_BIGCL;
1259 } else {
1260 return MC_BIGCL;
1261 }
1262 } else if (m_free_func == m_16kfree) {
1263 if (composite) {
1264 return MC_MBUF_16KCL;
1265 } else {
1266 return MC_16KCL;
1267 }
1268 }
1269 }
1270
1271 return MC_MBUF;
1272 }
1273
#if !CONFIG_MBUF_MCACHE
/*
 * Return true if the mbuf class that `m' belongs to is using more than
 * mb_memory_pressure_percentage percent of its configured maximum,
 * i.e. the class is considered under memory pressure.  Always false
 * when limits are disabled (mbuf_limit == 0).
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	struct zone_basic_stats stats = {};
	zone_ref_t zone;
	zone_id_t zid;
	int mclass;

	if (mbuf_limit == 0) {
		return false;
	}

	mclass = mbuf_get_class(m);

	/*
	 * Grab the statistics from zalloc.
	 * We can't call mbuf_stat_sync() since that requires a lock.
	 */
	zid = m_class_to_zid(m_class(mclass));
	zone = zone_by_id(zid);

	zone_get_stats(zone, &stats);
	/* in-use = available minus free; compare against percentage of the cap */
	if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		os_log(OS_LOG_DEFAULT,
		    "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
		    __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
		return true;
	}

	return false;
}
#endif /* !CONFIG_MBUF_MCACHE */
1307
#if defined(__LP64__)
/*
 * Lookup table mapping physical memory size to the default mbuf pool
 * size on 64-bit kernels; consumed by mbuf_default_ncl().  Terminated
 * by an all-zero sentinel entry.
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /*  1 GB */, (64 << MBSHIFT) /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (96 << MBSHIFT) /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
1323
1324 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)1325 mbuf_default_ncl(uint64_t mem)
1326 {
1327 #if !defined(__LP64__)
1328 unsigned int n;
1329 /*
1330 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1331 */
1332 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
1333 n = 32768;
1334 }
1335 #else
1336 unsigned int n, i;
1337 /*
1338 * 64-bit kernel (mbuf pool size based on table).
1339 */
1340 n = ncl_table[0].nt_mbpool;
1341 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
1342 if (mem < ncl_table[i].nt_maxmem) {
1343 break;
1344 }
1345 n = ncl_table[i].nt_mbpool;
1346 }
1347 n >>= MCLSHIFT;
1348 #endif /* !__LP64__ */
1349 return n;
1350 }
1351
#if !CONFIG_MBUF_MCACHE
/*
 * Bootstrap the mbuf allocator: verify that the public MBUF_* constants
 * match their kernel-private counterparts, size the cluster pools,
 * initialize the class table and per-class zones, derive sb_max, and
 * set up the watchdog thread calls.
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	static_assert(MBUF_EXT == M_EXT);
	static_assert(MBUF_PKTHDR == M_PKTHDR);
	static_assert(MBUF_EOR == M_EOR);
	static_assert(MBUF_LOOP == M_LOOP);
	static_assert(MBUF_BCAST == M_BCAST);
	static_assert(MBUF_MCAST == M_MCAST);
	static_assert(MBUF_FRAG == M_FRAG);
	static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	static_assert(MBUF_LASTFRAG == M_LASTFRAG);
	static_assert(MBUF_PROMISC == M_PROMISC);
	static_assert(MBUF_HASFCS == M_HASFCS);

	static_assert(MBUF_TYPE_FREE == MT_FREE);
	static_assert(MBUF_TYPE_DATA == MT_DATA);
	static_assert(MBUF_TYPE_HEADER == MT_HEADER);
	static_assert(MBUF_TYPE_SOCKET == MT_SOCKET);
	static_assert(MBUF_TYPE_PCB == MT_PCB);
	static_assert(MBUF_TYPE_RTABLE == MT_RTABLE);
	static_assert(MBUF_TYPE_HTABLE == MT_HTABLE);
	static_assert(MBUF_TYPE_ATABLE == MT_ATABLE);
	static_assert(MBUF_TYPE_SONAME == MT_SONAME);
	static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	static_assert(MBUF_TYPE_FTABLE == MT_FTABLE);
	static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	static_assert(MBUF_TYPE_IFADDR == MT_IFADDR);
	static_assert(MBUF_TYPE_CONTROL == MT_CONTROL);
	static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	static_assert(MBUF_CSUM_REQ_IP == CSUM_IP);
	static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	static_assert(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	static_assert(MBUF_WAITOK == M_WAIT);
	static_assert(MBUF_DONTWAIT == M_DONTWAIT);
	static_assert(MBUF_COPYALL == M_COPYALL);

	static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t)));

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Boot-arg override; serverperfmode always disables the limits. */
	PE_parse_boot_argn("mbuf_limit", &mbuf_limit, sizeof(mbuf_limit));
	if (serverperfmode) {
		mbuf_limit = 0;
	}

	/* Setup the mbuf table */
	mbuf_table_init();

	static_assert(sizeof(struct mbuf) == _MSIZE);

	/*
	 * We have yet to create the non composite zones
	 * and thus we haven't asked zalloc to allocate
	 * anything yet, which means that at this point
	 * m_total() is zero. Once we create the zones and
	 * raise the reserve, m_total() will be calculated,
	 * but until then just assume that we will have
	 * at least the minimum limit allocated.
	 */
	m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
	m_total(MC_CL) = m_minlimit(MC_CL);

	for (m = 0; m < MC_MAX; m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	/* Create the cache for each class */
	for (m = 0; m < MC_MAX; m++) {
		if (!MBUF_CLASS_COMPOSITE(m)) {
			zone_ref_t zone = zone_by_id(m_class_to_zid(m));

			if (mbuf_limit) {
				zone_set_exhaustible(zone, m_maxlimit(m), false);
			}
			zone_raise_reserve(zone, m_minlimit(m));
			/*
			 * Pretend that we have allocated m_total() items
			 * at this point. zalloc will eventually do that
			 * but it's an async operation.
			 */
			m_total(m) = m_minlimit(m);
		}
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* One-shot thread calls driven by the mbuf watchdog. */
	mbuf_defunct_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_defunct,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	mbuf_drain_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
1517
1518 static inline struct mbuf *
m_get_common(int wait,short type,int hdr)1519 m_get_common(int wait, short type, int hdr)
1520 {
1521 struct mbuf *m;
1522
1523 m = mz_alloc(wait);
1524 if (m != NULL) {
1525 mbuf_init(m, hdr, type);
1526 mtype_stat_inc(type);
1527 mtype_stat_dec(MT_FREE);
1528 }
1529 return m;
1530 }
1531 #endif /* !CONFIG_MBUF_MCACHE */
1532
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
#define _M_RETRY(wait, type)    _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type)     ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
1543
/* Allocate a plain mbuf of the given type; NULL on failure. */
struct mbuf *
m_get(int wait, int type)
{
	return _M_GET(wait, type);
}

/* Allocate an mbuf with a packet header; NULL on failure. */
struct mbuf *
m_gethdr(int wait, int type)
{
	return _M_GETHDR(wait, type);
}

/* Legacy "retry" entry point; identical to m_get() here. */
struct mbuf *
m_retry(int wait, int type)
{
	return _M_RETRY(wait, type);
}

/* Legacy "retry" entry point; identical to m_gethdr() here. */
struct mbuf *
m_retryhdr(int wait, int type)
{
	return _M_RETRYHDR(wait, type);
}
1567
1568 struct mbuf *
m_getclr(int wait,int type)1569 m_getclr(int wait, int type)
1570 {
1571 struct mbuf *m;
1572
1573 _MGET(m, wait, type);
1574 if (m != NULL) {
1575 bzero(mtod(m, caddr_t), MLEN);
1576 }
1577 return m;
1578 }
1579
#if !CONFIG_MBUF_MCACHE
static
#endif
/*
 * Drop one paired reference on a paired mbuf (EXTF_PAIRED).
 * Returns 1 when the pairing absorbed the free (caller must not touch
 * the cluster), or 0 when the unpair has occurred and the caller should
 * now drop the cluster reference that was held for the paired mbuf.
 */
int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	/* Full barrier before inspecting MEXT_PMBUF below. */
	os_atomic_thread_fence(seq_cst);
	if (MEXT_PMBUF(m) == m) {
		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
		if (prefcnt > 1) {
			/* Pair still has other holders; nothing more to do. */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired ref: invoke the external free routine. */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable. atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			os_atomic_store(&MEXT_PMBUF(m), (mbuf_ref_t)0, release);

			/*
			 * Re-point the ext free routine at the builtin
			 * handler matching the cluster size, now that the
			 * pairing (and its custom free) is gone.
			 */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
1649
#if !CONFIG_MBUF_MCACHE
/*
 * Free a single mbuf (and its external cluster, if this was the last
 * reference).  Returns the next mbuf in the chain.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* Paired mbufs may be fully absorbed by m_free_paired(). */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last ref on a standalone cluster: free it outright. */
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mz_ref_free(m_get_rfa(m));
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Reset to a clean composite element, keeping M_EXT. */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
			return n;
		}
	}

	mtype_stat_dec(m->m_type);
	mtype_stat_inc(MT_FREE);

	/* Scrub the header and return the mbuf itself to its zone. */
	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mz_free(m);

	return n;
}
1736
/*
 * Attach an externally-supplied buffer `extbuf' (freed via `extfree'
 * with `extarg') to `m', allocating a fresh mbuf if `m' is NULL.  Any
 * cluster already attached to `m' is released first.  When `pair' is
 * set, the mbuf and cluster are bound as a pair (EXTF_PAIRED).
 * Returns the mbuf, or NULL on allocation failure (in which case any
 * caller-supplied `m' has been freed).
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize),
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster. Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last ref on a standalone cluster: free it. */
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	if (rfa == NULL &&
	    (rfa = mz_ref_alloc(wait)) == NULL) {
		m_free(m);
		return NULL;
	}

	if (!pair) {
		mext_init(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		/* Paired: minref/prefcnt start at 1 and arg points back at m. */
		mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
1833
/*
 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
 * Returns an mbuf with a 2KB cluster attached (with a packet header iff
 * M_PKTHDR is set in `flags'), or NULL on failure.
 */
struct mbuf *
m_getcl(int wait, int type, int flags)
{
	struct mbuf *m = NULL;
	int hdr = (flags & M_PKTHDR);

	m = mz_composite_alloc(MC_MBUF_CL, wait);
	if (m != NULL) {
		u_int16_t flag;
		struct ext_ref *rfa;
		void *cl;

		/* The cached element must still be a pristine composite. */
		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);

		flag = MEXT_FLAGS(m);

		/* Re-initialize the header and re-attach the cluster. */
		mbuf_init(m, hdr, type);
		MBUF_CL_INIT(m, cl, rfa, 1, flag);

		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
	}
	return m;
}
1867
/*
 * m_mclget() add an mbuf cluster to a normal mbuf.
 * On allocation failure the mbuf is returned unchanged (callers must
 * check M_EXT / ext_buf to detect whether the attach succeeded).
 */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;
	char *bytes = NULL;

	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}

	if ((bytes = m_mclalloc(wait)) != NULL) {
		/*
		 * NOTE(review): ext_size is assigned before ext_buf;
		 * presumably required by -fbounds-safety (__sized_by
		 * pairing) — keep this order.
		 */
		m->m_ext.ext_size = MCLBYTES;
		m->m_ext.ext_buf = bytes;
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* Cluster allocation failed: release the ref structure. */
		m->m_ext.ext_size = 0;
		m->m_ext.ext_buf = NULL;
		mz_ref_free(rfa);
	}

	return m;
}
1891
/* Allocate an mbuf cluster (2KB); NULL on failure. */
char *
__sized_by_or_null(MCLBYTES)
m_mclalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
}

/* Free an mbuf cluster (2KB) back to its zone. */
void
m_mclfree(caddr_t p)
{
	mz_cl_free(ZONE_ID_CLUSTER_2K, p);
}
#endif /* !CONFIG_MBUF_MCACHE */
1907
1908 /*
1909 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
1910 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
1911 */
1912 int
m_mclhasreference(struct mbuf * m)1913 m_mclhasreference(struct mbuf *m)
1914 {
1915 if (!(m->m_flags & M_EXT)) {
1916 return 0;
1917 }
1918
1919 ASSERT(m_get_rfa(m) != NULL);
1920
1921 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
1922 }
1923
#if !CONFIG_MBUF_MCACHE
/* Allocate a 4KB cluster; NULL on failure. */
__private_extern__ char *
__sized_by_or_null(MBIGCLBYTES)
m_bigalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
}

/* Free a 4KB cluster (ext-free signature; size/arg unused). */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mz_cl_free(ZONE_ID_CLUSTER_4K, p);
}
1937
/*
 * m_mbigget() add an 4KB mbuf cluster to a normal mbuf.
 * On allocation failure the mbuf is returned unchanged (callers must
 * check M_EXT / ext_buf to detect whether the attach succeeded).
 */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;
	void *bytes = NULL;

	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}

	if ((bytes = m_bigalloc(wait)) != NULL) {
		/* ext_size before ext_buf; see note in m_mclget(). */
		m->m_ext.ext_size = MBIGCLBYTES;
		m->m_ext.ext_buf = bytes;
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* Cluster allocation failed: release the ref structure. */
		m->m_ext.ext_size = 0;
		m->m_ext.ext_buf = NULL;
		mz_ref_free(rfa);
	}

	return m;
}
1961
/* Allocate a 16KB jumbo cluster; NULL on failure. */
__private_extern__ char *
__sized_by_or_null(M16KCLBYTES)
m_16kalloc(int wait)
{
	return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
}

/* Free a 16KB jumbo cluster (ext-free signature; size/arg unused). */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mz_cl_free(ZONE_ID_CLUSTER_16K, p);
}
1974
/*
 * m_m16kget() add a 16KB mbuf cluster to a normal mbuf.
 * On allocation failure the mbuf is returned unchanged (callers must
 * check M_EXT / ext_buf to detect whether the attach succeeded).
 */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;
	void *bytes = NULL;

	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}

	if ((bytes = m_16kalloc(wait)) != NULL) {
		/* ext_size before ext_buf; see note in m_mclget(). */
		m->m_ext.ext_size = M16KCLBYTES;
		m->m_ext.ext_buf = bytes;
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* Cluster allocation failed: release the ref structure. */
		m->m_ext.ext_size = 0;
		m->m_ext.ext_buf = NULL;
		mz_ref_free(rfa);
	}

	return m;
}
#endif /* !CONFIG_MBUF_MCACHE */
1999
/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * Ownership of the packet tags moves with the struct copy; the source
 * header is purged so the tags are not reachable (or freed) twice.
 */
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
	VERIFY(from->m_flags & M_PKTHDR);

	if (to->m_flags & M_PKTHDR) {
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to);
	}
	to->m_pkthdr = from->m_pkthdr;          /* especially tags */
	m_classifier_init(from, 0);             /* purge classifier info */
	m_tag_init(from, 1);                    /* purge all tags from src */
	m_scratch_init(from);                   /* clear src scratch area */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* No external cluster: point data at the internal buffer. */
		to->m_data = (uintptr_t)to->m_pktdat;
	}
}
2022
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 * Returns the result of m_tag_copy_chain() (0 on allocation failure).
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	VERIFY(from->m_flags & M_PKTHDR);

	if (to->m_flags & M_PKTHDR) {
		/* We will be taking over the tags of 'to' */
		m_tag_delete_chain(to);
	}
	/* Keep 'to's M_EXT bit; take every copyable flag from 'from'. */
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0) {
		/* No external storage: point data at the internal buffer. */
		to->m_data = (uintptr_t)to->m_pktdat;
	}
	to->m_pkthdr = from->m_pkthdr;
	/* clear TX completion flag so the callback is not called in the copy */
	to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
	m_tag_init(to, 0);      /* preserve dst static tags */
	/* Deep-copy the tag chain from src into dst. */
	return m_tag_copy_chain(to, from, how);
}
2047
2048 void
m_copy_pftag(struct mbuf * to,struct mbuf * from)2049 m_copy_pftag(struct mbuf *to, struct mbuf *from)
2050 {
2051 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
2052 #if PF_ECN
2053 m_pftag(to)->pftag_hdr = NULL;
2054 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
2055 #endif /* PF_ECN */
2056 }
2057
/*
 * Copy the NECP metadata tag from one mbuf's pkthdr area to another's.
 */
void
m_copy_necptag(struct mbuf *to, struct mbuf *from)
{
	memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
}
2063
/*
 * Reset the classifier metadata in an mbuf's pkthdr.  pktf_mask selects
 * the pkt_flags bits the caller wants to keep; all other classifier
 * fields are cleared unless protected by a surviving flag below.
 */
void
m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_proto = 0;
	m->m_pkthdr.pkt_flowsrc = 0;
	m->m_pkthdr.pkt_flowid = 0;
	m->m_pkthdr.pkt_ext_flags = 0;
	m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
	/* preserve service class and interface info for loopback packets */
	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		(void) m_set_service_class(m, MBUF_SC_BE);
	}
	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
		m->m_pkthdr.pkt_ifainfo = 0;
	}
	/*
	 * Preserve timestamp if requested (PKTF_TS_VALID survived the mask)
	 */
	if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
		m->m_pkthdr.pkt_timestamp = 0;
	}
}
2088
/*
 * Copy the classifier metadata (flow info, flags, service class,
 * interface address info) from one mbuf's pkthdr to another's.
 * Both mbufs must have M_PKTHDR set.
 */
void
m_copy_classifier(struct mbuf *to, struct mbuf *from)
{
	VERIFY(to->m_flags & M_PKTHDR);
	VERIFY(from->m_flags & M_PKTHDR);

	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
	to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
	to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
	/* Use the setter so service-class bookkeeping stays consistent. */
	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
}
2104
2105 #if !CONFIG_MBUF_MCACHE
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
	mbuf_ref_t m = NULL;
	mbuf_ref_t *np, top;
	unsigned int pnum, needed = *num_needed;
	zstack_t mp_list = {};
	mbuf_class_t class = MC_MBUF_CL;
	u_int16_t flag;
	struct ext_ref *rfa;
	void *cl;

	/* Only the three fixed cluster sizes are supported. */
	ASSERT(bufsize == m_maxsize(MC_CL) ||
	    bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	top = NULL;
	np = &top;
	pnum = 0;

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (wait & Z_NOWAIT)) {
		wait &= ~Z_NOWAIT;
		wait |= Z_NOPAGEWAIT;
	}

	/* Allocate the composite mbuf + cluster elements from the cache */
	if (bufsize == m_maxsize(MC_CL)) {
		class = MC_MBUF_CL;
	} else if (bufsize == m_maxsize(MC_BIGCL)) {
		class = MC_MBUF_BIGCL;
	} else {
		class = MC_MBUF_16KCL;
	}
	mp_list = mz_composite_alloc_n(class, needed, wait);
	/* 'needed' now reflects how many elements were actually obtained. */
	needed = zstack_count(mp_list);

	for (pnum = 0; pnum < needed; pnum++) {
		m = zstack_pop(&mp_list);

		/* Composite elements arrive as free mbufs with M_EXT set. */
		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
		cl = m->m_ext.ext_buf;
		rfa = m_get_rfa(m);

		ASSERT(cl != NULL && rfa != NULL);
		VERIFY(MBUF_IS_COMPOSITE(m));

		flag = MEXT_FLAGS(m);

		/* Only the first num_with_pkthdrs mbufs get a pkthdr. */
		mbuf_init(m, num_with_pkthdrs, MT_DATA);
		if (bufsize == m_maxsize(MC_16KCL)) {
			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
		} else if (bufsize == m_maxsize(MC_BIGCL)) {
			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
		} else {
			MBUF_CL_INIT(m, cl, rfa, 1, flag);
		}

		if (num_with_pkthdrs > 0) {
			--num_with_pkthdrs;
		}

		*np = m;
		/*
		 * Chain packet headers via m_nextpkt; once the pkthdr quota
		 * is exhausted, the rest hang off the last header's m_next.
		 */
		if (num_with_pkthdrs > 0) {
			np = &m->m_nextpkt;
		} else {
			np = &m->m_next;
		}
	}
	ASSERT(pnum != *num_needed || zstack_empty(mp_list));
	if (!zstack_empty(mp_list)) {
		/* Return any unused elements to their zone. */
		mz_composite_free_n(class, mp_list);
	}
	if (pnum > 0) {
		/* Batched mbuf-type statistics update. */
		mtype_stat_add(MT_DATA, pnum);
		mtype_stat_sub(MT_FREE, pnum);
	}

	if (wantall && (pnum != *num_needed)) {
		/* All-or-nothing: discard the partial list. */
		if (top != NULL) {
			m_freem_list(top);
		}
		return NULL;
	}

	if (pnum > *num_needed) {
		printf("%s: File a radar related to <rdar://10146739>. \
		needed = %u, pnum = %u, num_needed = %u \n",
		    __func__, needed, pnum, *num_needed);
	}
	/* Report back how many packets were actually produced. */
	*num_needed = pnum;

	return top;
}
2214
2215 /*
2216 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
2217 * wantall is not set, return whatever number were available. The size of
2218 * each mbuf in the list is controlled by the parameter packetlen. Each
2219 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
2220 * in the chain is called a segment. If maxsegments is not null and the
2221 * value pointed to is not null, this specify the maximum number of segments
2222 * for a chain of mbufs. If maxsegments is zero or the value pointed to
2223 * is zero the caller does not have any restriction on the number of segments.
2224 * The actual number of segments of a mbuf chain is return in the value
2225 * pointed to by maxsegments.
2226 */
2227 __private_extern__ struct mbuf *
m_allocpacket_internal(unsigned int * numlist,size_t packetlen,unsigned int * maxsegments,int wait,int wantall,size_t wantsize)2228 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
2229 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
2230 {
2231 mbuf_ref_t *np, top, first = NULL;
2232 size_t bufsize, r_bufsize;
2233 unsigned int num = 0;
2234 unsigned int nsegs = 0;
2235 unsigned int needed = 0, resid;
2236 zstack_t mp_list = {}, rmp_list = {};
2237 mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
2238
2239 if (*numlist == 0) {
2240 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
2241 return NULL;
2242 }
2243
2244 top = NULL;
2245 np = ⊤
2246
2247 if (wantsize == 0) {
2248 if (packetlen <= MINCLSIZE) {
2249 bufsize = packetlen;
2250 } else if (packetlen > m_maxsize(MC_CL)) {
2251 /* Use 4KB if jumbo cluster pool isn't available */
2252 if (packetlen <= m_maxsize(MC_BIGCL)) {
2253 bufsize = m_maxsize(MC_BIGCL);
2254 } else {
2255 bufsize = m_maxsize(MC_16KCL);
2256 }
2257 } else {
2258 bufsize = m_maxsize(MC_CL);
2259 }
2260 } else if (wantsize == m_maxsize(MC_CL) ||
2261 wantsize == m_maxsize(MC_BIGCL) ||
2262 (wantsize == m_maxsize(MC_16KCL))) {
2263 bufsize = wantsize;
2264 } else {
2265 *numlist = 0;
2266 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
2267 return NULL;
2268 }
2269
2270 if (bufsize <= MHLEN) {
2271 nsegs = 1;
2272 } else if (bufsize <= MINCLSIZE) {
2273 if (maxsegments != NULL && *maxsegments == 1) {
2274 bufsize = m_maxsize(MC_CL);
2275 nsegs = 1;
2276 } else {
2277 nsegs = 2;
2278 }
2279 } else if (bufsize == m_maxsize(MC_16KCL)) {
2280 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
2281 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2282 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
2283 } else {
2284 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
2285 }
2286 if (maxsegments != NULL) {
2287 if (*maxsegments && nsegs > *maxsegments) {
2288 *maxsegments = nsegs;
2289 *numlist = 0;
2290 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
2291 return NULL;
2292 }
2293 *maxsegments = nsegs;
2294 }
2295
2296 /*
2297 * The caller doesn't want all the requested buffers; only some.
2298 * Try hard to get what we can, but don't block. This effectively
2299 * overrides MCR_SLEEP, since this thread will not go to sleep
2300 * if we can't get all the buffers.
2301 */
2302 if (!wantall || (wait & Z_NOWAIT)) {
2303 wait &= ~Z_NOWAIT;
2304 wait |= Z_NOPAGEWAIT;
2305 }
2306
2307 /*
2308 * Simple case where all elements in the lists/chains are mbufs.
2309 * Unless bufsize is greater than MHLEN, each segment chain is made
2310 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
2311 * of 2 mbufs; the second one is used for the residual data, i.e.
2312 * the remaining data that cannot fit into the first mbuf.
2313 */
2314 if (bufsize <= MINCLSIZE) {
2315 /* Allocate the elements in one shot from the mbuf cache */
2316 ASSERT(bufsize <= MHLEN || nsegs == 2);
2317 class = MC_MBUF;
2318 mp_list = mz_alloc_n((*numlist) * nsegs, wait);
2319 needed = zstack_count(mp_list);
2320
2321 /*
2322 * The number of elements must be even if we are to use an
2323 * mbuf (instead of a cluster) to store the residual data.
2324 * If we couldn't allocate the requested number of mbufs,
2325 * trim the number down (if it's odd) in order to avoid
2326 * creating a partial segment chain.
2327 */
2328 if (bufsize > MHLEN && (needed & 0x1)) {
2329 needed--;
2330 }
2331
2332 while (num < needed) {
2333 mbuf_ref_t m = NULL;
2334
2335 m = zstack_pop(&mp_list);
2336 ASSERT(m != NULL);
2337
2338 mbuf_init(m, 1, MT_DATA);
2339 num++;
2340 if (bufsize > MHLEN) {
2341 /* A second mbuf for this segment chain */
2342 m->m_next = zstack_pop(&mp_list);
2343
2344 ASSERT(m->m_next != NULL);
2345
2346 mbuf_init(m->m_next, 0, MT_DATA);
2347 num++;
2348 }
2349 *np = m;
2350 np = &m->m_nextpkt;
2351 }
2352 ASSERT(num != *numlist || zstack_empty(mp_list));
2353
2354 if (num > 0) {
2355 mtype_stat_add(MT_DATA, num);
2356 mtype_stat_sub(MT_FREE, num);
2357 }
2358 num /= nsegs;
2359
2360 /* We've got them all; return to caller */
2361 if (num == *numlist) {
2362 return top;
2363 }
2364
2365 goto fail;
2366 }
2367
2368 /*
2369 * Complex cases where elements are made up of one or more composite
2370 * mbufs + cluster, depending on packetlen. Each N-segment chain can
2371 * be illustrated as follows:
2372 *
2373 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
2374 *
2375 * Every composite mbuf + cluster element comes from the intermediate
2376 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
2377 * the last composite element will come from the MC_MBUF_CL cache,
2378 * unless the residual data is larger than 2KB where we use the
2379 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
2380 * data is defined as extra data beyond the first element that cannot
2381 * fit into the previous element, i.e. there is no residual data if
2382 * the chain only has 1 segment.
2383 */
2384 r_bufsize = bufsize;
2385 resid = packetlen > bufsize ? packetlen % bufsize : 0;
2386 if (resid > 0) {
2387 /* There is residual data; figure out the cluster size */
2388 if (wantsize == 0 && packetlen > MINCLSIZE) {
2389 /*
2390 * Caller didn't request that all of the segments
2391 * in the chain use the same cluster size; use the
2392 * smaller of the cluster sizes.
2393 */
2394 if (resid > m_maxsize(MC_BIGCL)) {
2395 r_bufsize = m_maxsize(MC_16KCL);
2396 } else if (resid > m_maxsize(MC_CL)) {
2397 r_bufsize = m_maxsize(MC_BIGCL);
2398 } else {
2399 r_bufsize = m_maxsize(MC_CL);
2400 }
2401 } else {
2402 /* Use the same cluster size as the other segments */
2403 resid = 0;
2404 }
2405 }
2406
2407 needed = *numlist;
2408 if (resid > 0) {
2409 /*
2410 * Attempt to allocate composite mbuf + cluster elements for
2411 * the residual data in each chain; record the number of such
2412 * elements that can be allocated so that we know how many
2413 * segment chains we can afford to create.
2414 */
2415 if (r_bufsize <= m_maxsize(MC_CL)) {
2416 rclass = MC_MBUF_CL;
2417 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
2418 rclass = MC_MBUF_BIGCL;
2419 } else {
2420 rclass = MC_MBUF_16KCL;
2421 }
2422 rmp_list = mz_composite_alloc_n(rclass, *numlist, wait);
2423 needed = zstack_count(rmp_list);
2424 if (needed == 0) {
2425 goto fail;
2426 }
2427
2428 /* This is temporarily reduced for calculation */
2429 ASSERT(nsegs > 1);
2430 nsegs--;
2431 }
2432
2433 /*
2434 * Attempt to allocate the rest of the composite mbuf + cluster
2435 * elements for the number of segment chains that we need.
2436 */
2437 if (bufsize <= m_maxsize(MC_CL)) {
2438 class = MC_MBUF_CL;
2439 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
2440 class = MC_MBUF_BIGCL;
2441 } else {
2442 class = MC_MBUF_16KCL;
2443 }
2444 mp_list = mz_composite_alloc_n(class, needed * nsegs, wait);
2445 needed = zstack_count(mp_list);
2446
2447 /* Round it down to avoid creating a partial segment chain */
2448 needed = (needed / nsegs) * nsegs;
2449 if (needed == 0) {
2450 goto fail;
2451 }
2452
2453 if (resid > 0) {
2454 /*
2455 * We're about to construct the chain(s); take into account
2456 * the number of segments we have created above to hold the
2457 * residual data for each chain, as well as restore the
2458 * original count of segments per chain.
2459 */
2460 ASSERT(nsegs > 0);
2461 needed += needed / nsegs;
2462 nsegs++;
2463 }
2464
2465 for (;;) {
2466 mbuf_ref_t m = NULL;
2467 u_int16_t flag;
2468 struct ext_ref *rfa;
2469 void *cl;
2470 int pkthdr;
2471 m_ext_free_func_t m_free_func;
2472
2473 ++num;
2474
2475 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
2476 m = zstack_pop(&mp_list);
2477 } else {
2478 m = zstack_pop(&rmp_list);
2479 }
2480 m_free_func = m_get_ext_free(m);
2481 ASSERT(m != NULL);
2482 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
2483 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
2484 m_free_func == m_16kfree);
2485
2486 cl = m->m_ext.ext_buf;
2487 rfa = m_get_rfa(m);
2488
2489 ASSERT(cl != NULL && rfa != NULL);
2490 VERIFY(MBUF_IS_COMPOSITE(m));
2491
2492 flag = MEXT_FLAGS(m);
2493
2494 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
2495 if (pkthdr) {
2496 first = m;
2497 }
2498 mbuf_init(m, pkthdr, MT_DATA);
2499 if (m_free_func == m_16kfree) {
2500 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
2501 } else if (m_free_func == m_bigfree) {
2502 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
2503 } else {
2504 MBUF_CL_INIT(m, cl, rfa, 1, flag);
2505 }
2506
2507 *np = m;
2508 if ((num % nsegs) == 0) {
2509 np = &first->m_nextpkt;
2510 } else {
2511 np = &m->m_next;
2512 }
2513
2514 if (num == needed) {
2515 break;
2516 }
2517 }
2518
2519 if (num > 0) {
2520 mtype_stat_add(MT_DATA, num);
2521 mtype_stat_sub(MT_FREE, num);
2522 }
2523
2524 num /= nsegs;
2525
2526 /* We've got them all; return to caller */
2527 if (num == *numlist) {
2528 ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
2529 return top;
2530 }
2531
2532 fail:
2533 /* Free up what's left of the above */
2534 if (!zstack_empty(mp_list)) {
2535 if (class == MC_MBUF) {
2536 /* No need to elide, these mbufs came from the cache. */
2537 mz_free_n(mp_list);
2538 } else {
2539 mz_composite_free_n(class, mp_list);
2540 }
2541 }
2542 if (!zstack_empty(rmp_list)) {
2543 mz_composite_free_n(rclass, rmp_list);
2544 }
2545 if (wantall && top != NULL) {
2546 m_freem_list(top);
2547 *numlist = 0;
2548 return NULL;
2549 }
2550 *numlist = num;
2551 return top;
2552 }
2553 #endif /* !CONFIG_MBUF_MCACHE */
2554
2555 /*
2556 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
2557 * packets on receive ring.
2558 */
2559 __private_extern__ struct mbuf *
m_getpacket_how(int wait)2560 m_getpacket_how(int wait)
2561 {
2562 unsigned int num_needed = 1;
2563
2564 return m_getpackets_internal(&num_needed, 1, wait, 1,
2565 m_maxsize(MC_CL));
2566 }
2567
2568 /*
2569 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
2570 * packets on receive ring.
2571 */
2572 struct mbuf *
m_getpacket(void)2573 m_getpacket(void)
2574 {
2575 unsigned int num_needed = 1;
2576
2577 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
2578 m_maxsize(MC_CL));
2579 }
2580
2581 /*
2582 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
2583 * if this can't be met, return whatever number were available. Set up the
2584 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
2585 * are chained on the m_nextpkt field. Any packets requested beyond this are
2586 * chained onto the last packet header's m_next field.
2587 */
2588 struct mbuf *
m_getpackets(int num_needed,int num_with_pkthdrs,int how)2589 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
2590 {
2591 unsigned int n = num_needed;
2592
2593 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
2594 m_maxsize(MC_CL));
2595 }
2596
2597 /*
2598 * Return a list of mbuf hdrs set up as packet hdrs chained together
2599 * on the m_nextpkt field
2600 */
2601 struct mbuf *
m_getpackethdrs(int num_needed,int how)2602 m_getpackethdrs(int num_needed, int how)
2603 {
2604 mbuf_ref_t m, *np, top;
2605
2606 top = NULL;
2607 np = ⊤
2608
2609 while (num_needed--) {
2610 m = _M_RETRYHDR(how, MT_DATA);
2611 if (m == NULL) {
2612 break;
2613 }
2614
2615 *np = m;
2616 np = &m->m_nextpkt;
2617 }
2618
2619 return top;
2620 }
2621
2622 #if !CONFIG_MBUF_MCACHE
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the
 * number of packets (m_nextpkt chains) freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
	struct mbuf *nextpkt;
	/* Per-zone batches: raw clusters, composite mbufs, and ext refs. */
	zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
	    m16k_list = {}, m_mcl_list = {},
	    m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
	int pktcount = 0;
	/* Batched mbuf-type statistics, applied once at the end. */
	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;

	/* Outer loop walks packets (m_nextpkt) ... */
	while (m != NULL) {
		pktcount++;

		nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/* ... inner loop walks the segments of one packet (m_next). */
		while (m != NULL) {
			struct mbuf *next = m->m_next;
			void *cl = NULL;
			if (m->m_type == MT_FREE) {
				panic("m_free: freeing an already freed mbuf");
			}

			if (m->m_flags & M_PKTHDR) {
				/* Free the aux data and tags if there is any */
				m_tag_delete_chain(m);
				m_do_tx_compl_callback(m, NULL);
			}

			if (!(m->m_flags & M_EXT)) {
				/* Plain mbuf; no cluster bookkeeping needed. */
				mt_free++;
				goto simple_free;
			}

			/* Paired mbufs may be reclaimed by their owner. */
			if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
				m = next;
				continue;
			}

			mt_free++;

			cl = m->m_ext.ext_buf;
			/*
			 * Make sure that we don't touch any ext_ref
			 * member after we decrement the reference count
			 * since that may lead to use-after-free
			 * when we do not hold the last reference.
			 */
			const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
			const m_ext_free_func_t m_free_func = m_get_ext_free(m);
			const uint16_t minref = MEXT_MINREF(m);
			const uint16_t refcnt = m_decref(m);
			if (refcnt == minref && !composite) {
				/*
				 * Last reference to a plain cluster: stage
				 * the cluster and its ext_ref for the
				 * batched zone frees below.
				 */
				if (m_free_func == NULL) {
					zstack_push(&mcl_list, cl);
				} else if (m_free_func == m_bigfree) {
					zstack_push(&mbc_list, cl);
				} else if (m_free_func == m_16kfree) {
					zstack_push(&m16k_list, cl);
				} else {
					/* Custom free routine; call it now. */
					(*(m_free_func))((caddr_t)cl,
					    m->m_ext.ext_size,
					    m_get_ext_arg(m));
				}
				zstack_push(&ref_list, m_get_rfa(m));
				m_set_ext(m, NULL, NULL, NULL);
			} else if (refcnt == minref && composite) {
				VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
				/*
				 * Amortize the costs of atomic operations
				 * by doing them at the end, if possible.
				 */
				if (m->m_type == MT_DATA) {
					mt_data++;
				} else if (m->m_type == MT_HEADER) {
					mt_header++;
				} else if (m->m_type == MT_SONAME) {
					mt_soname++;
				} else if (m->m_type == MT_TAG) {
					mt_tag++;
				} else {
					mtype_stat_dec(m->m_type);
				}

				/* Reset to the state the composite cache expects. */
				m->m_type = MT_FREE;
				m->m_flags = M_EXT;
				m->m_len = 0;
				m->m_next = m->m_nextpkt = NULL;

				/*
				 * MEXT_FLAGS is safe to access here
				 * since we are now sure that we held
				 * the last reference to ext_ref.
				 */
				MEXT_FLAGS(m) &= ~EXTF_READONLY;

				/* "Free" into the intermediate cache */
				if (m_free_func == NULL) {
					zstack_push(&m_mcl_list, m);
				} else if (m_free_func == m_bigfree) {
					zstack_push(&m_mbc_list, m);
				} else {
					VERIFY(m_free_func == m_16kfree);
					zstack_push(&m_m16k_list, m);
				}
				m = next;
				continue;
			}
simple_free:
			/*
			 * Amortize the costs of atomic operations
			 * by doing them at the end, if possible.
			 */
			if (m->m_type == MT_DATA) {
				mt_data++;
			} else if (m->m_type == MT_HEADER) {
				mt_header++;
			} else if (m->m_type == MT_SONAME) {
				mt_soname++;
			} else if (m->m_type == MT_TAG) {
				mt_tag++;
			} else if (m->m_type != MT_FREE) {
				mtype_stat_dec(m->m_type);
			}

			m->m_type = MT_FREE;
			m->m_flags = m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/* Scrub the mbuf before it goes back to the zone. */
			m_elide(m);
			zstack_push(&mp_list, m);

			m = next;
		}

		m = nextpkt;
	}

	/* Apply the batched statistics updates. */
	if (mt_free > 0) {
		mtype_stat_add(MT_FREE, mt_free);
	}
	if (mt_data > 0) {
		mtype_stat_sub(MT_DATA, mt_data);
	}
	if (mt_header > 0) {
		mtype_stat_sub(MT_HEADER, mt_header);
	}
	if (mt_soname > 0) {
		mtype_stat_sub(MT_SONAME, mt_soname);
	}
	if (mt_tag > 0) {
		mtype_stat_sub(MT_TAG, mt_tag);
	}
	/* Flush each batch to its backing zone in one call apiece. */
	if (!zstack_empty(mp_list)) {
		/* mbufs elided above. */
		mz_free_n(mp_list);
	}
	if (!zstack_empty(mcl_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
	}
	if (!zstack_empty(mbc_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
	}
	if (!zstack_empty(m16k_list)) {
		zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
	}
	if (!zstack_empty(m_mcl_list)) {
		mz_composite_free_n(MC_MBUF_CL, m_mcl_list);
	}
	if (!zstack_empty(m_mbc_list)) {
		mz_composite_free_n(MC_MBUF_BIGCL, m_mbc_list);
	}
	if (!zstack_empty(m_m16k_list)) {
		mz_composite_free_n(MC_MBUF_16KCL, m_m16k_list);
	}
	if (!zstack_empty(ref_list)) {
		zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
	}

	return pktcount;
}
2808 #endif /* !CONFIG_MBUF_MCACHE */
2809
/*
 * Wrapper around m_freem_list which captures the packets that are going
 * to be dropped.  If funcname is NULL, that means we do not want to store
 * both function name and line number, and only the drop reason will be
 * saved.  Make sure to pass the direction flag (DROPTAP_FLAG_DIR_OUT,
 * DROPTAP_FLAG_DIR_IN), or the packets will not be captured.
 */
void
m_drop_list(mbuf_t m_head, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
    uint16_t linenum)
{
	struct mbuf *m = m_head;
	struct mbuf *nextpkt;

	if (m_head == NULL) {
		return;
	}

	/* Fast path: no drop taps attached, just free the list. */
	if (__probable(droptap_total_tap_count == 0)) {
		m_freem_list(m_head);
		return;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		/* Record each packet of the list with the output droptap. */
		while (m != NULL) {
			uint16_t tmp_flags = flags;

			nextpkt = m->m_nextpkt;
			if (m->m_pkthdr.pkt_hdr == NULL) {
				tmp_flags |= DROPTAP_FLAG_L2_MISSING;
			}
			droptap_output_mbuf(m, reason, funcname, linenum, tmp_flags,
			    ifp);
			m = nextpkt;
		}
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		/* Record each packet of the list with the input droptap. */
		while (m != NULL) {
			char *frame_header __single;
			uint16_t tmp_flags = flags;

			nextpkt = m->m_nextpkt;

			/* Only pass a frame-header pointer when one exists. */
			if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
			    m->m_pkthdr.pkt_hdr != NULL) {
				frame_header = m->m_pkthdr.pkt_hdr;
			} else {
				frame_header = NULL;
				tmp_flags |= DROPTAP_FLAG_L2_MISSING;
			}

			droptap_input_mbuf(m, reason, funcname, linenum, tmp_flags,
			    m->m_pkthdr.rcvif, frame_header);
			m = nextpkt;
		}
	}
	/* The list is freed regardless of whether it was captured. */
	m_freem_list(m_head);
}
2867
2868 void
m_freem(struct mbuf * m)2869 m_freem(struct mbuf *m)
2870 {
2871 while (m != NULL) {
2872 m = m_free(m);
2873 }
2874 }
2875
/*
 * Wrapper around m_freem which captures the packet that's going to be
 * dropped.  If funcname is NULL, that means we do not want to store both
 * function name and line number, and only the drop reason will be saved.
 * Make sure to pass the direction flag (DROPTAP_FLAG_DIR_OUT,
 * DROPTAP_FLAG_DIR_IN), or the packet will not be captured.  The chain
 * is freed in all cases.
 */
static void
m_drop_common(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
    uint16_t linenum)
{
	if (flags & DROPTAP_FLAG_DIR_OUT) {
		droptap_output_mbuf(m, reason, funcname, linenum, flags, ifp);
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		char *frame_header __single;

		/* Only pass a frame-header pointer when one exists. */
		if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
		    m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
		} else {
			frame_header = NULL;
			flags |= DROPTAP_FLAG_L2_MISSING;
		}

		droptap_input_mbuf(m, reason, funcname, linenum, flags, ifp,
		    frame_header);
	}
	m_freem(m);
}
2905
2906 void
m_drop(mbuf_t m,uint16_t flags,uint32_t reason,const char * funcname,uint16_t linenum)2907 m_drop(mbuf_t m, uint16_t flags, uint32_t reason, const char *funcname,
2908 uint16_t linenum)
2909 {
2910 if (m == NULL) {
2911 return;
2912 }
2913
2914 if (__probable(droptap_total_tap_count == 0)) {
2915 m_freem(m);
2916 return;
2917 }
2918
2919 if (flags & DROPTAP_FLAG_DIR_OUT) {
2920 m_drop_common(m, NULL, flags, reason, funcname, linenum);
2921 } else if (flags & DROPTAP_FLAG_DIR_IN) {
2922 m_drop_common(m, m->m_pkthdr.rcvif, flags, reason, funcname, linenum);
2923 }
2924 }
2925
2926 void
m_drop_if(mbuf_t m,struct ifnet * ifp,uint16_t flags,uint32_t reason,const char * funcname,uint16_t linenum)2927 m_drop_if(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname,
2928 uint16_t linenum)
2929 {
2930 if (m == NULL) {
2931 return;
2932 }
2933
2934 if (__probable(droptap_total_tap_count == 0)) {
2935 m_freem(m);
2936 return;
2937 }
2938
2939 m_drop_common(m, ifp, flags, reason, funcname, linenum);
2940 }
2941
/*
 * Like m_drop(), but the caller supplies the interface and frame header
 * explicitly.  The chain is freed in all cases; it is only captured when
 * a direction flag is set and drop taps are attached.
 */
void
m_drop_extended(mbuf_t m, struct ifnet *ifp, char *frame_header,
    uint16_t flags, uint32_t reason, const char *funcname, uint16_t linenum)
{
	if (m == NULL) {
		return;
	}

	/* Fast path: no drop taps attached, just free the chain. */
	if (__probable(droptap_total_tap_count == 0)) {
		m_freem(m);
		return;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		droptap_output_mbuf(m, reason, funcname, linenum, flags,
		    ifp);
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		/* NOTE(review): ifp is unused on input; rcvif is reported. */
		droptap_input_mbuf(m, reason, funcname, linenum, flags,
		    m->m_pkthdr.rcvif, frame_header);
	}
	m_freem(m);
}
2964
/*
 * Mbuffer utility routines.
 */
/*
 * Set the m_data pointer of a newly allocated mbuf to place an object of the
 * specified size at the end of the mbuf, longword aligned.
 *
 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
 * separate macros, each asserting that it was called at the proper moment.
 * This required callers to themselves test the storage type and call the
 * right one.  Rather than require callers to be aware of those layout
 * decisions, we centralize here.
 */
void
m_align(struct mbuf *m, int len)
{
	int adjust = 0;

	/* At this point data must point to start */
	VERIFY(m->m_data == (uintptr_t)M_START(m));
	VERIFY(len >= 0);
	VERIFY(len <= M_SIZE(m));
	adjust = M_SIZE(m) - len;
	/* Round the adjustment down so m_data stays longword aligned. */
	m->m_data += adjust & ~(sizeof(long) - 1);
}
2990
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 * On allocation failure the ENTIRE original chain is freed and NULL is
 * returned.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return NULL;
	}
	if (m->m_flags & M_PKTHDR) {
		/* Move the packet header to the new head mbuf. */
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	/* Place the len-byte region at the end of the new mbuf, aligned. */
	if (m->m_flags & M_PKTHDR) {
		VERIFY(len <= MHLEN);
		MH_ALIGN(m, len);
	} else {
		VERIFY(len <= MLEN);
		M_ALIGN(m, len);
	}
	m->m_len = len;
	return m;
}
3021
3022 /*
3023 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3024 * chain, copy junk along, and adjust length.
3025 */
3026 struct mbuf *
m_prepend_2(struct mbuf * m,int len,int how,int align)3027 m_prepend_2(struct mbuf *m, int len, int how, int align)
3028 {
3029 if (M_LEADINGSPACE(m) >= len &&
3030 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
3031 m->m_data -= len;
3032 m->m_len += len;
3033 } else {
3034 m = m_prepend(m, len, how);
3035 }
3036 if ((m) && (m->m_flags & M_PKTHDR)) {
3037 m->m_pkthdr.len += len;
3038 }
3039 return m;
3040 }
3041
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 *
 * The last mbuf and offset accessed are passed in and adjusted on return to
 * avoid having to iterate over the entire mbuf chain each time.
 */
struct mbuf *
m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	mbuf_ref_t n, mhdr = NULL, *np, top;
	int off = off0, len = len0;
	int copyhdr = 0;

	if (off < 0 || len < 0) {
		panic("m_copym: invalid offset %d or len %d", off, len);
	}

	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));

	/* Decide whether a pkthdr is transferred to the new chain. */
	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
		mhdr = m;
		copyhdr = 1;
	}

	/*
	 * Resume from the cached position when it does not lie past
	 * the requested starting offset.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Skip whole mbufs that precede the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;

	while (len > 0) {
		if (m == NULL) {
			/* Running off the end is only legal for M_COPYALL. */
			if (len != M_COPYALL) {
				panic("m_copym: len != M_COPYALL");
			}
			break;
		}

		if (copyhdr) {
			n = _M_RETRYHDR(wait, m->m_type);
		} else {
			n = _M_RETRY(wait, m->m_type);
		}
		*np = n;

		if (n == NULL) {
			goto nospace;
		}

		if (copyhdr != 0) {
			/* Move or deep-copy the pkthdr per the mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, mhdr);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, mhdr, wait) == 0) {
					goto nospace;
				}
			}
			if (len == M_COPYALL) {
				n->m_pkthdr.len -= off0;
			} else {
				n->m_pkthdr.len = len;
			}
			copyhdr = 0;
			/*
			 * There is data to copy from the packet header mbuf
			 * if it is empty or it is before the starting offset
			 */
			if (mhdr != m) {
				np = &n->m_next;
				continue;
			}
		}
		n->m_len = MIN(len, (m->m_len - off));
		if (m->m_flags & M_EXT) {
			/* Share the cluster rather than copying the bytes. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/*
			 * Limit to the capacity of the destination
			 */
			n->m_len = MIN(n->m_len, M_SIZE(n));

			if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL) {
			len -= n->m_len;
		}

		/* On the final segment, refresh the caller's cached position. */
		if (len == 0) {
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	return top;
nospace:
	/* Allocation failed part way; discard the partial copy. */
	m_freem(top);

	return NULL;
}
3170
3171
3172 struct mbuf *
m_copym(struct mbuf * m,int off0,int len,int wait)3173 m_copym(struct mbuf *m, int off0, int len, int wait)
3174 {
3175 return m_copym_mode(m, off0, len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
3176 }
3177
3178 #if !CONFIG_MBUF_MCACHE
3179 /*
3180 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3181 * within this routine also.
3182 *
3183 * The last mbuf and offset accessed are passed in and adjusted on return to
3184 * avoid having to iterate over the entire mbuf chain each time.
3185 */
3186 struct mbuf *
m_copym_with_hdrs(struct mbuf * m0,int off0,int len0,int wait,struct mbuf ** m_lastm,int * m_off,uint32_t mode)3187 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
3188 struct mbuf **m_lastm, int *m_off, uint32_t mode)
3189 {
3190 mbuf_ref_t m = m0, n, *np = NULL, top = NULL;
3191 int off = off0, len = len0;
3192 zstack_t list = {};
3193 int copyhdr = 0;
3194 int type = 0;
3195 int needed = 0;
3196
3197 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3198 copyhdr = 1;
3199 }
3200
3201 if (m_lastm != NULL && *m_lastm != NULL) {
3202 if (off0 >= *m_off) {
3203 m = *m_lastm;
3204 off = off0 - *m_off;
3205 }
3206 }
3207
3208 while (off >= m->m_len) {
3209 off -= m->m_len;
3210 m = m->m_next;
3211 }
3212
3213 n = m;
3214 while (len > 0) {
3215 needed++;
3216 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
3217 n = n->m_next;
3218 }
3219 needed++;
3220 len = len0;
3221
3222 list = mz_alloc_n(needed, wait);
3223 if (zstack_count(list) != needed) {
3224 goto nospace;
3225 }
3226
3227 needed = 0;
3228 while (len > 0) {
3229 n = zstack_pop(&list);
3230 ASSERT(n != NULL && m != NULL);
3231
3232 type = (top == NULL) ? MT_HEADER : m->m_type;
3233 mbuf_init(n, (top == NULL), type);
3234
3235 if (top == NULL) {
3236 top = n;
3237 np = &top->m_next;
3238 continue;
3239 } else {
3240 needed++;
3241 *np = n;
3242 }
3243
3244 if (copyhdr) {
3245 if ((mode == M_COPYM_MOVE_HDR) ||
3246 (mode == M_COPYM_MUST_MOVE_HDR)) {
3247 M_COPY_PKTHDR(n, m);
3248 } else if ((mode == M_COPYM_COPY_HDR) ||
3249 (mode == M_COPYM_MUST_COPY_HDR)) {
3250 if (m_dup_pkthdr(n, m, wait) == 0) {
3251 m_elide(n);
3252 goto nospace;
3253 }
3254 }
3255 n->m_pkthdr.len = len;
3256 copyhdr = 0;
3257 }
3258 n->m_len = MIN(len, (m->m_len - off));
3259
3260 if (m->m_flags & M_EXT) {
3261 n->m_ext = m->m_ext;
3262 m_incref(m);
3263 n->m_data = m->m_data + off;
3264 n->m_flags |= M_EXT;
3265 } else {
3266 if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
3267 panic("%s n %p copy overflow",
3268 __func__, n);
3269 }
3270
3271 bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t),
3272 (unsigned)n->m_len);
3273 }
3274 len -= n->m_len;
3275
3276 if (len == 0) {
3277 if (m_lastm != NULL) {
3278 *m_lastm = m;
3279 *m_off = off0 + len0 - (off + n->m_len);
3280 }
3281 break;
3282 }
3283 off = 0;
3284 m = m->m_next;
3285 np = &n->m_next;
3286 }
3287
3288 mtype_stat_inc(MT_HEADER);
3289 mtype_stat_add(type, needed);
3290 mtype_stat_sub(MT_FREE, needed + 1);
3291
3292 ASSERT(zstack_empty(list));
3293
3294 return top;
3295
3296 nospace:
3297 if (!zstack_empty(list)) {
3298 /* No need to elide, these mbufs came from the cache. */
3299 mz_free_n(list);
3300 }
3301 if (top != NULL) {
3302 m_freem(top);
3303 }
3304 return NULL;
3305 }
3306 #endif /* !CONFIG_MBUF_MCACHE */
3307
3308 /*
3309 * Copy data from an mbuf chain starting "off" bytes from the beginning,
3310 * continuing for "len" bytes, into the indicated buffer.
3311 */
3312 void
m_copydata(struct mbuf * m,int off,int len0,void * vp __sized_by (len0))3313 m_copydata(struct mbuf *m, int off, int len0, void *vp __sized_by(len0))
3314 {
3315 int off0 = off, len = len0;
3316 struct mbuf *m0 = m;
3317 unsigned count;
3318 char *cp = vp;
3319
3320 if (__improbable(off < 0 || len < 0)) {
3321 panic("%s: invalid offset %d or len %d", __func__, off, len);
3322 /* NOTREACHED */
3323 }
3324
3325 while (off > 0) {
3326 if (__improbable(m == NULL)) {
3327 panic("%s: invalid mbuf chain %p [off %d, len %d]",
3328 __func__, m0, off0, len0);
3329 /* NOTREACHED */
3330 }
3331 if (off < m->m_len) {
3332 break;
3333 }
3334 off -= m->m_len;
3335 m = m->m_next;
3336 }
3337 while (len > 0) {
3338 if (__improbable(m == NULL)) {
3339 panic("%s: invalid mbuf chain %p [off %d, len %d]",
3340 __func__, m0, off0, len0);
3341 /* NOTREACHED */
3342 }
3343 count = MIN(m->m_len - off, len);
3344 bcopy(mtod(m, caddr_t) + off, cp, count);
3345 len -= count;
3346 cp += count;
3347 off = 0;
3348 m = m->m_next;
3349 }
3350 }
3351
3352 /*
3353 * Concatenate mbuf chain n to m. Both chains must be of the same type
3354 * (e.g. MT_DATA). Any m_pkthdr is not updated.
3355 */
3356 void
m_cat(struct mbuf * m,struct mbuf * n)3357 m_cat(struct mbuf *m, struct mbuf *n)
3358 {
3359 while (m->m_next) {
3360 m = m->m_next;
3361 }
3362 while (n) {
3363 if ((m->m_flags & M_EXT) ||
3364 m->m_data + m->m_len + n->m_len >= (uintptr_t)&m->m_dat[MLEN]) {
3365 /* just join the two chains */
3366 m->m_next = n;
3367 return;
3368 }
3369 /* splat the data from one into the other */
3370 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
3371 (u_int)n->m_len);
3372 m->m_len += n->m_len;
3373 n = m_free(n);
3374 }
3375 }
3376
/*
 * Trim "req_len" bytes from the mbuf chain "mp": from the head when
 * req_len is positive, from the tail when negative.  The packet header
 * length, if present, is adjusted to match.
 */
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL) {
		return;
	}
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				/* This mbuf is consumed entirely. */
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				/* Partial trim: advance the data pointer. */
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			/* "len" holds whatever could not be trimmed. */
			m->m_pkthdr.len -= (req_len - len);
		}
	} else {
		/*
		 * Trim from tail. Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return. Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL) {
				break;
			}
			m = m->m_next;
		}
		if (m->m_len >= len) {
			/* The last mbuf alone absorbs the whole trim. */
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len -= len;
			}
			return;
		}
		count -= len;
		if (count < 0) {
			count = 0;
		}
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len = count;
		}
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		/* Zero the length of every mbuf past the new end of data. */
		while ((m = m->m_next)) {
			m->m_len = 0;
		}
	}
}
3456
3457 /*
3458 * Rearange an mbuf chain so that len bytes are contiguous
3459 * and in the data area of an mbuf (so that mtod
3460 * will work for a structure of size len). Returns the resulting
3461 * mbuf chain on success, frees it and returns null on failure.
3462 * If there is room, it will add up to max_protohdr-len extra bytes to the
3463 * contiguous region in an attempt to avoid being called next time.
3464 */
3465 struct mbuf *
m_pullup(struct mbuf * n,int len)3466 m_pullup(struct mbuf *n, int len)
3467 {
3468 struct mbuf *m;
3469 int count;
3470 int space;
3471
3472 /* check invalid arguments */
3473 if (n == NULL) {
3474 panic("%s: n == NULL", __func__);
3475 }
3476 if (len < 0) {
3477 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
3478 __func__, len);
3479 goto bad;
3480 }
3481 if (len > MLEN) {
3482 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
3483 __func__, len);
3484 goto bad;
3485 }
3486 if ((n->m_flags & M_EXT) == 0 &&
3487 m_mtod_current(n) >= m_mtod_upper_bound(n)) {
3488 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
3489 __func__);
3490 goto bad;
3491 }
3492
3493 /*
3494 * If first mbuf has no cluster, and has room for len bytes
3495 * without shifting current data, pullup into it,
3496 * otherwise allocate a new mbuf to prepend to the chain.
3497 */
3498 if ((n->m_flags & M_EXT) == 0 &&
3499 len < m_mtod_upper_bound(n) - m_mtod_current(n) && n->m_next != NULL) {
3500 if (n->m_len >= len) {
3501 return n;
3502 }
3503 m = n;
3504 n = n->m_next;
3505 len -= m->m_len;
3506 } else {
3507 if (len > MHLEN) {
3508 goto bad;
3509 }
3510 _MGET(m, M_DONTWAIT, n->m_type);
3511 if (m == 0) {
3512 goto bad;
3513 }
3514 m->m_len = 0;
3515 if (n->m_flags & M_PKTHDR) {
3516 M_COPY_PKTHDR(m, n);
3517 n->m_flags &= ~M_PKTHDR;
3518 }
3519 }
3520 space = m_mtod_upper_bound(m) - m_mtod_end(m);
3521 do {
3522 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
3523 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
3524 (unsigned)count);
3525 len -= count;
3526 m->m_len += count;
3527 n->m_len -= count;
3528 space -= count;
3529 if (n->m_len != 0) {
3530 n->m_data += count;
3531 } else {
3532 n = m_free(n);
3533 }
3534 } while (len > 0 && n != NULL);
3535 if (len > 0) {
3536 (void) m_free(m);
3537 goto bad;
3538 }
3539 m->m_next = n;
3540 return m;
3541 bad:
3542 m_freem(n);
3543 return 0;
3544 }
3545
3546 /*
3547 * Like m_pullup(), except a new mbuf is always allocated, and we allow
3548 * the amount of empty space before the data in the new mbuf to be specified
3549 * (in the event that the caller expects to prepend later).
3550 */
3551 __private_extern__ struct mbuf *
m_copyup(struct mbuf * n,int len,int dstoff)3552 m_copyup(struct mbuf *n, int len, int dstoff)
3553 {
3554 struct mbuf *m;
3555 int count, space;
3556
3557 VERIFY(len >= 0 && dstoff >= 0);
3558
3559 if (len > (MHLEN - dstoff)) {
3560 goto bad;
3561 }
3562 MGET(m, M_DONTWAIT, n->m_type);
3563 if (m == NULL) {
3564 goto bad;
3565 }
3566 m->m_len = 0;
3567 if (n->m_flags & M_PKTHDR) {
3568 m_copy_pkthdr(m, n);
3569 n->m_flags &= ~M_PKTHDR;
3570 }
3571 m->m_data += dstoff;
3572 space = m_mtod_upper_bound(m) - m_mtod_end(m);
3573 do {
3574 count = min(min(max(len, max_protohdr), space), n->m_len);
3575 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
3576 (unsigned)count);
3577 len -= count;
3578 m->m_len += count;
3579 n->m_len -= count;
3580 space -= count;
3581 if (n->m_len) {
3582 n->m_data += count;
3583 } else {
3584 n = m_free(n);
3585 }
3586 } while (len > 0 && n);
3587 if (len > 0) {
3588 (void) m_free(m);
3589 goto bad;
3590 }
3591 m->m_next = n;
3592 return m;
3593 bad:
3594 m_freem(n);
3595
3596 return NULL;
3597 }
3598
3599 /*
3600 * Partition an mbuf chain in two pieces, returning the tail --
3601 * all but the first len0 bytes. In case of failure, it returns NULL and
3602 * attempts to restore the chain to its original state.
3603 */
3604 struct mbuf *
m_split(struct mbuf * m0,int len0,int wait)3605 m_split(struct mbuf *m0, int len0, int wait)
3606 {
3607 return m_split0(m0, len0, wait, 1);
3608 }
3609
/*
 * Worker for m_split(): split the chain at byte offset len0.  When
 * "copyhdr" is set and m0 carries a packet header, the tail chain gets
 * its own header with the packet lengths divided between both chains.
 */
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		/* Split the packet-header byte counts between both chains. */
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* Clean break on an mbuf boundary: no new mbuf needed. */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster; both chains reference the same data. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
3713
3714
3715 /*
3716 * Return the number of bytes in the mbuf chain, m.
3717 */
3718 unsigned int
m_length(struct mbuf * m)3719 m_length(struct mbuf *m)
3720 {
3721 struct mbuf *m0;
3722 unsigned int pktlen;
3723
3724 if (m->m_flags & M_PKTHDR) {
3725 return m->m_pkthdr.len;
3726 }
3727
3728 pktlen = 0;
3729 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
3730 pktlen += m0->m_len;
3731 }
3732 return pktlen;
3733 }
3734
3735 int
m_chain_capacity(const struct mbuf * m)3736 m_chain_capacity(const struct mbuf *m)
3737 {
3738 int rawlen = 0;
3739 while (m) {
3740 rawlen += m_capacity(m);
3741 m = m->m_next;
3742 }
3743
3744 return rawlen;
3745 }
3746
3747
3748 /*
3749 * Copy data from a buffer back into the indicated mbuf chain,
3750 * starting "off" bytes from the beginning, extending the mbuf
3751 * chain if necessary.
3752 */
3753 void
m_copyback(struct mbuf * m0,int off,int len,const void * cp __sized_by (len))3754 m_copyback(struct mbuf *m0, int off, int len, const void *cp __sized_by(len))
3755 {
3756 #if DEBUG
3757 struct mbuf *origm = m0;
3758 int error;
3759 #endif /* DEBUG */
3760
3761 if (m0 == NULL) {
3762 return;
3763 }
3764
3765 #if DEBUG
3766 error =
3767 #endif /* DEBUG */
3768 m_copyback0(&m0, off, len, cp,
3769 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
3770
3771 #if DEBUG
3772 if (error != 0 || (m0 != NULL && origm != m0)) {
3773 panic("m_copyback");
3774 }
3775 #endif /* DEBUG */
3776 }
3777
3778 struct mbuf *
m_copyback_cow(struct mbuf * m0,int off,int len,const void * cp __sized_by (len),int how)3779 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp __sized_by(len), int how)
3780 {
3781 int error;
3782
3783 /* don't support chain expansion */
3784 VERIFY(off + len <= m_length(m0));
3785
3786 error = m_copyback0(&m0, off, len, cp,
3787 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
3788 if (error) {
3789 /*
3790 * no way to recover from partial success.
3791 * just free the chain.
3792 */
3793 m_freem(m0);
3794 return NULL;
3795 }
3796 return m0;
3797 }
3798
3799 /*
3800 * m_makewritable: ensure the specified range writable.
3801 */
3802 int
m_makewritable(struct mbuf ** mp,int off,int len,int how)3803 m_makewritable(struct mbuf **mp, int off, int len, int how)
3804 {
3805 int error;
3806 #if DEBUG
3807 struct mbuf *n;
3808 int origlen, reslen;
3809
3810 origlen = m_length(*mp);
3811 #endif /* DEBUG */
3812
3813 error = m_copyback0(mp, off, len, NULL,
3814 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
3815
3816 #if DEBUG
3817 reslen = 0;
3818 for (n = *mp; n; n = n->m_next) {
3819 reslen += n->m_len;
3820 }
3821 if (origlen != reslen) {
3822 panic("m_makewritable: length changed");
3823 }
3824 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
3825 panic("m_makewritable: inconsist");
3826 }
3827 #endif /* DEBUG */
3828
3829 return error;
3830 }
3831
/*
 * Engine behind m_copyback(), m_copyback_cow() and m_makewritable().
 * Walks the chain at *mp0 starting "off" bytes in, for "len0" bytes:
 *   M_COPYBACK0_COPYBACK  copy bytes from "vp" into the chain
 *   M_COPYBACK0_PRESERVE  preserve existing data when replacing mbufs
 *   M_COPYBACK0_COW       replace shared (read-only) mbufs with writable ones
 *   M_COPYBACK0_EXTEND    grow the chain when it is too short
 * Returns 0 on success, ENOBUFS on allocation failure.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len0, const void *vp __sized_by_or_null(len0), int flags,
    int how)
{
	int mlen, len = len0, totlen = 0;
	mbuf_ref_t m, n, *mp;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* Walk to the mbuf containing "off", extending the chain as needed. */
	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				/* Zero-fill the gap up to the write offset. */
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf. copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				/* Keep the first "off" bytes of m in place. */
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* Consume the read-only mbufs now covered by "n". */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			/* Splice the writable replacement into the chain. */
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		/* The chain was extended: update the packet header length. */
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
4031
4032 #if !CONFIG_MBUF_MCACHE
4033 uint64_t
mcl_to_paddr(char * addr)4034 mcl_to_paddr(char *addr)
4035 {
4036 extern addr64_t kvtophys(vm_offset_t va);
4037
4038 return kvtophys((vm_offset_t)addr);
4039 }
4040 #endif /* !CONFIG_MBUF_MCACHE */
4041
4042 /*
4043 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4044 * And really copy the thing. That way, we don't "precompute" checksums
4045 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4046 * small packets, don't dup into a cluster. That way received packets
4047 * don't take up too much room in the sockbuf (cf. sbspace()).
4048 */
4049 struct mbuf *
m_dup(struct mbuf * m,int how)4050 m_dup(struct mbuf *m, int how)
4051 {
4052 mbuf_ref_t n, top, *np;
4053 int copyhdr = 0;
4054
4055 np = ⊤
4056 top = NULL;
4057 if (m->m_flags & M_PKTHDR) {
4058 copyhdr = 1;
4059 }
4060
4061 /*
4062 * Quick check: if we have one mbuf and its data fits in an
4063 * mbuf with packet header, just copy and go.
4064 */
4065 if (m->m_next == NULL) {
4066 /* Then just move the data into an mbuf and be done... */
4067 if (copyhdr) {
4068 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4069 if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
4070 return NULL;
4071 }
4072 n->m_len = m->m_len;
4073 m_dup_pkthdr(n, m, how);
4074 bcopy(mtod(m, caddr_t), mtod(n, caddr_t), m->m_len);
4075 return n;
4076 }
4077 } else if (m->m_len <= MLEN) {
4078 if ((n = _M_GET(how, m->m_type)) == NULL) {
4079 return NULL;
4080 }
4081 bcopy(mtod(m, caddr_t), mtod(n, caddr_t), m->m_len);
4082 n->m_len = m->m_len;
4083 return n;
4084 }
4085 }
4086 while (m != NULL) {
4087 #if BLUE_DEBUG
4088 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
4089 m->m_data);
4090 #endif
4091 if (copyhdr) {
4092 n = _M_GETHDR(how, m->m_type);
4093 } else {
4094 n = _M_GET(how, m->m_type);
4095 }
4096 if (n == NULL) {
4097 goto nospace;
4098 }
4099 if (m->m_flags & M_EXT) {
4100 if (m->m_len <= m_maxsize(MC_CL)) {
4101 MCLGET(n, how);
4102 } else if (m->m_len <= m_maxsize(MC_BIGCL)) {
4103 n = m_mbigget(n, how);
4104 } else if (m->m_len <= m_maxsize(MC_16KCL)) {
4105 n = m_m16kget(n, how);
4106 }
4107 if (!(n->m_flags & M_EXT)) {
4108 (void) m_free(n);
4109 goto nospace;
4110 }
4111 } else {
4112 VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
4113 (copyhdr == 0 && m->m_len <= MLEN));
4114 }
4115 *np = n;
4116 if (copyhdr) {
4117 /* Don't use M_COPY_PKTHDR: preserve m_data */
4118 m_dup_pkthdr(n, m, how);
4119 copyhdr = 0;
4120 if (!(n->m_flags & M_EXT)) {
4121 n->m_data = (uintptr_t)n->m_pktdat;
4122 }
4123 }
4124 n->m_len = m->m_len;
4125 /*
4126 * Get the dup on the same bdry as the original
4127 * Assume that the two mbufs have the same offset to data area
4128 * (up to word boundaries)
4129 */
4130 bcopy(mtod(m, caddr_t), mtod(n, caddr_t), (unsigned)n->m_len);
4131 m = m->m_next;
4132 np = &n->m_next;
4133 #if BLUE_DEBUG
4134 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
4135 n->m_data);
4136 #endif
4137 }
4138
4139 return top;
4140
4141 nospace:
4142 m_freem(top);
4143 return NULL;
4144 }
4145
/*
 * True when a cluster (M_EXT) mbuf's data spans more than one page:
 * either page-aligned data longer than one page, or unaligned data
 * that extends past the next page boundary.
 */
#define MBUF_MULTIPAGES(m) \
	(((m)->m_flags & M_EXT) && \
	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
	&& (m)->m_len > PAGE_SIZE) || \
	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4152
/*
 * Break the single multi-page mbuf "m" into a chain of mbufs whose data
 * each lie within one page, all sharing the original cluster.  On success
 * the chain is returned and *last points at its final mbuf; on allocation
 * failure the partial chain is freed and NULL is returned (*last = NULL).
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	mbuf_ref_t top = NULL, *nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		/* Carve off data up to the next page boundary. */
		data = data0;
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			/* The remainder fits within the current page. */
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = data;
		m->m_len = len;

		/* Append this piece and track it as the chain's last mbuf. */
		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		/* New mbuf references the same cluster at the next offset. */
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
4208
/*
 * Normalize a chain so that no mbuf's data crosses a page boundary,
 * expanding multi-page mbufs via m_expand().  Returns the normalized
 * chain, or NULL (with everything freed) on allocation failure.
 */
struct mbuf *
m_normalize(struct mbuf *m)
{
	mbuf_ref_t top = NULL, *nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		mbuf_ref_t n;

		/* Detach the current mbuf before examining it. */
		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			mbuf_ref_t last;
			if ((m = m_expand(m, &last)) == NULL) {
				/* Free both the rest and what was built. */
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	return top;
}
4241
4242 /*
4243 * Append the specified data to the indicated mbuf chain,
4244 * Extend the mbuf chain if the new data does not fit in
4245 * existing space.
4246 *
4247 * Return 1 if able to complete the job; otherwise 0.
4248 */
4249 int
m_append(struct mbuf * m0,int len0,caddr_t cp0 __sized_by (len0))4250 m_append(struct mbuf *m0, int len0, caddr_t cp0 __sized_by(len0))
4251 {
4252 struct mbuf *m, *n;
4253 int remainder, space, len = len0;
4254 caddr_t cp = cp0;
4255
4256 for (m = m0; m->m_next != NULL; m = m->m_next) {
4257 ;
4258 }
4259 remainder = len;
4260 space = M_TRAILINGSPACE(m);
4261 if (space > 0) {
4262 /*
4263 * Copy into available space.
4264 */
4265 if (space > remainder) {
4266 space = remainder;
4267 }
4268 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
4269 m->m_len += space;
4270 cp += space;
4271 remainder -= space;
4272 }
4273 while (remainder > 0) {
4274 /*
4275 * Allocate a new mbuf; could check space
4276 * and allocate a cluster instead.
4277 */
4278 n = m_get(M_WAITOK, m->m_type);
4279 if (n == NULL) {
4280 break;
4281 }
4282 n->m_len = min(MLEN, remainder);
4283 bcopy(cp, mtod(n, caddr_t), n->m_len);
4284 cp += n->m_len;
4285 remainder -= n->m_len;
4286 m->m_next = n;
4287 m = n;
4288 }
4289 if (m0->m_flags & M_PKTHDR) {
4290 m0->m_pkthdr.len += len - remainder;
4291 }
4292 return remainder == 0;
4293 }
4294
4295 struct mbuf *
m_last(struct mbuf * m)4296 m_last(struct mbuf *m)
4297 {
4298 while (m->m_next != NULL) {
4299 m = m->m_next;
4300 }
4301 return m;
4302 }
4303
4304 unsigned int
m_fixhdr(struct mbuf * m0)4305 m_fixhdr(struct mbuf *m0)
4306 {
4307 u_int len;
4308
4309 VERIFY(m0->m_flags & M_PKTHDR);
4310
4311 len = m_length2(m0, NULL);
4312 m0->m_pkthdr.len = len;
4313 return len;
4314 }
4315
4316 unsigned int
m_length2(struct mbuf * m0,struct mbuf ** last)4317 m_length2(struct mbuf *m0, struct mbuf **last)
4318 {
4319 struct mbuf *m;
4320 u_int len;
4321
4322 len = 0;
4323 for (m = m0; m != NULL; m = m->m_next) {
4324 len += m->m_len;
4325 if (m->m_next == NULL) {
4326 break;
4327 }
4328 }
4329 if (last != NULL) {
4330 *last = m;
4331 }
4332 return len;
4333 }
4334
4335 /*
4336 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
4337 * and clusters. If allocation fails and this cannot be completed, NULL will
4338 * be returned, but the passed in chain will be unchanged. Upon success,
4339 * the original chain will be freed, and the new chain will be returned.
4340 *
4341 * If a non-packet header is passed in, the original mbuf (chain?) will
4342 * be returned unharmed.
4343 *
4344 * If offset is specfied, the first mbuf in the chain will have a leading
4345 * space of the amount stated by the "off" parameter.
4346 *
4347 * This routine requires that the m_pkthdr.header field of the original
4348 * mbuf chain is cleared by the caller.
4349 */
4350 struct mbuf *
m_defrag_offset(struct mbuf * m0,u_int32_t off,int how)4351 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
4352 {
4353 struct mbuf *m_new = NULL, *m_final = NULL;
4354 int progress = 0, length, pktlen;
4355
4356 if (!(m0->m_flags & M_PKTHDR)) {
4357 return m0;
4358 }
4359
4360 VERIFY(off < MHLEN);
4361 m_fixhdr(m0); /* Needed sanity check */
4362
4363 pktlen = m0->m_pkthdr.len + off;
4364 if (pktlen > MHLEN) {
4365 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
4366 } else {
4367 m_final = m_gethdr(how, MT_DATA);
4368 }
4369
4370 if (m_final == NULL) {
4371 goto nospace;
4372 }
4373
4374 if (off > 0) {
4375 pktlen -= off;
4376 m_final->m_data += off;
4377 }
4378
4379 /*
4380 * Caller must have handled the contents pointed to by this
4381 * pointer before coming here, as otherwise it will point to
4382 * the original mbuf which will get freed upon success.
4383 */
4384 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
4385
4386 if (m_dup_pkthdr(m_final, m0, how) == 0) {
4387 goto nospace;
4388 }
4389
4390 m_new = m_final;
4391
4392 while (progress < pktlen) {
4393 length = pktlen - progress;
4394 if (length > MCLBYTES) {
4395 length = MCLBYTES;
4396 }
4397 length -= ((m_new == m_final) ? off : 0);
4398 if (length < 0) {
4399 goto nospace;
4400 }
4401
4402 if (m_new == NULL) {
4403 if (length > MLEN) {
4404 m_new = m_getcl(how, MT_DATA, 0);
4405 } else {
4406 m_new = m_get(how, MT_DATA);
4407 }
4408 if (m_new == NULL) {
4409 goto nospace;
4410 }
4411 }
4412
4413 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
4414 progress += length;
4415 m_new->m_len = length;
4416 if (m_new != m_final) {
4417 m_cat(m_final, m_new);
4418 }
4419 m_new = NULL;
4420 }
4421 m_freem(m0);
4422 m0 = m_final;
4423 return m0;
4424 nospace:
4425 if (m_final) {
4426 m_freem(m_final);
4427 }
4428 return NULL;
4429 }
4430
/* Defragment a chain with no leading space reserved (off == 0). */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
4436
4437 void
m_mchtype(struct mbuf * m,int t)4438 m_mchtype(struct mbuf *m, int t)
4439 {
4440 mtype_stat_inc(t);
4441 mtype_stat_dec(m->m_type);
4442 (m)->m_type = t;
4443 }
4444
/* Thin wrapper around m_mtod_current(); returns an unbounded data pointer. */
void *__unsafe_indexable
m_mtod(struct mbuf *m)
{
	return m_mtod_current(m);
}
4450
4451 /*
4452 * Return a pointer to mbuf/offset of location in mbuf chain.
4453 */
4454 struct mbuf *
m_getptr(struct mbuf * m,int loc,int * off)4455 m_getptr(struct mbuf *m, int loc, int *off)
4456 {
4457 while (loc >= 0) {
4458 /* Normal end of search. */
4459 if (m->m_len > loc) {
4460 *off = loc;
4461 return m;
4462 } else {
4463 loc -= m->m_len;
4464 if (m->m_next == NULL) {
4465 if (loc == 0) {
4466 /* Point at the end of valid data. */
4467 *off = m->m_len;
4468 return m;
4469 }
4470 return NULL;
4471 }
4472 m = m->m_next;
4473 }
4474 }
4475 return NULL;
4476 }
4477
4478 static uint32_t
mbuf_watchdog_socket_space(struct socket * so)4479 mbuf_watchdog_socket_space(struct socket *so)
4480 {
4481 uint32_t space = 0;
4482
4483 if (so == NULL) {
4484 return 0;
4485 }
4486
4487 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
4488
4489 #if INET
4490 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4491 SOCK_PROTO(so) == IPPROTO_TCP) {
4492 space += tcp_reass_qlen_space(so);
4493 }
4494 #endif /* INET */
4495
4496 return space;
4497 }
4498
/*
 * Accumulator passed through proc_iterate() to find the process whose
 * sockets hold the most mbuf space.
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;           /* largest consumer found so far */
	uint32_t top_app_space_used;    /* that process' mbuf space */
	bool non_blocking;              /* use trylock on fd tables */
};
4504
/*
 * Attempt to take a process' file-descriptor table lock without
 * blocking; returns whether the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
4510
/*
 * proc_iterate() callback: sum the mbuf space of every socket a process
 * has open and remember the process with the largest total in "args".
 */
#if !CONFIG_MBUF_MCACHE
static
#endif
int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			/* Skip this process rather than risk blocking. */
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		socket_ref_t so = NULL;

		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* New top consumer: swap the proc reference we hold. */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
4564
4565 extern char *proc_name_address(void *p);
4566
#if !CONFIG_MBUF_MCACHE
/*
 * Watchdog remedy: find the process whose sockets hold the most mbuf
 * space, defunct all of its sockets, then drain the mbuf caches and
 * zones to recover memory.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Never block on a socket lock from the watchdog path. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference taken by the iterate callback. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
		/* Return freed memory: drain composite caches, then zones. */
		zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
		zone_drain(zone_by_id(ZONE_ID_MBUF));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
		zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
	}
}
4620
/* Serializes updates to mbuf_exhausted_mask and the watchdog thread calls. */
static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
/* Bitmask (by mbuf class) of zones currently reported as exhausted. */
static uint32_t mbuf_exhausted_mask;

/* Classes whose exhaustion arms the delayed drain call. */
#define MBUF_EXHAUSTED_DRAIN_MASK (\
	(1u << MC_MBUF) | \
	(1u << MC_CL) | \
	(1u << MC_BIGCL) | \
	(1u << MC_16KCL))

/* Classes whose exhaustion arms the delayed socket-defunct call. */
#define MBUF_EXHAUSTED_DEFUNCT_MASK (\
	(1u << MC_MBUF) | \
	(1u << MC_MBUF_CL) | \
	(1u << MC_MBUF_BIGCL) | \
	(1u << MC_MBUF_16KCL))
4636
4637 static void
mbuf_watchdog_drain_composite(thread_call_param_t arg0,thread_call_param_t arg1)4638 mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
4639 {
4640 #pragma unused(arg0, arg1)
4641 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
4642 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
4643 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
4644 }
4645
/*
 * Record an mbuf class as exhausted and, on the first exhausted class
 * of each kind, arm the corresponding delayed watchdog thread call.
 * Called with mbuf_exhausted_lock held (see mbuf_zone_exhausted()).
 */
static void
mbuf_zone_exhausted_start(uint32_t bit)
{
	uint64_t deadline;
	uint32_t mask;

	mask = mbuf_exhausted_mask;
	mbuf_exhausted_mask = mask | bit;

	/* First drain-class exhaustion: schedule the delayed drain. */
	if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_drain_tcall, deadline);
	}

	/* First defunct-class exhaustion: schedule the delayed defunct. */
	if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
	}
}
4669
4670 static void
mbuf_zone_exhausted_end(uint32_t bit)4671 mbuf_zone_exhausted_end(uint32_t bit)
4672 {
4673 uint32_t mask;
4674
4675 mask = (mbuf_exhausted_mask &= ~bit);
4676
4677 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
4678 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
4679 thread_call_cancel(mbuf_drain_tcall);
4680 }
4681
4682 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
4683 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
4684 thread_call_cancel(mbuf_defunct_tcall);
4685 }
4686 }
4687
4688 static void
mbuf_zone_exhausted(zone_id_t zid,zone_t zone __unused,bool exhausted)4689 mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
4690 {
4691 uint32_t bit;
4692
4693 if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
4694 zid > m_class_to_zid(MBUF_CLASS_MAX)) {
4695 return;
4696 }
4697
4698 bit = 1u << m_class_from_zid(zid);
4699
4700 lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);
4701
4702 if (exhausted) {
4703 mbuf_zone_exhausted_start(bit);
4704 } else {
4705 mbuf_zone_exhausted_end(bit);
4706 }
4707
4708 lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
4709 }
4710 EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
4711 #endif /* !CONFIG_MBUF_MCACHE */
4712
4713 /*
4714 * Convert between a regular and a packet header mbuf. Caller is responsible
4715 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
4716 */
4717 int
m_reinit(struct mbuf * m,int hdr)4718 m_reinit(struct mbuf *m, int hdr)
4719 {
4720 int ret = 0;
4721
4722 if (hdr) {
4723 VERIFY(!(m->m_flags & M_PKTHDR));
4724 if (!(m->m_flags & M_EXT) &&
4725 (m->m_data != (uintptr_t)m->m_dat || m->m_len > 0)) {
4726 /*
4727 * If there's no external cluster attached and the
4728 * mbuf appears to contain user data, we cannot
4729 * safely convert this to a packet header mbuf,
4730 * as the packet header structure might overlap
4731 * with the data.
4732 */
4733 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
4734 "m_data %llx (expected %llx), "
4735 "m_len %d (expected 0)\n",
4736 __func__,
4737 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
4738 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
4739 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
4740 ret = EBUSY;
4741 } else {
4742 VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat);
4743 m->m_flags |= M_PKTHDR;
4744 mbuf_init_pkthdr(m);
4745 }
4746 } else {
4747 /* Free the aux data and tags if there is any */
4748 m_tag_delete_chain(m);
4749 m_do_tx_compl_callback(m, NULL);
4750 m->m_flags &= ~M_PKTHDR;
4751 }
4752
4753 return ret;
4754 }
4755
/*
 * Atomically compare-and-swap the external buffer's private property
 * word from "o" to "n"; returns the cmpxchg result (nonzero on success).
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
}
4762
/* Read the external buffer's private property word (requires M_EXT). */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
4769
int
m_ext_paired_is_active(struct mbuf *m)
{
	/* Non-paired mbufs are always considered active. */
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
4775
/*
 * Reinitialize an inactive paired mbuf for reuse, preserving its
 * external buffer attachment and bumping MEXT_PREF to mark activation.
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Save the external-buffer state that must survive mbuf_init(). */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time. Similar to
	 * what's done in m_clattach() for the cluster. Bump
	 * up MEXT_PREF to indicate activation.
	 */
	mbuf_init(m, hdr, type);
	mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
4808
#if !CONFIG_MBUF_MCACHE
/*
 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
 * xnu that intend on utilizing the module-private area should directly
 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
 * to handing it off to another module, respectively.
 */
uint32_t
m_scratch_get(struct mbuf *m, uint8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	/* Hand back the scratch area and report its size in bytes. */
	*p = (uint8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
#endif /* !CONFIG_MBUF_MCACHE */
4835
/* OR a crumb bit into the packet header's pkt_crumbs field. */
void
m_add_crumb(struct mbuf *m, uint16_t crumb)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_crumbs |= crumb;
}
4843
4844 void
m_add_hdr_crumb(struct mbuf * m,uint64_t crumb,uint64_t flag)4845 m_add_hdr_crumb(struct mbuf *m, uint64_t crumb, uint64_t flag)
4846 {
4847 #if defined(__arm64__)
4848 while (m != NULL) {
4849 m->m_mhdrcommon_crumbs &= ~flag;
4850 m->m_mhdrcommon_crumbs |= (crumb & flag);
4851 m = m->m_next;
4852 }
4853 #else
4854 #pragma unused(m, crumb, flag)
4855 #endif /*__arm64__*/
4856 }
4857
4858 void
m_add_hdr_crumb_chain(struct mbuf * head,uint64_t crumb,uint64_t flag)4859 m_add_hdr_crumb_chain(struct mbuf *head, uint64_t crumb, uint64_t flag)
4860 {
4861 #if defined(__arm64__)
4862 while (head) {
4863 /* This assumes that we might have a chain of mbuf chains */
4864 m_add_hdr_crumb(head, crumb, flag);
4865 head = head->m_nextpkt;
4866 }
4867 #else
4868 #pragma unused(head, crumb, flag)
4869 #endif /*__arm64__*/
4870 }
4871
SYSCTL_DECL(_kern_ipc);
/* kern.ipc.mbstat: read-only struct node served by mbstat_sysctl(). */
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
/* kern.ipc.mb_stat: read-only struct node served by mb_stat_sysctl(). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
/* kern.ipc.mb_memory_pressure_percentage: tunable memory-pressure threshold. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
    "Percentage of when we trigger memory-pressure for an mbuf-class");
4882