1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <dev/random/randomdev.h>
87
88 #include <kern/kern_types.h>
89 #include <kern/simple_lock.h>
90 #include <kern/queue.h>
91 #include <kern/sched_prim.h>
92 #include <kern/backtrace.h>
93 #include <kern/percpu.h>
94 #include <kern/zalloc.h>
95
96 #include <libkern/OSDebug.h>
97 #include <libkern/libkern.h>
98
99 #include <os/log.h>
100 #include <os/ptrtools.h>
101
102 #include <IOKit/IOMapper.h>
103
104 #include <machine/limits.h>
105 #include <machine/machine_routines.h>
106
107 #if CONFIG_MBUF_MCACHE
108 #include <sys/mcache.h>
109 #endif /* CONFIG_MBUF_MCACHE */
110 #include <net/ntstat.h>
111
112 #if INET
113 extern int dump_tcp_reass_qlen(char *, int);
114 extern int tcp_reass_qlen_space(struct socket *);
115 #endif /* INET */
116
117 #if MPTCP
118 extern int dump_mptcp_reass_qlen(char *, int);
119 #endif /* MPTCP */
120
121
122 #if NETWORKING
123 extern int dlil_dump_top_if_qlen(char *, int);
124 #endif /* NETWORKING */
125
126 #if CONFIG_MBUF_MCACHE
127 /*
128 * MBUF IMPLEMENTATION NOTES.
129 *
130 * There is a total of 5 per-CPU caches:
131 *
132 * MC_MBUF:
133 * This is a cache of rudimentary objects of _MSIZE in size; each
134 * object represents an mbuf structure. This cache preserves only
135 * the m_type field of the mbuf during its transactions.
136 *
137 * MC_CL:
138 * This is a cache of rudimentary objects of MCLBYTES in size; each
139 * object represents a mcluster structure. This cache does not
140 * preserve the contents of the objects during its transactions.
141 *
142 * MC_BIGCL:
143 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
144 * object represents a mbigcluster structure. This cache does not
145 * preserve the contents of the objects during its transaction.
146 *
147 * MC_MBUF_CL:
148 * This is a cache of mbufs each having a cluster attached to it.
149 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
150 * fields of the mbuf related to the external cluster are preserved
151 * during transactions.
152 *
153 * MC_MBUF_BIGCL:
154 * This is a cache of mbufs each having a big cluster attached to it.
155 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
156 * fields of the mbuf related to the external cluster are preserved
157 * during transactions.
158 *
159 * OBJECT ALLOCATION:
160 *
161 * Allocation requests are handled first at the per-CPU (mcache) layer
162 * before falling back to the slab layer. Performance is optimal when
163 * the request is satisfied at the CPU layer because global data/lock
164 * never gets accessed. When the slab layer is entered for allocation,
165 * the slab freelist will be checked first for available objects before
166 * the VM backing store is invoked. Slab layer operations are serialized
167 * for all of the caches as the mbuf global lock is held most of the time.
168 * Allocation paths are different depending on the class of objects:
169 *
170 * a. Rudimentary object:
171 *
172 * { m_get_common(), m_clattach(), m_mclget(),
173 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
174 * composite object allocation }
175 * | ^
176 * | |
177 * | +-----------------------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_slab_alloc() |
186 * | |
187 * v |
188 * +---------> [freelist] -------> (found?) -------+
189 * | |
190 * | v
191 * | m_clalloc()
192 * | |
193 * | v
194 * +---<<---- kmem_mb_alloc()
195 *
196 * b. Composite object:
197 *
198 * { m_getpackets_internal(), m_allocpacket_internal() }
199 * | ^
200 * | |
201 * | +------ (done) ---------+
202 * v |
203 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
204 * | ^
205 * v |
206 * [CPU cache] -------> (found?) -------+
207 * | |
208 * v |
209 * mbuf_cslab_alloc() |
210 * | |
211 * v |
212 * [freelist] -------> (found?) -------+
213 * | |
214 * v |
215 * (rudimentary object) |
216 * mcache_alloc/mcache_alloc_ext() ------>>-----+
217 *
218 * Auditing notes: If auditing is enabled, buffers will be subjected to
219 * integrity checks by the audit routine. This is done by verifying their
220 * contents against DEADBEEF (free) pattern before returning them to caller.
221 * As part of this step, the routine will also record the transaction and
222 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
223 * also restore any constructed data structure fields if necessary.
224 *
225 * OBJECT DEALLOCATION:
226 *
227 * Freeing an object simply involves placing it into the CPU cache; this
228 * pollutes the cache to benefit subsequent allocations. The slab layer
229 * will only be entered if the object is to be purged out of the cache.
230 * During normal operations, this happens only when the CPU layer resizes
231 * its bucket while it's adjusting to the allocation load. Deallocation
232 * paths are different depending on the class of objects:
233 *
234 * a. Rudimentary object:
235 *
236 * { m_free(), m_freem_list(), composite object deallocation }
237 * | ^
238 * | |
239 * | +------ (done) ---------+
240 * v |
241 * mcache_free/mcache_free_ext() |
242 * | |
243 * v |
244 * mbuf_slab_audit() |
245 * | |
246 * v |
247 * [CPU cache] ---> (not purging?) -----+
248 * | |
249 * v |
250 * mbuf_slab_free() |
251 * | |
252 * v |
253 * [freelist] ----------->>------------+
254 * (objects get purged to VM only on demand)
255 *
256 * b. Composite object:
257 *
258 * { m_free(), m_freem_list() }
259 * | ^
260 * | |
261 * | +------ (done) ---------+
262 * v |
263 * mcache_free/mcache_free_ext() |
264 * | |
265 * v |
266 * mbuf_cslab_audit() |
267 * | |
268 * v |
269 * [CPU cache] ---> (not purging?) -----+
270 * | |
271 * v |
272 * mbuf_cslab_free() |
273 * | |
274 * v |
275 * [freelist] ---> (not purging?) -----+
276 * | |
277 * v |
278 * (rudimentary object) |
279 * mcache_free/mcache_free_ext() ------->>------+
280 *
281 * Auditing notes: If auditing is enabled, the audit routine will save
282 * any constructed data structure fields (if necessary) before filling the
283 * contents of the buffers with DEADBEEF (free) pattern and recording the
284 * transaction. Buffers that are freed (whether at CPU or slab layer) are
285 * expected to contain the free pattern.
286 *
287 * DEBUGGING:
288 *
289 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
290 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
291 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
292 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
293 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
294 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
295 *
296 * Each object is associated with exactly one mcache_audit_t structure that
297 * contains the information related to its last buffer transaction. Given
298 * an address of an object, the audit structure can be retrieved by finding
299 * the position of the object relevant to the base address of the cluster:
300 *
301 * +------------+ +=============+
302 * | mbuf addr | | mclaudit[i] |
303 * +------------+ +=============+
304 * | | cl_audit[0] |
305 * i = MTOBG(addr) +-------------+
306 * | +-----> | cl_audit[1] | -----> mcache_audit_t
307 * b = BGTOM(i) | +-------------+
308 * | | | ... |
309 * x = MCLIDX(b, addr) | +-------------+
310 * | | | cl_audit[7] |
311 * +-----------------+ +-------------+
312 * (e.g. x == 1)
313 *
314 * The mclaudit[] array is allocated at initialization time, but its contents
315 * get populated when the corresponding cluster is created. Because a page
316 * can be turned into NMBPG number of mbufs, we preserve enough space for the
317 * mbufs so that there is a 1-to-1 mapping between them. A page that never
318 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
319 * remaining entries unused. For 16KB cluster, only one entry from the first
320 * page is allocated and used for the entire object.
321 */
322 #else
323 /*
324 * MBUF IMPLEMENTATION NOTES (using zalloc).
325 *
326 * There are a total of 4 zones and 3 zcaches.
327 *
328 * MC_MBUF:
329 * This is a zone of rudimentary objects of _MSIZE in size; each
330 * object represents an mbuf structure. This cache preserves only
331 * the m_type field of the mbuf during its transactions.
332 *
333 * MC_CL:
334 * This is a zone of rudimentary objects of MCLBYTES in size; each
335 * object represents a mcluster structure. This cache does not
336 * preserve the contents of the objects during its transactions.
337 *
338 * MC_BIGCL:
339 * This is a zone of rudimentary objects of MBIGCLBYTES in size; each
340 * object represents a mbigcluster structure. This cache does not
341 * preserve the contents of the objects during its transaction.
342 *
343 * MC_16KCL:
344 * This is a zone of rudimentary objects of M16KCLBYTES in size; each
345 * object represents a m16kcluster structure. This cache does not
346 * preserve the contents of the objects during its transaction.
347 *
348 * MC_MBUF_CL:
349 * This is a cache of mbufs each having a cluster attached to it.
350 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
351 * fields of the mbuf related to the external cluster are preserved
352 * during transactions.
353 *
354 * MC_MBUF_BIGCL:
355 * This is a cache of mbufs each having a big cluster attached to it.
356 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
357 * fields of the mbuf related to the external cluster are preserved
358 * during transactions.
359 *
360 * MC_MBUF_16KCL:
 * This is a cache of mbufs each having a jumbo (16KB) cluster attached to it.
362 * It is backed by MC_MBUF and MC_16KCL rudimentary caches. Several
363 * fields of the mbuf related to the external cluster are preserved
364 * during transactions.
365 *
366 * OBJECT ALLOCATION:
367 *
368 * Allocation requests are handled first at the zalloc per-CPU layer
369 * before falling back to the zalloc depot. Performance is optimal when
370 * the request is satisfied at the CPU layer. zalloc has an additional
371 * overflow layer called the depot, not pictured in the diagram below.
372 *
373 * Allocation paths are different depending on the class of objects:
374 *
375 * a. Rudimentary object:
376 *
377 * { m_get_common(), m_clattach(), m_mclget(),
378 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
379 * composite object allocation }
380 * | ^
381 * | |
382 * | +------- (done) --------+
383 * v |
384 * zalloc_flags/zalloc_n() KASAN
385 * | ^
386 * v |
387 * +----> [zalloc per-CPU cache] -----> (found?) --+
388 * | | |
389 * | v |
390 * | [zalloc recirculation layer] --> (found?) ---+
391 * | |
392 * | v
393 * +--<<-- [zone backing store]
394 *
395 * b. Composite object:
396 *
397 * { m_getpackets_internal(), m_allocpacket_internal() }
398 * | ^
399 * | |
400 * | +------ (done) ---------+
401 * v |
402 * mz_composite_alloc() KASAN
403 * | ^
404 * v |
405 * zcache_alloc_n() |
406 * | |
407 * v |
408 * [zalloc per-CPU cache] --> mark_valid() ---+
409 * | |
410 * v |
411 * [zalloc recirculation layer] -> mark_valid() -+
412 * | |
413 * v |
414 * mz_composite_build() |
415 * | |
416 * v |
417 * (rudimentary objects) |
418 * zalloc_id() ---------------->>-----+
419 *
420 * Auditing notes: If KASAN enabled, buffers will be subjected to
421 * integrity checks by the AddressSanitizer.
422 *
423 * OBJECT DEALLOCATION:
424 *
425 * Freeing an object simply involves placing it into the CPU cache; this
426 * pollutes the cache to benefit subsequent allocations. The depot
427 * will only be entered if the object is to be purged out of the cache.
428 * Objects may be purged based on the overall memory pressure or
429 * during zone garbage collection.
430 * To improve performance, objects are not zero-filled when freed
431 * as it's custom for other zalloc zones.
432 *
433 * Deallocation paths are different depending on the class of objects:
434 *
435 * a. Rudimentary object:
436 *
437 * { m_free(), m_freem_list(), composite object deallocation }
438 * | ^
439 * | |
440 * | +------ (done) ---------+
441 * v |
442 * zfree_nozero() |
443 * | |
444 * v |
445 * KASAN |
446 * | |
447 * v |
448 * [zalloc per-CPU cache] -> (not purging?) --+
449 * | |
450 * v |
451 * [zalloc recirculation layer] --->>----------+
452 *
453 *
454 * b. Composite object:
455 *
456 * { m_free(), m_freem_list() }
457 * | ^
458 * | |
459 * | +------ (done) ---------+
460 * v |
461 * mz_composite_free() |
462 * | |
463 * v |
464 * zcache_free_n() |
465 * | |
466 * v |
467 * KASAN |
468 * | |
469 * v |
470 * [zalloc per-CPU cache] -> mark_invalid() --+
471 * | |
472 * v |
473 * mz_composite_destroy() |
474 * | |
475 * v |
476 * (rudimentary object) |
477 * zfree_nozero() -------------->>------+
478 *
479 * Auditing notes: If KASAN enabled, buffers will be subjected to
480 * integrity checks by the AddressSanitizer.
481 *
482 * DEBUGGING:
483 *
484 * Debugging mbufs can be done by booting a KASAN enabled kernel.
485 */
486
487 #endif /* CONFIG_MBUF_MCACHE */
488
489 /* TODO: should be in header file */
/* kernel translator */
491 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
492 extern vm_map_t mb_map; /* special map */
493
494 #if CONFIG_MBUF_MCACHE
495 static uint32_t mb_kmem_contig_failed;
496 static uint32_t mb_kmem_failed;
497 static uint32_t mb_kmem_one_failed;
498 /* Timestamp of allocation failures. */
499 static uint64_t mb_kmem_contig_failed_ts;
500 static uint64_t mb_kmem_failed_ts;
501 static uint64_t mb_kmem_one_failed_ts;
502 static uint64_t mb_kmem_contig_failed_size;
503 static uint64_t mb_kmem_failed_size;
504 static uint32_t mb_kmem_stats[6];
505 #endif /* CONFIG_MBUF_MCACHE */
506
507 /* Global lock */
508 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
509 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
510 static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
511
#if CONFIG_MBUF_MCACHE
/* Back-end (common) layer */
/*
 * Counters tracking how often (and by how much) each rudimentary class
 * has been expanded.
 * NOTE(review): the update sites are outside this chunk (mbuf worker
 * paths, per the names) — confirm against the rest of the file.
 */
static uint64_t mb_expand_cnt;
static uint64_t mb_expand_cl_cnt;
static uint64_t mb_expand_cl_total;
static uint64_t mb_expand_bigcl_cnt;
static uint64_t mb_expand_bigcl_total;
static uint64_t mb_expand_16kcl_cnt;
static uint64_t mb_expand_16kcl_total;
static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
static uint32_t mbuf_worker_run_cnt;    /* # of times the worker has run */
static uint64_t mbuf_worker_last_runtime;       /* time of last worker run */
static uint64_t mbuf_drain_last_runtime;        /* time of last drain */
static int mbuf_worker_ready;   /* worker thread is runnable */
static unsigned int ncpu;               /* number of CPUs */
static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
unsigned int mbuf_debug;                /* patchable mbuf mcache flags */
#endif /* CONFIG_MBUF_MCACHE */
534 static unsigned int mb_normalized; /* number of packets "normalized" */
535
536 extern unsigned int mb_tag_mbuf;
537
538 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
539 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
540
/*
 * Buffer classes.  The first four are "rudimentary" classes (plain
 * buffers); the remaining three are "composite" classes, each pairing
 * an mbuf with a cluster.  The ordering is significant: every class
 * after MBUF_CLASS_LAST is treated as composite by
 * MBUF_CLASS_COMPOSITE() below.
 */
typedef enum {
	MC_MBUF = 0,    /* Regular mbuf */
	MC_CL,          /* Cluster */
	MC_BIGCL,       /* Large (4KB) cluster */
	MC_16KCL,       /* Jumbo (16KB) cluster */
	MC_MBUF_CL,     /* mbuf + cluster */
	MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define MBUF_CLASS_MIN          MC_MBUF
#define MBUF_CLASS_MAX          MC_MBUF_16KCL
/* Last rudimentary class; everything beyond it is composite. */
#define MBUF_CLASS_LAST         MC_16KCL
#define MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
558
559
560 /*
561 * mbuf specific mcache allocation request flags.
562 */
563 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
564
565 /*
566 * Per-cluster slab structure.
567 *
568 * A slab is a cluster control structure that contains one or more object
569 * chunks; the available chunks are chained in the slab's freelist (sl_head).
570 * Each time a chunk is taken out of the slab, the slab's reference count
571 * gets incremented. When all chunks have been taken out, the empty slab
572 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
573 * returned to a slab causes the slab's reference count to be decremented;
574 * it also causes the slab to be reinserted back to class's slab list, if
575 * it's not already done.
576 *
577 * Compartmentalizing of the object chunks into slabs allows us to easily
578 * merge one or more slabs together when the adjacent slabs are idle, as
579 * well as to convert or move a slab from one class to another; e.g. the
580 * mbuf cluster slab can be converted to a regular cluster slab when all
581 * mbufs in the slab have been freed.
582 *
583 * A slab may also span across multiple clusters for chunks larger than
584 * a cluster's size. In this case, only the slab of the first cluster is
585 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
586 * that they are part of the larger slab.
587 *
588 * Each slab controls a page of memory.
589 */
/*
 * Slab control structure; one per page of cluster memory (see the
 * "Per-cluster slab structure" block comment above for semantics).
 */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t sl_class;              /* controlling mbuf class */
	int8_t sl_refcnt;               /* outstanding allocations */
	int8_t sl_chunks;               /* chunks (bufs) in this slab */
	u_int16_t sl_flags;             /* slab flags (see below) */
	u_int16_t sl_len;               /* slab length */
	void *sl_base;                  /* base of allocated memory */
	void *sl_head;                  /* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
} mcl_slab_t;

/* Values for sl_flags */
#define SLF_MAPPED      0x0001  /* backed by a mapped page */
#define SLF_PARTIAL     0x0002  /* part of another slab */
#define SLF_DETACHED    0x0004  /* not in slab freelist */
605
606 /*
607 * The array of slabs are broken into groups of arrays per 1MB of kernel
608 * memory to reduce the footprint. Each group is allocated on demand
609 * whenever a new piece of memory mapped in from the VM crosses the 1MB
610 * boundary.
611 */
612 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
613
614 typedef struct mcl_slabg {
615 mcl_slab_t *slg_slab; /* group of slabs */
616 } mcl_slabg_t;
617
618 /*
619 * Number of slabs needed to control a 16KB cluster object.
620 */
621 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
622
623 #if CONFIG_MBUF_MCACHE
624 /*
625 * Per-cluster audit structure.
626 */
627 typedef struct {
628 mcache_audit_t **cl_audit; /* array of audits */
629 } mcl_audit_t;
630
631 typedef struct {
632 struct thread *msa_thread; /* thread doing transaction */
633 struct thread *msa_pthread; /* previous transaction thread */
634 uint32_t msa_tstamp; /* transaction timestamp (ms) */
635 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
636 uint16_t msa_depth; /* pc stack depth */
637 uint16_t msa_pdepth; /* previous transaction pc stack */
638 void *msa_stack[MCACHE_STACK_DEPTH];
639 void *msa_pstack[MCACHE_STACK_DEPTH];
640 } mcl_scratch_audit_t;
641
642 typedef struct {
643 /*
644 * Size of data from the beginning of an mbuf that covers m_hdr,
645 * pkthdr and m_ext structures. If auditing is enabled, we allocate
646 * a shadow mbuf structure of this size inside each audit structure,
647 * and the contents of the real mbuf gets copied into it when the mbuf
648 * is freed. This allows us to pattern-fill the mbuf for integrity
649 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
650 * cluster cache case). Note that we don't save the contents of
651 * clusters when they are freed; we simply pattern-fill them.
652 */
653 u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
654 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
655 } mcl_saved_contents_t;
656
657 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
658
659 #define MCA_SAVED_MBUF_PTR(_mca) \
660 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
661 (_mca)->mca_contents)->sc_mbuf)
662 #define MCA_SAVED_MBUF_SIZE \
663 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
664 #define MCA_SAVED_SCRATCH_PTR(_mca) \
665 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
666
667 /*
668 * mbuf specific mcache audit flags
669 */
670 #define MB_INUSE 0x01 /* object has not been returned to slab */
671 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
672 #define MB_SCVALID 0x04 /* object has valid saved contents */
673
674 /*
675 * Each of the following two arrays hold up to nmbclusters elements.
676 */
677 static mcl_audit_t *mclaudit; /* array of cluster audit information */
678 static unsigned int maxclaudit; /* max # of entries in audit table */
679 static mcl_slabg_t **slabstbl; /* cluster slabs table */
680 static unsigned int maxslabgrp; /* max # of entries in slabs table */
681 static unsigned int slabgrp; /* # of entries in slabs table */
682 #endif /* CONFIG_MBUF_MCACHE */
683
684 /* Globals */
685 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
686 int njcl; /* # of clusters for jumbo sizes */
687 int njclbytes; /* size of a jumbo cluster */
688 unsigned char *mbutl; /* first mapped cluster address */
689 unsigned char *embutl; /* ending virtual address of mclusters */
690 int max_linkhdr; /* largest link-level header */
691 int max_protohdr; /* largest protocol header */
692 int max_hdr; /* largest link+protocol header */
693 int max_datalen; /* MHLEN - max_hdr */
694
695 #if CONFIG_MBUF_MCACHE
696 static boolean_t mclverify; /* debug: pattern-checking */
697 static boolean_t mcltrace; /* debug: stack tracing */
698 static boolean_t mclfindleak; /* debug: leak detection */
699 static boolean_t mclexpleak; /* debug: expose leak info to user space */
700
701 static struct timeval mb_start; /* beginning of time */
702
703 /* mbuf leak detection variables */
704 static struct mleak_table mleak_table;
705 static mleak_stat_t *mleak_stat;
706
707 #define MLEAK_STAT_SIZE(n) \
708 __builtin_offsetof(mleak_stat_t, ml_trace[n])
709
710 struct mallocation {
711 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
712 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
713 u_int32_t count; /* How many objects were requested */
714 u_int64_t hitcount; /* for determining hash effectiveness */
715 };
716
717 struct mtrace {
718 u_int64_t collisions;
719 u_int64_t hitcount;
720 u_int64_t allocs;
721 u_int64_t depth;
722 uintptr_t addr[MLEAK_STACK_DEPTH];
723 };
724
725 /* Size must be a power of two for the zhash to be able to just mask off bits */
726 #define MLEAK_ALLOCATION_MAP_NUM 512
727 #define MLEAK_TRACE_MAP_NUM 256
728
729 /*
730 * Sample factor for how often to record a trace. This is overwritable
731 * by the boot-arg mleak_sample_factor.
732 */
733 #define MLEAK_SAMPLE_FACTOR 500
734
735 /*
736 * Number of top leakers recorded.
737 */
738 #define MLEAK_NUM_TRACES 5
739
740 #define MB_LEAK_SPACING_64 " "
741 #define MB_LEAK_SPACING_32 " "
742
743
744 #define MB_LEAK_HDR_32 "\n\
745 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
746 ---------- ---------- ---------- ---------- ---------- \n\
747 "
748
749 #define MB_LEAK_HDR_64 "\n\
750 trace [1] trace [2] trace [3] \
751 trace [4] trace [5] \n\
752 ------------------ ------------------ ------------------ \
753 ------------------ ------------------ \n\
754 "
755
756 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
757 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
758
759 /* Hashmaps of allocations and their corresponding traces */
760 static struct mallocation *mleak_allocations;
761 static struct mtrace *mleak_traces;
762 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
763
764 /* Lock to protect mleak tables from concurrent modification */
765 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
766 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
767 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
768
769 /* *Failed* large allocations. */
770 struct mtracelarge {
771 uint64_t size;
772 uint64_t depth;
773 uintptr_t addr[MLEAK_STACK_DEPTH];
774 };
775
776 #define MTRACELARGE_NUM_TRACES 5
777 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
778
779 static void mtracelarge_register(size_t size);
780 #endif /* CONFIG_MBUF_MCACHE */
781
782 /* Lock to protect the completion callback table */
783 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
784 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
785
786 extern u_int32_t high_sb_max;
787
788 /* The minimum number of objects that are allocated, to start. */
789 #define MINCL 32
790 #define MINBIGCL (MINCL >> 1)
791 #define MIN16KCL (MINCL >> 2)
792
793 /* Low watermarks (only map in pages once free counts go below) */
794 #define MBIGCL_LOWAT MINBIGCL
795 #define M16KCL_LOWAT MIN16KCL
796
/*
 * Per-class bookkeeping entry; mbuf_table[] below holds one of these
 * for each mbuf_class_t value.  Accessed via the m_*() macros below.
 */
typedef struct {
	mbuf_class_t mtbl_class;        /* class type */
#if CONFIG_MBUF_MCACHE
	mcache_t *mtbl_cache;           /* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t *mtbl_cobjlist;    /* composite objects freelist */
#endif /* CONFIG_MBUF_MCACHE */
	mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
	u_int32_t mtbl_maxsize;         /* maximum buffer size */
	int mtbl_minlimit;              /* minimum allowed */
	int mtbl_maxlimit;              /* maximum allowed */
	u_int32_t mtbl_wantpurge;       /* purge during next reclaim */
	uint32_t mtbl_avgtotal;         /* average total on iOS */
	u_int32_t mtbl_expand;          /* worker should expand the class */
} mbuf_table_t;
812
813 #define m_class(c) mbuf_table[c].mtbl_class
814 #if CONFIG_MBUF_MCACHE
815 #define m_cache(c) mbuf_table[c].mtbl_cache
816 #define m_slablist(c) mbuf_table[c].mtbl_slablist
817 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
818 #else
819 #define m_stats(c) mbuf_table[c].mtbl_stats
820 #endif /* CONFIG_MBUF_MCACHE */
821 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
822 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
823 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
824 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
825 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
826 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
827 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
828 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
829 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
830 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
831 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
832 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
833 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
834 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
835 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
836 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
837 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
838 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
839 #define m_region_expand(c) mbuf_table[c].mtbl_expand
840
static mbuf_table_t mbuf_table[] = {
#if CONFIG_MBUF_MCACHE
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	  NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	  NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	  NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	  NULL, NULL, 0, 0, 0, 0, 200, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches. Each object
	 * in the cache is an mbuf with a cluster attached to it. Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
#else
	/*
	 * In the zalloc configuration only the class identity is set
	 * statically; the remaining fields are filled in later.
	 * NOTE(review): initialization happens outside this chunk — confirm.
	 */
	{ .mtbl_class = MC_MBUF },
	{ .mtbl_class = MC_CL },
	{ .mtbl_class = MC_BIGCL },
	{ .mtbl_class = MC_16KCL },
	{ .mtbl_class = MC_MBUF_CL },
	{ .mtbl_class = MC_MBUF_BIGCL },
	{ .mtbl_class = MC_MBUF_16KCL },
#endif /* CONFIG_MBUF_MCACHE */
};
877
/* Number of elements in a static array. */
#define NELEM(a) (sizeof (a) / sizeof ((a)[0]))

#if SKYWALK && CONFIG_MBUF_MCACHE
#define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
/* See m_avgtotal(): divisor applied when the flowswitch netagent is on. */
static unsigned int mc_threshold_scale_down_factor =
    MC_THRESHOLD_SCALE_DOWN_FACTOR;
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
885
886 #if CONFIG_MBUF_MCACHE
887 static uint32_t
m_avgtotal(mbuf_class_t c)888 m_avgtotal(mbuf_class_t c)
889 {
890 #if SKYWALK
891 return if_is_fsw_transport_netagent_enabled() ?
892 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
893 mbuf_table[c].mtbl_avgtotal;
894 #else /* !SKYWALK */
895 return mbuf_table[c].mtbl_avgtotal;
896 #endif /* SKYWALK */
897 }
898 #endif /* CONFIG_MBUF_MCACHE */
899
900 #if CONFIG_MBUF_MCACHE
901 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
902 static int mb_waiters; /* number of waiters */
903 #endif /* CONFIG_MBUF_MCACHE */
904
905 boolean_t mb_peak_newreport = FALSE;
906 boolean_t mb_peak_firstreport = FALSE;
907
908 /* generate a report by default after 1 week of uptime */
909 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
910
911 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
912 #if CONFIG_MBUF_MCACHE
913 static struct timeval mb_wdtstart; /* watchdog start timestamp */
914 static char *mbuf_dump_buf;
915
916 #define MBUF_DUMP_BUF_SIZE 4096
917
918 /*
919 * mbuf watchdog is enabled by default. It is also toggeable via the
920 * kern.ipc.mb_watchdog sysctl.
921 * Garbage collection is enabled by default on embedded platforms.
922 * mb_drain_maxint controls the amount of time to wait (in seconds) before
923 * consecutive calls to mbuf_drain().
924 */
925 static unsigned int mb_watchdog = 1;
926 #if !XNU_TARGET_OS_OSX
927 static unsigned int mb_drain_maxint = 60;
928 #else /* XNU_TARGET_OS_OSX */
929 static unsigned int mb_drain_maxint = 0;
930 #endif /* XNU_TARGET_OS_OSX */
931 #endif /* CONFIG_MBUF_MCACHE */
932 static unsigned int mb_memory_pressure_percentage = 80;
933
934 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
935 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
936
937 /* Red zone */
938 static u_int32_t mb_redzone_cookie;
939 static void m_redzone_init(struct mbuf *);
940 static void m_redzone_verify(struct mbuf *m);
941
942 static void m_set_rfa(struct mbuf *, struct ext_ref *);
943
944 #if CONFIG_MBUF_MCACHE
945 /* The following are used to serialize m_clalloc() */
946 static boolean_t mb_clalloc_busy;
947 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
948 static int mb_clalloc_waiters;
949 #endif /* CONFIG_MBUF_MCACHE */
950
951 static void mbuf_mtypes_sync(boolean_t);
952 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
953 static void mbuf_stat_sync(void);
954 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
955 #if CONFIG_MBUF_MCACHE
956 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
957 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
958 static char *mbuf_dump(void);
959 #endif /* CONFIG_MBUF_MCACHE */
960 static void mbuf_table_init(void);
961 static inline void m_incref(struct mbuf *);
962 static inline u_int16_t m_decref(struct mbuf *);
963 static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
964 #if CONFIG_MBUF_MCACHE
965 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
966 static void mbuf_worker_thread_init(void);
967 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
968 static void slab_free(mbuf_class_t, mcache_obj_t *);
969 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
970 unsigned int, int);
971 static void mbuf_slab_free(void *, mcache_obj_t *, int);
972 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
973 static void mbuf_slab_notify(void *, u_int32_t);
974 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
975 unsigned int);
976 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
977 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
978 unsigned int, int);
979 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
980 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
981 static int freelist_populate(mbuf_class_t, unsigned int, int);
982 static void freelist_init(mbuf_class_t);
983 static boolean_t mbuf_cached_above(mbuf_class_t, int);
984 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
985 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
986 static int m_howmany(int, size_t);
987 static void mbuf_worker_thread(void);
988 static void mbuf_watchdog(void);
989 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
990
991 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
992 size_t, unsigned int);
993 static void mcl_audit_free(void *, unsigned int);
994 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
995 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
996 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
997 boolean_t);
998 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
999 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
1000 static void mcl_audit_scratch(mcache_audit_t *);
1001 static void mcl_audit_mcheck_panic(struct mbuf *);
1002 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
1003
1004 static void mleak_activate(void);
1005 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
1006 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
1007 static void mleak_free(mcache_obj_t *);
1008 static void mleak_sort_traces(void);
1009 static void mleak_update_stats(void);
1010
1011 static mcl_slab_t *slab_get(void *);
1012 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
1013 void *, void *, unsigned int, int, int);
1014 static void slab_insert(mcl_slab_t *, mbuf_class_t);
1015 static void slab_remove(mcl_slab_t *, mbuf_class_t);
1016 static boolean_t slab_inrange(mcl_slab_t *, void *);
1017 static void slab_nextptr_panic(mcl_slab_t *, void *);
1018 static void slab_detach(mcl_slab_t *);
1019 static boolean_t slab_is_detached(mcl_slab_t *);
1020 #else /* !CONFIG_MBUF_MCACHE */
1021 static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
1022 static struct mbuf *mz_alloc(zalloc_flags_t);
1023 static void mz_free(struct mbuf *);
1024 static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
1025 static void mz_ref_free(struct ext_ref *);
1026 static void *mz_cl_alloc(zone_id_t, zalloc_flags_t);
1027 static void mz_cl_free(zone_id_t, void *);
1028 static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
1029 static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
1030 static void mz_composite_free(mbuf_class_t, struct mbuf *);
1031 static void mz_composite_free_n(mbuf_class_t, zstack_t);
1032 static void *mz_composite_build(zone_id_t, zalloc_flags_t);
1033 static void *mz_composite_mark_valid(zone_id_t, void *);
1034 static void *mz_composite_mark_invalid(zone_id_t, void *);
1035 static void mz_composite_destroy(zone_id_t, void *);
1036
1037 ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
1038 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1039 ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
1040 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1041 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
1042 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1043 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
1044 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1045 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
1046 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1047 static_assert(sizeof(union mcluster) == MCLBYTES);
1048 static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
1049 static_assert(sizeof(union m16kcluster) == M16KCLBYTES);
1050
1051 static const struct zone_cache_ops mz_composite_ops = {
1052 .zc_op_alloc = mz_composite_build,
1053 .zc_op_mark_valid = mz_composite_mark_valid,
1054 .zc_op_mark_invalid = mz_composite_mark_invalid,
1055 .zc_op_free = mz_composite_destroy,
1056 };
1057 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
1058 sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
1059 &mz_composite_ops);
1060 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
1061 sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
1062 &mz_composite_ops);
1063 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
1064 sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
1065 &mz_composite_ops);
1066 static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
1067 static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
1068 static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
1069 static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
1070 static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
1071 static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
1072 static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
1073
/* Converts an mbuf class to a zalloc zone ID. */
__attribute__((always_inline))
static inline zone_id_t
m_class_to_zid(mbuf_class_t class)
{
	/* The static_asserts below guarantee zone IDs mirror class order. */
	return ZONE_ID_MBUF + class - MC_MBUF;
}
1081
/* Inverse of m_class_to_zid(): converts a zalloc zone ID to an mbuf class. */
__attribute__((always_inline))
static inline mbuf_class_t
m_class_from_zid(zone_id_t zid)
{
	return MC_MBUF + zid - ZONE_ID_MBUF;
}
1088
1089 static thread_call_t mbuf_defunct_tcall;
1090 static thread_call_t mbuf_drain_tcall;
1091 #endif /* CONFIG_MBUF_MCACHE */
1092
1093 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
1094 static struct mbuf *m_split0(struct mbuf *, int, int, int);
1095 __private_extern__ void mbuf_report_peak_usage(void);
1096 #if CONFIG_MBUF_MCACHE
1097 static boolean_t mbuf_report_usage(mbuf_class_t);
1098 #endif /* CONFIG_MBUF_MCACHE */
1099 #if CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT)
1100 #define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
1101 static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
1102 static char *mbwdog_logging;
1103 const unsigned mbwdog_logging_size = 4096;
1104 static size_t mbwdog_logging_used;
1105 #else
1106 #define mbwdog_logger(fmt, ...) do { } while (0)
#endif /* CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT) */
1108 #if CONFIG_MBUF_MCACHE
1109 static void mbuf_drain_locked(boolean_t);
1110 #endif /* CONFIG_MBUF_MCACHE */
1111
1112 /* flags for m_copyback0 */
1113 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
1114 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
1115 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
1116 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
1117
1118 /*
1119 * This flag is set for all mbufs that come out of and into the composite
1120 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
1121 * are marked with such a flag have clusters attached to them, and will be
1122 * treated differently when they are freed; instead of being placed back
1123 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
1124 * are placed back into the appropriate composite cache's freelist, and the
1125 * actual freeing is deferred until the composite objects are purged. At
1126 * such a time, this flag will be cleared from the mbufs and the objects
1127 * will be freed into their own separate freelists.
1128 */
1129 #define EXTF_COMPOSITE 0x1
1130
1131 /*
1132 * This flag indicates that the external cluster is read-only, i.e. it is
1133 * or was referred to by more than one mbufs. Once set, this flag is never
1134 * cleared.
1135 */
1136 #define EXTF_READONLY 0x2
1137 /*
1138 * This flag indicates that the external cluster is paired with the mbuf.
1139 * Pairing implies an external free routine defined which will be invoked
1140 * when the reference count drops to the minimum at m_free time. This
1141 * flag is never cleared.
1142 */
1143 #define EXTF_PAIRED 0x4
1144
1145 #define EXTF_MASK \
1146 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
1147
1148 #define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
1149 #define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
1150 #define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
1151 #define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
1152 #define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
1153 #define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
1154 #define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
1155 #define MBUF_IS_COMPOSITE(m) \
1156 (MEXT_REF(m) == MEXT_MINREF(m) && \
1157 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
1158 /*
1159 * This macro can be used to test if the mbuf is paired to an external
1160 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
1161 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
1162 * and thus survives calls to m_free_paired.
1163 */
1164 #define MBUF_IS_PAIRED(m) \
1165 (((m)->m_flags & M_EXT) && \
1166 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
1167 MEXT_PMBUF(m) == (m))
1168
1169 /*
1170 * Macros used to verify the integrity of the mbuf.
1171 */
#if CONFIG_MBUF_MCACHE
#define _MCHECK(m) { \
	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
		if (mclaudit == NULL) \
			panic("MCHECK: m_type=%d m=%p", \
			    (u_int16_t)(m)->m_type, m); \
		else \
			mcl_audit_mcheck_panic(m); \
	} \
}
#else
/*
 * Wrapped in an outer block so that `_MCHECK(m);` expands the same way
 * as the CONFIG_MBUF_MCACHE variant above, and so the bare `if' cannot
 * bind to an `else' at the expansion site (dangling-else hazard).
 */
#define _MCHECK(m) { \
	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
		panic("MCHECK: m_type=%d m=%p", \
		    (u_int16_t)(m)->m_type, m); \
	} \
}
#endif /* CONFIG_MBUF_MCACHE */
1189
1190 /*
1191 * Macro version of mtod.
1192 */
1193 #define MTOD(m, t) ((t)((m)->m_data))
1194
1195 #if CONFIG_MBUF_MCACHE
1196 #define MBUF_IN_MAP(addr) \
1197 ((unsigned char *)(addr) >= mbutl && \
1198 (unsigned char *)(addr) < embutl)
1199
1200 #define MRANGE(addr) { \
1201 if (!MBUF_IN_MAP(addr)) \
1202 panic("MRANGE: address out of range 0x%p", addr); \
1203 }
1204
1205 /*
1206 * Macros to obtain page index given a base cluster address
1207 */
1208 #define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
1209 #define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
1210
1211 /*
1212 * Macro to find the mbuf index relative to a base.
1213 */
1214 #define MBPAGEIDX(c, m) \
1215 (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)
1216
1217 /*
1218 * Same thing for 2KB cluster index.
1219 */
1220 #define CLPAGEIDX(c, m) \
1221 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
1222
1223 /*
1224 * Macro to find 4KB cluster index relative to a base
1225 */
1226 #define BCLPAGEIDX(c, m) \
1227 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
1228 #endif /* CONFIG_MBUF_MCACHE */
1229
1230 /*
1231 * Macros used during mbuf and cluster initialization.
1232 */
1233 #define MBUF_INIT_PKTHDR(m) { \
1234 (m)->m_pkthdr.rcvif = NULL; \
1235 (m)->m_pkthdr.pkt_hdr = NULL; \
1236 (m)->m_pkthdr.len = 0; \
1237 (m)->m_pkthdr.csum_flags = 0; \
1238 (m)->m_pkthdr.csum_data = 0; \
1239 (m)->m_pkthdr.vlan_tag = 0; \
1240 (m)->m_pkthdr.comp_gencnt = 0; \
1241 (m)->m_pkthdr.pkt_crumbs = 0; \
1242 m_classifier_init(m, 0); \
1243 m_tag_init(m, 1); \
1244 m_scratch_init(m); \
1245 m_redzone_init(m); \
1246 }
1247
1248 #define MBUF_INIT(m, pkthdr, type) { \
1249 _MCHECK(m); \
1250 (m)->m_next = (m)->m_nextpkt = NULL; \
1251 (m)->m_len = 0; \
1252 (m)->m_type = type; \
1253 if ((pkthdr) == 0) { \
1254 (m)->m_data = (m)->m_dat; \
1255 (m)->m_flags = 0; \
1256 } else { \
1257 (m)->m_data = (m)->m_pktdat; \
1258 (m)->m_flags = M_PKTHDR; \
1259 MBUF_INIT_PKTHDR(m); \
1260 } \
1261 }
1262
1263 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
1264 priv, pm) { \
1265 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
1266 (m)->m_flags |= M_EXT; \
1267 m_set_ext((m), (rfa), (free), (arg)); \
1268 (m)->m_ext.ext_size = (u_int)(size); \
1269 MEXT_MINREF(m) = (min); \
1270 MEXT_REF(m) = (ref); \
1271 MEXT_PREF(m) = (pref); \
1272 MEXT_FLAGS(m) = (flag); \
1273 MEXT_PRIV(m) = (priv); \
1274 MEXT_PMBUF(m) = (pm); \
1275 }
1276
1277 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
1278 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
1279 ref, 0, flag, 0, NULL)
1280
1281 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
1282 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
1283 ref, 0, flag, 0, NULL)
1284
1285 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
1286 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
1287 ref, 0, flag, 0, NULL)
1288
1289 /*
1290 * Macro to convert BSD malloc sleep flag to mcache's
1291 */
1292 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
1293
1294 /*
1295 * The structure that holds all mbuf class statistics exportable via sysctl.
1296 * Similar to mbstat structure, the mb_stat structure is protected by the
1297 * global mbuf lock. It contains additional information about the classes
1298 * that allows for a more accurate view of the state of the allocator.
1299 */
1300 struct mb_stat *mb_stat;
1301 struct omb_stat *omb_stat; /* For backwards compatibility */
1302
1303 #define MB_STAT_SIZE(n) \
1304 __builtin_offsetof(mb_stat_t, mbs_class[n])
1305 #define OMB_STAT_SIZE(n) \
1306 __builtin_offsetof(struct omb_stat, mbs_class[n])
1307
1308 /*
1309 * The legacy structure holding all of the mbuf allocation statistics.
1310 * The actual statistics used by the kernel are stored in the mbuf_table
1311 * instead, and are updated atomically while the global mbuf lock is held.
1312 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1313 * Unlike before, the kernel no longer relies on the contents of mbstat for
1314 * its operations (e.g. cluster expansion) because the structure is exposed
1315 * to outside and could possibly be modified, therefore making it unsafe.
1316 * With the exception of the mbstat.m_mtypes array (see below), all of the
1317 * statistics are updated as they change.
1318 */
1319 struct mbstat mbstat;
1320
1321 #define MBSTAT_MTYPES_MAX \
1322 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1323
1324 /*
1325 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1326 * atomically and stored in a per-CPU structure which is lock-free; this is
1327 * done in order to avoid writing to the global mbstat data structure which
1328 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1329 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1330 * array and returned to the application. Any updates for types greater or
1331 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1332 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1333 * anything beyond that (up to type 255) is considered a corner case.
1334 */
1335 typedef struct {
1336 unsigned int cpu_mtypes[MT_MAX];
1337 } mbuf_mtypes_t;
1338
1339 static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1340
1341 #define mtype_stat_add(type, n) { \
1342 if ((unsigned)(type) < MT_MAX) { \
1343 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1344 os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \
1345 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1346 os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \
1347 } \
1348 }
1349
1350 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1351 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1352 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
1353
/*
 * Converge the lock-free per-CPU mbuf type counters into the legacy
 * mbstat.m_mtypes[] array.
 *
 * @param locked	TRUE if the caller already holds mbuf_mlock;
 *			otherwise the lock is taken around the final store.
 */
static void
mbuf_mtypes_sync(boolean_t locked)
{
	mbuf_mtypes_t mtc;

	if (locked) {
		LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* Start from the master CPU's counters, then fold in each secondary. */
	mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
	percpu_foreach_secondary(mtype, mbuf_mtypes) {
		for (int n = 0; n < MT_MAX; n++) {
			mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
		}
	}

	/* Publish the converged totals under mbuf_mlock. */
	if (!locked) {
		lck_mtx_lock(mbuf_mlock);
	}
	for (int n = 0; n < MT_MAX; n++) {
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	}
	if (!locked) {
		lck_mtx_unlock(mbuf_mlock);
	}
}
1380
/*
 * sysctl handler for kern.ipc.mbstat: refreshes the legacy mbstat
 * structure and copies it out to the requesting process.
 */
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

#if CONFIG_MBUF_MCACHE
	mbuf_mtypes_sync(FALSE);
#else
	/* The zalloc path must also sync class stats into mbstat. */
	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();
	mbuf_mtypes_sync(TRUE);
	lck_mtx_unlock(mbuf_mlock);
#endif

	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
}
1397
1398 static void
mbuf_stat_sync(void)1399 mbuf_stat_sync(void)
1400 {
1401 mb_class_stat_t *sp;
1402 #if CONFIG_MBUF_MCACHE
1403 mcache_cpu_t *ccp;
1404 mcache_t *cp;
1405 int k, m, bktsize;
1406 #else
1407 int k;
1408 uint64_t drops = 0;
1409 #endif /* CONFIG_MBUF_MCACHE */
1410
1411
1412 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1413
1414 #if CONFIG_MBUF_MCACHE
1415 for (k = 0; k < NELEM(mbuf_table); k++) {
1416 cp = m_cache(k);
1417 ccp = &cp->mc_cpu[0];
1418 bktsize = ccp->cc_bktsize;
1419 sp = mbuf_table[k].mtbl_stats;
1420
1421 if (cp->mc_flags & MCF_NOCPUCACHE) {
1422 sp->mbcl_mc_state = MCS_DISABLED;
1423 } else if (cp->mc_purge_cnt > 0) {
1424 sp->mbcl_mc_state = MCS_PURGING;
1425 } else if (bktsize == 0) {
1426 sp->mbcl_mc_state = MCS_OFFLINE;
1427 } else {
1428 sp->mbcl_mc_state = MCS_ONLINE;
1429 }
1430
1431 sp->mbcl_mc_cached = 0;
1432 for (m = 0; m < ncpu; m++) {
1433 ccp = &cp->mc_cpu[m];
1434 if (ccp->cc_objs > 0) {
1435 sp->mbcl_mc_cached += ccp->cc_objs;
1436 }
1437 if (ccp->cc_pobjs > 0) {
1438 sp->mbcl_mc_cached += ccp->cc_pobjs;
1439 }
1440 }
1441 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1442 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1443 sp->mbcl_infree;
1444
1445 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1446 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1447 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1448
1449 /* Calculate total count specific to each class */
1450 sp->mbcl_ctotal = sp->mbcl_total;
1451 switch (m_class(k)) {
1452 case MC_MBUF:
1453 /* Deduct mbufs used in composite caches */
1454 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1455 m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL));
1456 break;
1457
1458 case MC_CL:
1459 /* Deduct clusters used in composite cache */
1460 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1461 break;
1462
1463 case MC_BIGCL:
1464 /* Deduct clusters used in composite cache */
1465 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1466 break;
1467
1468 case MC_16KCL:
1469 /* Deduct clusters used in composite cache */
1470 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1471 break;
1472
1473 default:
1474 break;
1475 }
1476 }
1477 #else
1478 for (k = 0; k < NELEM(mbuf_table); k++) {
1479 const zone_id_t zid = m_class_to_zid(m_class(k));
1480 const zone_t zone = zone_by_id(zid);
1481 struct zone_basic_stats stats = {};
1482
1483 sp = m_stats(k);
1484 zone_get_stats(zone, &stats);
1485 drops += stats.zbs_alloc_fail;
1486 sp->mbcl_total = stats.zbs_avail;
1487 sp->mbcl_active = stats.zbs_alloc;
1488 /*
1489 * infree is what mcache considers the freelist (uncached)
1490 * free_cnt contains all the cached/uncached elements
1491 * in a zone.
1492 */
1493 sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
1494 sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
1495 sp->mbcl_ctotal = sp->mbcl_total;
1496
1497 /* These stats are not available in zalloc. */
1498 sp->mbcl_alloc_cnt = 0;
1499 sp->mbcl_free_cnt = 0;
1500 sp->mbcl_notified = 0;
1501 sp->mbcl_purge_cnt = 0;
1502 sp->mbcl_slab_cnt = 0;
1503 sp->mbcl_release_cnt = 0;
1504
1505 /* zalloc caches are always on. */
1506 sp->mbcl_mc_state = MCS_ONLINE;
1507 sp->mbcl_mc_cached = stats.zbs_cached;
1508 /* These stats are not collected by zalloc. */
1509 sp->mbcl_mc_waiter_cnt = 0;
1510 sp->mbcl_mc_wretry_cnt = 0;
1511 sp->mbcl_mc_nwretry_cnt = 0;
1512 }
1513 /* Deduct clusters used in composite cache */
1514 m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
1515 m_total(MC_MBUF_BIGCL) -
1516 m_total(MC_MBUF_16KCL));
1517 m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
1518 m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
1519 m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
1520
1521 /* Update mbstat. */
1522 mbstat.m_mbufs = m_total(MC_MBUF);
1523 mbstat.m_clusters = m_total(MC_CL);
1524 mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
1525 mbstat.m_drops = drops;
1526 mbstat.m_bigclusters = m_total(MC_BIGCL);
1527 mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
1528 #endif /* CONFIG_MBUF_MCACHE */
1529 }
1530
/*
 * sysctl handler for the per-class mbuf statistics.  Snapshots the
 * statistics under mbuf_mlock and exports either the legacy layout
 * (omb_stat, for 32-bit requesters) or the native mb_stat layout.
 */
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		/* Convert each class record to the legacy layout. */
		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
			oc->mbcl_peak_reported = c->mbcl_peak_reported;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, statp, statsz);
}
1581
1582 #if !CONFIG_MBUF_MCACHE
1583 /*
1584 * The following functions are wrappers around mbuf
1585 * allocation for zalloc. They all have the prefix "mz"
1586 * which was chosen to avoid conflicts with the mbuf KPIs.
1587 *
1588 * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
1589 * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
1590 * fail immediately if it has to take a mutex and that
1591 * may cause packets to be dropped more frequently.
1592 * In general, the mbuf subsystem can sustain grabbing a mutex
1593 * during "non-blocking" allocation and that's the reason
1594 * why Z_NOPAGEWAIT was chosen.
1595 *
1596 * mbufs are elided (removed all pointers) before they are
1597 * returned to the cache. The exception are composite mbufs which
1598 * are re-initialized on allocation.
1599 */
__attribute__((always_inline))
static inline void
m_elide(struct mbuf *m)
{
	/* Scrub all pointers/metadata before the mbuf goes back to its zone. */
	m->m_next = m->m_nextpkt = NULL;
	m->m_data = NULL;
	memset(&m->m_ext, 0, sizeof(m->m_ext));
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.pkt_hdr = NULL;
	/*
	 * Temporarily mark the mbuf as carrying a packet header so the
	 * pkthdr-scrubbing helpers below can be applied; the flag is
	 * cleared again once the header fields have been reset.
	 */
	m->m_flags |= M_PKTHDR;
	m_tag_init(m, 1);
	m->m_pkthdr.pkt_flags = 0;
	m_scratch_init(m);
	m->m_pkthdr.redzone = 0;
	m->m_flags &= ~M_PKTHDR;
}
1616
/*
 * Allocate one raw mbuf from the mbuf zone.  Z_NOWAIT is translated to
 * Z_NOPAGEWAIT (see the block comment above); blocking callers get
 * Z_NOFAIL added.
 */
__attribute__((always_inline))
static inline struct mbuf *
mz_alloc(zalloc_flags_t flags)
{
	if (flags & Z_NOWAIT) {
		/* XOR clears Z_NOWAIT and toggles on Z_NOPAGEWAIT. */
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
}
1628
/*
 * Batched variant of mz_alloc(): allocate `count' raw mbufs from the
 * mbuf zone as a zstack, with the same flag translation.
 */
__attribute__((always_inline))
static inline zstack_t
mz_alloc_n(uint32_t count, zalloc_flags_t flags)
{
	if (flags & Z_NOWAIT) {
		/* XOR clears Z_NOWAIT and toggles on Z_NOPAGEWAIT. */
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
}
1640
/* Return one raw mbuf to its zone, scrubbing it first (see m_elide()). */
__attribute__((always_inline))
static inline void
mz_free(struct mbuf *m)
{
#if KASAN
	/* Verify the address really belongs to the mbuf zone. */
	zone_require(zone_by_id(ZONE_ID_MBUF), m);
#endif
	m_elide(m);
	zfree_nozero(ZONE_ID_MBUF, m);
}
1651
/* Batched variant of mz_free(); the mbufs must already be scrubbed. */
__attribute__((always_inline))
static inline void
mz_free_n(zstack_t list)
{
	/* Callers of this function have already elided the mbuf. */
	zfree_nozero_n(ZONE_ID_MBUF, list);
}
1659
/*
 * Allocate an external-reference descriptor (struct ext_ref).
 * Note: unlike mz_alloc(), blocking callers do NOT get Z_NOFAIL here.
 */
__attribute__((always_inline))
static inline struct ext_ref *
mz_ref_alloc(zalloc_flags_t flags)
{
	if (flags & Z_NOWAIT) {
		/* XOR clears Z_NOWAIT and toggles on Z_NOPAGEWAIT. */
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	}
	return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
}
1669
/*
 * Free an external-reference descriptor.  Only fully released refs
 * (refcnt back down to minref) may be returned to the zone.
 */
__attribute__((always_inline))
static inline void
mz_ref_free(struct ext_ref *rfa)
{
	VERIFY(rfa->minref == rfa->refcnt);
#if KASAN
	zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
#endif
	zfree_nozero(ZONE_ID_MBUF_REF, rfa);
}
1680
/*
 * Allocate one cluster from the given cluster zone (2K/4K/16K), with
 * the usual Z_NOWAIT -> Z_NOPAGEWAIT translation.
 */
__attribute__((always_inline))
static inline void *
mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
{
	if (flags & Z_NOWAIT) {
		/* XOR clears Z_NOWAIT and toggles on Z_NOPAGEWAIT. */
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	} else if (!(flags & Z_NOPAGEWAIT)) {
		flags |= Z_NOFAIL;
	}
	/* Parenthesized call invokes the function, not any macro wrapper. */
	return (zalloc_id)(zid, flags | Z_NOZZC);
}
1692
/* Return one cluster to its zone. */
__attribute__((always_inline))
static inline void
mz_cl_free(zone_id_t zid, void *cl)
{
#if KASAN
	/* Verify the address really belongs to the given cluster zone. */
	zone_require(zone_by_id(zid), cl);
#endif
	zfree_nozero(zid, cl);
}
1702
/*
 * Allocate `n' composite (mbuf + ext_ref + cluster) objects for the
 * given class from the corresponding zcache, as a zstack.
 */
__attribute__((always_inline))
static inline zstack_t
mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
{
	if (flags & Z_NOWAIT) {
		/* XOR clears Z_NOWAIT and toggles on Z_NOPAGEWAIT. */
		flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
	}
	return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
	    &mz_composite_ops);
}
1713
1714 __attribute__((always_inline))
1715 static inline struct mbuf *
mz_composite_alloc(mbuf_class_t class,zalloc_flags_t flags)1716 mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
1717 {
1718 zstack_t list = {};
1719 list = mz_composite_alloc_n(class, 1, flags);
1720 if (!zstack_empty(list)) {
1721 return zstack_pop(&list);
1722 } else {
1723 return NULL;
1724 }
1725 }
1726
/* Return a stack of composite objects to the class' zcache. */
__attribute__((always_inline))
static inline void
mz_composite_free_n(mbuf_class_t class, zstack_t list)
{
	(zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
}
1733
1734 __attribute__((always_inline))
1735 static inline void
mz_composite_free(mbuf_class_t class,struct mbuf * m)1736 mz_composite_free(mbuf_class_t class, struct mbuf *m)
1737 {
1738 zstack_t list = {};
1739 zstack_push(&list, m);
1740 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1741 }
1742
/* Converts composite zone ID to the cluster zone ID. */
__attribute__((always_inline))
static inline zone_id_t
mz_cl_zid(zone_id_t zid)
{
	/*
	 * The composite zone IDs and the raw cluster zone IDs share the
	 * same ordering, so a constant offset maps one range onto the
	 * other (e.g. MBUF_CLUSTER_2K -> CLUSTER_2K).
	 */
	return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
}
1750
/*
 * zcache build callback: construct one composite mbuf for the composite
 * zone zid by allocating its three pieces (cluster, ext_ref, mbuf) and
 * wiring them together with EXTF_COMPOSITE set.  Returns the assembled
 * mbuf, or NULL if any piece could not be allocated (all partially
 * acquired pieces are released on the unwind path).
 */
static void *
mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct mbuf *m = NULL;
	struct ext_ref *rfa = NULL;
	void *cl = NULL;

	/* Acquire the pieces; on failure unwind in reverse order. */
	cl = mz_cl_alloc(cl_zid, flags);
	if (__improbable(cl == NULL)) {
		goto out;
	}
	rfa = mz_ref_alloc(flags);
	if (__improbable(rfa == NULL)) {
		goto out_free_cl;
	}
	m = mz_alloc(flags);
	if (__improbable(m == NULL)) {
		goto out_free_rfa;
	}
	MBUF_INIT(m, 0, MT_FREE);
	/* Attach the cluster according to the composite class being built. */
	if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
		MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
		MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else {
		MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	}
	/* The result must look like a freshly built composite mbuf. */
	VERIFY(m->m_flags == M_EXT);
	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

	return m;
out_free_rfa:
	mz_ref_free(rfa);
out_free_cl:
	mz_cl_free(cl_zid, cl);
out:
	return NULL;
}
1790
/*
 * zcache callback invoked when a cached composite mbuf is handed out.
 * Marks the constituent mbuf (and, under KASAN, the cluster and ext_ref
 * as well) valid again so the sanitizer stops treating them as
 * quarantined, and rewrites the mbuf's pointers with the re-tagged
 * addresses returned by zcache_mark_valid().
 */
static void *
mz_composite_mark_valid(zone_id_t zid, void *p)
{
	struct mbuf *m = p;

	m = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF), m);
#if KASAN
	struct ext_ref *rfa = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	/* Re-tag the cluster and ref, then store the new pointers back. */
	cl = zcache_mark_valid(zone_by_id(cl_zid), cl);
	rfa = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
	m->m_data = m->m_ext.ext_buf = cl;
	m_set_rfa(m, rfa);
#else
#pragma unused(zid)
#endif
	VERIFY(MBUF_IS_COMPOSITE(m));

	return m;
}
1813
/*
 * zcache callback invoked when a composite mbuf is returned to the
 * cache.  Mirror image of mz_composite_mark_valid(): verifies the mbuf
 * is an unreferenced composite, then (under KASAN) marks the cluster
 * and ext_ref invalid and stores the re-tagged pointers, and finally
 * marks the mbuf itself invalid.
 */
static void *
mz_composite_mark_invalid(zone_id_t zid, void *p)
{
	struct mbuf *m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));
	/* Must be at its minimum refcount, i.e. no outstanding users. */
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
#if KASAN
	struct ext_ref *rfa = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = zcache_mark_invalid(zone_by_id(cl_zid), cl);
	rfa = zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
	m->m_data = m->m_ext.ext_buf = cl;
	m_set_rfa(m, rfa);
#else
#pragma unused(zid)
#endif

	return zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF), m);
}
1836
/*
 * zcache destroy callback: tear a composite mbuf back down into its
 * three pieces and free each one to its own zone.  Inverse of
 * mz_composite_build().
 */
static void
mz_composite_destroy(zone_id_t zid, void *p)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct ext_ref *rfa = NULL;
	struct mbuf *m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));

	/* Clear all external-storage accounting state. */
	MEXT_MINREF(m) = 0;
	MEXT_REF(m) = 0;
	MEXT_PREF(m) = 0;
	MEXT_FLAGS(m) = 0;
	MEXT_PRIV(m) = 0;
	MEXT_PMBUF(m) = NULL;
	MEXT_TOKEN(m) = 0;

	/* Capture the ref before m_set_ext() detaches it from the mbuf. */
	rfa = m_get_rfa(m);
	m_set_ext(m, NULL, NULL, NULL);

	/* Scrub the mbuf header fields before returning it to its zone. */
	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	/* ext_buf is still valid here; free it, then clear the pointer. */
	mz_cl_free(cl_zid, m->m_ext.ext_buf);
	m->m_ext.ext_buf = NULL;
	mz_ref_free(rfa);
	mz_free(m);
}
1866 #endif /* !CONFIG_MBUF_MCACHE */
1867
1868 #if CONFIG_MBUF_MCACHE
/*
 * sysctl handler: export the top mbuf-leak traces to userspace.
 * Refreshes mleak_stat under mleak_lock and copies out
 * MLEAK_STAT_SIZE(MLEAK_NUM_TRACES) bytes.  Returns ENXIO when leak
 * tracing is not enabled, otherwise the SYSCTL_OUT result.
 */
static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak) {
		return ENXIO;
	}

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return i;
}
1887
/*
 * sysctl handler: export the mbuf-leak summary table to userspace.
 * Copies out mleak_table under mleak_lock.  Returns ENXIO when leak
 * tracing is not enabled, otherwise the SYSCTL_OUT result.
 */
static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak) {
		return ENXIO;
	}

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
	lck_mtx_unlock(mleak_lock);

	return i;
}
1905 #endif /* CONFIG_MBUF_MCACHE */
1906
/*
 * Atomically take an additional reference on an mbuf's external
 * storage.  os_atomic_inc returns the post-increment value, so a
 * result of 0 would indicate a 16-bit refcount wrap — caught by the
 * VERIFY below.
 */
static inline void
m_incref(struct mbuf *m)
{
	uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);

	VERIFY(new != 0);
	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
	}
}
1922
/*
 * Atomically drop a reference on an mbuf's external storage and return
 * the new (post-decrement) reference count; acq_rel ordering so the
 * caller observing the final count also observes prior writes.
 */
static inline uint16_t
m_decref(struct mbuf *m)
{
	/* Decrementing a zero refcount would wrap; must never happen. */
	VERIFY(MEXT_REF(m) != 0);

	return os_atomic_dec(&MEXT_REF(m), acq_rel);
}
1930
/*
 * One-time initialization of the mbuf class table: allocates the
 * (old and new) statistics arrays, carves the global cluster pool
 * (nmbclusters, in 2KB units) into the jumbo (njcl), 2KB and 4KB
 * regions, and sets per-class size/limit/name fields plus the legacy
 * mbstat structure.  Called once from mbinit().
 */
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m, config_mbuf_jumbo = 0;

	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	/* Point each class at its slot in the shared stats array. */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

#if CONFIG_MBUF_JUMBO
	config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

	if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
		/*
		 * Set aside 1/3 of the mbuf cluster map for jumbo
		 * clusters; we do this only on platforms where jumbo
		 * cluster pool is enabled.
		 */
		njcl = nmbclusters / 3;
		njclbytes = M16KCLBYTES;
	}

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size. On these platforms, the remaining is used for 2KB
	 * and 4KB clusters. On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
	 * cluster can either be splitted into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = s * NMBPCL;               /* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
	snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
2064
2065 int
mbuf_get_class(struct mbuf * m)2066 mbuf_get_class(struct mbuf *m)
2067 {
2068 if (m->m_flags & M_EXT) {
2069 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
2070 m_ext_free_func_t m_free_func = m_get_ext_free(m);
2071
2072 if (m_free_func == NULL) {
2073 if (composite) {
2074 return MC_MBUF_CL;
2075 } else {
2076 return MC_CL;
2077 }
2078 } else if (m_free_func == m_bigfree) {
2079 if (composite) {
2080 return MC_MBUF_BIGCL;
2081 } else {
2082 return MC_BIGCL;
2083 }
2084 } else if (m_free_func == m_16kfree) {
2085 if (composite) {
2086 return MC_MBUF_16KCL;
2087 } else {
2088 return MC_16KCL;
2089 }
2090 }
2091 }
2092
2093 return MC_MBUF;
2094 }
2095
/*
 * Returns true when the class backing this mbuf has crossed the
 * configured memory-pressure threshold (mb_memory_pressure_percentage
 * of the class's max limit).  Intentionally lock-free: the counts read
 * here may be slightly stale, trading accuracy for no lock contention
 * on this path.
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	int mclass = mbuf_get_class(m);

#if CONFIG_MBUF_MCACHE
	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		/*
		 * The above computation does not include the per-CPU cached objects.
		 * As a fast-path check this is good-enough. But now we do
		 * the "slower" count of the cached objects to know exactly the
		 * number of active mbufs in use.
		 *
		 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
		 * might be slightly off but we don't try to be 100% accurate.
		 * At worst, we drop a packet that we shouldn't have dropped or
		 * we might go slightly above our memory-pressure threshold.
		 */
		mcache_t *cp = m_cache(mclass);
		mcache_cpu_t *ccp = &cp->mc_cpu[0];

		int bktsize = os_access_once(ccp->cc_bktsize);
		uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
		uint32_t cached = 0;
		int i;

		/* Sum the per-CPU loaded and previously-loaded magazines. */
		for (i = 0; i < ncpu; i++) {
			ccp = &cp->mc_cpu[i];

			int cc_objs = os_access_once(ccp->cc_objs);
			if (cc_objs > 0) {
				cached += cc_objs;
			}

			int cc_pobjs = os_access_once(ccp->cc_pobjs);
			if (cc_pobjs > 0) {
				cached += cc_pobjs;
			}
		}
		/* Plus the full buckets sitting in the depot layer. */
		cached += (bl_total * bktsize);
		if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
			os_log(OS_LOG_DEFAULT,
			    "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
			    __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
			return true;
		}
	}
#else
	/*
	 * Grab the statistics from zalloc.
	 * We can't call mbuf_stat_sync() since that requires a lock.
	 */
	const zone_id_t zid = m_class_to_zid(m_class(mclass));
	const zone_t zone = zone_by_id(zid);
	struct zone_basic_stats stats = {};

	zone_get_stats(zone, &stats);
	if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		os_log(OS_LOG_DEFAULT,
		    "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
		    __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
		return true;
	}
#endif /* CONFIG_MBUF_MCACHE */

	return false;
}
2163
#if defined(__LP64__)
/*
 * Scaling table used by mbuf_default_ncl() on 64-bit kernels: maps the
 * machine's memory size to a default mbuf pool size (in bytes).
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

/* Sorted by increasing nt_maxmem; a {0, 0} entry terminates the table. */
static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
2179
/*
 * Compute the default number of mbuf clusters (nmbclusters, in 2KB
 * units) for a machine with `mem` bytes of memory.  32-bit kernels use
 * 1/16th of memory capped at 32768 clusters (64MB); 64-bit kernels
 * look up the pool size in ncl_table and convert bytes to clusters.
 */
__private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)
{
#if !defined(__LP64__)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
		n = 32768;
	}
#else
	unsigned int n, i;
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = ncl_table[0].nt_mbpool;
	/* Walk the table, keeping the last entry whose maxmem <= mem. */
	for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
		if (mem < ncl_table[i].nt_maxmem) {
			break;
		}
		n = ncl_table[i].nt_mbpool;
	}
	n >>= MCLSHIFT; /* convert pool size in bytes to cluster count */
#endif /* !__LP64__ */
	return n;
}
2207
/*
 * Global mbuf subsystem initialization, called once at boot:
 * compile-time checks that the public MBUF_* constants track their
 * private counterparts, seeding of the red-zone/obscuring cookies,
 * mbuf_table_init(), then either the legacy mcache setup (slab/audit/
 * leak tables, worker thread, per-class caches) or the zalloc-backed
 * setup (zone limits and reserves, watchdog thread calls), and finally
 * the sb_max scaling.
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;
#if CONFIG_MBUF_MCACHE
	unsigned int initmcl = 0;
	thread_t thread = THREAD_NULL;
#endif /* CONFIG_MBUF_MCACHE */

#if CONFIG_MBUF_MCACHE
	microuptime(&mb_start);
#endif /* CONFIG_MBUF_MCACHE */

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof(uint32_t)));

	/* pktdata needs to start at 128-bit offset! */
	_CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof(mb_redzone_cookie) ==
	    sizeof(((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
	read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
	read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
	mb_obscure_extref |= 0x3;
	/*
	 * NOTE(review): zeroing mb_obscure_extref right after seeding it
	 * disables ext-ref pointer obscuring entirely; looks deliberate
	 * but worth confirming against the obscuring users.
	 */
	mb_obscure_extref = 0;
	mb_obscure_extfree |= 0x3;

#if CONFIG_MBUF_MCACHE
	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
#endif /* CONFIG_MBUF_MCACHE */

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	_CASSERT(sizeof(struct mbuf) == _MSIZE);

#if CONFIG_MBUF_MCACHE
	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
	slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
	    ZALIGN(mcl_slabg_t));

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		int l;
		mcl_audit_t *mclad;
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
		mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
		    ZALIGN(mcl_audit_t));
		for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
			mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
			    ZALIGN_PTR);
		}

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_activate();

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_wait_max_cpus();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
	mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
	    ZALIGN(ppnum_t));

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);

	embutl = (mbutl + (nmbclusters * MCLBYTES));
	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL)) {
			initmcl = m_maxlimit(MC_BIGCL);
		}
	}
	if (initmcl < m_minlimit(MC_BIGCL)) {
		initmcl = m_minlimit(MC_BIGCL);
	}

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));
#else
	/*
	 * We have yet to create the non composite zones
	 * and thus we haven't asked zalloc to allocate
	 * anything yet, which means that at this point
	 * m_total() is zero.  Once we create the zones and
	 * raise the reserve, m_total() will be calculated,
	 * but until then just assume that we will have
	 * at least the minium limit allocated.
	 */
	m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
	m_total(MC_CL) = m_minlimit(MC_CL);
#endif /* CONFIG_MBUF_MCACHE */

	for (m = 0; m < NELEM(mbuf_table); m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));

		/* populate the initial sizes and report from there on */
		m_peak(m_class(m)) = m_total(m_class(m));
	}
	mb_peak_newreport = FALSE;

#if CONFIG_MBUF_MCACHE
	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
	    0, 0, MCR_SLEEP);
#endif /* CONFIG_MBUF_MCACHE */

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
#if CONFIG_MBUF_MCACHE
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		/* Composite classes get the composite-slab callbacks. */
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0) {
			flags |= MCF_NOCPUCACHE;
		}

		if (!mclfindleak) {
			flags |= MCF_NOLEAKLOG;
		}

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
#else
		if (!MBUF_CLASS_COMPOSITE(m)) {
			zone_t zone = zone_by_id(m_class_to_zid(m));

			zone_set_exhaustible(zone, m_maxlimit(m), false);
			zone_raise_reserve(zone, m_minlimit(m));
			/*
			 * Pretend that we have allocated m_total() items
			 * at this point.  zalloc will eventually do that
			 * but it's an async operation.
			 */
			m_total(m) = m_minlimit(m);
		}
#endif /* CONFIG_MBUF_MCACHE */
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have atleast 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
		sb_max_adj = SB_MAX_ADJUST(sb_max);
		assert(sb_max_adj < UINT32_MAX);
	}

#if CONFIG_MBUF_MCACHE
	/* allocate space for mbuf_dump_buf */
	mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);

	if (mbuf_debug & MCF_DEBUG) {
		printf("%s: MLEN %d, MHLEN %d\n", __func__,
		    (int)_MLEN, (int)_MHLEN);
	}
#else
	/* One-shot watchdog thread calls for defuncting and draining. */
	mbuf_defunct_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_defunct,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	mbuf_drain_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
#endif /* CONFIG_MBUF_MCACHE */
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);

	PE_parse_boot_argn("mb_tag_mbuf", &mb_tag_mbuf, sizeof(mb_tag_mbuf));
}
2549
2550 #if CONFIG_MBUF_MCACHE
/*
 * Obtain a slab of object(s) from the class's freelist.
 *
 * Pops one buffer off the chosen slab's free list, bumps the slab's
 * reference count, updates the class's in-free accounting, and removes
 * the slab from the freelist once its last buffer is taken.  Returns
 * NULL when the class's slab list is empty.  Must be called with
 * mbuf_mlock held.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in hoping that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunks (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
	    && (wait & MCR_COMP)) {
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	} else {
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
	}

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return NULL;
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
	/* Unlink the buffer from the slab's free list. */
	sp->sl_head = buf->obj_next;
	/* Increment slab reference */
	sp->sl_refcnt++;

	/* Emptied list implies every chunk is now referenced. */
	VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);

	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF) {
			mca->mca_uflags |= MB_SCVALID;
		}
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most NCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
		    sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * A 4K cluster slab can have NBCLPG references.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
		    sp->sl_len == PAGE_SIZE &&
		    (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
		 * most 1 reference.
		 */
		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		VERIFY(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL) {
			_MCHECK((struct mbuf *)buf);
		}
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 4KB cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPG at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
		    sp->sl_chunks == NMBPG &&
		    sp->sl_len == PAGE_SIZE);
		VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
		VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
		slab_remove(sp, class);
	}

	return buf;
}
2686
2687 /*
2688 * Place a slab of object(s) back into a class's slab list.
2689 */
2690 static void
slab_free(mbuf_class_t class,mcache_obj_t * buf)2691 slab_free(mbuf_class_t class, mcache_obj_t *buf)
2692 {
2693 mcl_slab_t *sp;
2694 boolean_t reinit_supercl = false;
2695 mbuf_class_t super_class;
2696
2697 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2698
2699 VERIFY(class != MC_16KCL || njcl > 0);
2700 VERIFY(buf->obj_next == NULL);
2701
2702 /*
2703 * Synchronizing with m_clalloc, as it reads m_total, while we here
2704 * are modifying m_total.
2705 */
2706 while (mb_clalloc_busy) {
2707 mb_clalloc_waiters++;
2708 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2709 (PZERO - 1), "m_clalloc", NULL);
2710 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2711 }
2712
2713 /* We are busy now; tell everyone else to go away */
2714 mb_clalloc_busy = TRUE;
2715
2716 sp = slab_get(buf);
2717 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
2718 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2719
2720 /* Decrement slab reference */
2721 sp->sl_refcnt--;
2722
2723 if (class == MC_CL) {
2724 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
2725 /*
2726 * A slab that has been splitted for 2KB clusters can have
2727 * at most 1 outstanding reference at this point.
2728 */
2729 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
2730 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2731 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
2732 (slab_is_detached(sp) && sp->sl_head == NULL));
2733 } else if (class == MC_BIGCL) {
2734 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2735
2736 /* A 4KB cluster slab can have NBCLPG references at most */
2737 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
2738 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
2739 (slab_is_detached(sp) && sp->sl_head == NULL));
2740 } else if (class == MC_16KCL) {
2741 mcl_slab_t *nsp;
2742 int k;
2743 /*
2744 * A 16KB cluster takes NSLABSP16KB slabs, all must
2745 * now have 0 reference.
2746 */
2747 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
2748 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
2749 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2750 VERIFY(slab_is_detached(sp));
2751 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2752 nsp = nsp->sl_next;
2753 /* Next slab must already be present */
2754 VERIFY(nsp != NULL);
2755 nsp->sl_refcnt--;
2756 VERIFY(slab_is_detached(nsp));
2757 VERIFY(nsp->sl_class == MC_16KCL &&
2758 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
2759 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
2760 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2761 nsp->sl_head == NULL);
2762 }
2763 } else {
2764 /*
2765 * A slab that has been splitted for mbufs has at most
2766 * NMBPG reference counts. Since we have decremented
2767 * one reference above, it must now be between 0 and
2768 * NMBPG-1.
2769 */
2770 VERIFY(class == MC_MBUF);
2771 VERIFY(sp->sl_refcnt >= 0 &&
2772 sp->sl_refcnt <= (NMBPG - 1) &&
2773 sp->sl_chunks == NMBPG &&
2774 sp->sl_len == PAGE_SIZE);
2775 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2776 (slab_is_detached(sp) && sp->sl_head == NULL));
2777 }
2778
2779 /*
2780 * When auditing is enabled, ensure that the buffer still
2781 * contains the free pattern. Otherwise it got corrupted
2782 * while at the CPU cache layer.
2783 */
2784 if (mclaudit != NULL) {
2785 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2786 if (mclverify) {
2787 mcache_audit_free_verify(mca, buf, 0,
2788 m_maxsize(class));
2789 }
2790 mca->mca_uflags &= ~MB_SCVALID;
2791 }
2792
2793 if (class == MC_CL) {
2794 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2795 buf->obj_next = sp->sl_head;
2796 } else if (class == MC_BIGCL) {
2797 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2798 m_infree(MC_MBUF_BIGCL);
2799 buf->obj_next = sp->sl_head;
2800 } else if (class == MC_16KCL) {
2801 ++m_infree(MC_16KCL);
2802 } else {
2803 ++m_infree(MC_MBUF);
2804 buf->obj_next = sp->sl_head;
2805 }
2806 sp->sl_head = buf;
2807
2808 /*
2809 * If a slab has been split to either one which holds 2KB clusters,
2810 * or one which holds mbufs, turn it back to one which holds a
2811 * 4 or 16 KB cluster depending on the page size.
2812 */
2813 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2814 super_class = MC_BIGCL;
2815 } else {
2816 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2817 super_class = MC_16KCL;
2818 }
2819 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2820 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2821 m_total(super_class) < m_maxlimit(super_class)) {
2822 int i = NMBPG;
2823
2824 m_total(MC_MBUF) -= NMBPG;
2825 mbstat.m_mbufs = m_total(MC_MBUF);
2826 m_infree(MC_MBUF) -= NMBPG;
2827 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2828
2829 while (i--) {
2830 struct mbuf *m = sp->sl_head;
2831 VERIFY(m != NULL);
2832 sp->sl_head = m->m_next;
2833 m->m_next = NULL;
2834 }
2835 reinit_supercl = true;
2836 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2837 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2838 m_total(super_class) < m_maxlimit(super_class)) {
2839 int i = NCLPG;
2840
2841 m_total(MC_CL) -= NCLPG;
2842 mbstat.m_clusters = m_total(MC_CL);
2843 m_infree(MC_CL) -= NCLPG;
2844
2845 while (i--) {
2846 union mcluster *c = sp->sl_head;
2847 VERIFY(c != NULL);
2848 sp->sl_head = c->mcl_next;
2849 c->mcl_next = NULL;
2850 }
2851 reinit_supercl = true;
2852 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2853 sp->sl_refcnt == 0 &&
2854 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2855 m_total(super_class) < m_maxlimit(super_class)) {
2856 int i = NBCLPG;
2857
2858 VERIFY(super_class == MC_16KCL);
2859 m_total(MC_BIGCL) -= NBCLPG;
2860 mbstat.m_bigclusters = m_total(MC_BIGCL);
2861 m_infree(MC_BIGCL) -= NBCLPG;
2862
2863 while (i--) {
2864 union mbigcluster *bc = sp->sl_head;
2865 VERIFY(bc != NULL);
2866 sp->sl_head = bc->mbc_next;
2867 bc->mbc_next = NULL;
2868 }
2869 reinit_supercl = true;
2870 }
2871
2872 if (reinit_supercl) {
2873 VERIFY(sp->sl_head == NULL);
2874 VERIFY(m_total(class) >= m_minlimit(class));
2875 slab_remove(sp, class);
2876
2877 /* Reinitialize it as a cluster for the super class */
2878 m_total(super_class)++;
2879 m_infree(super_class)++;
2880 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2881 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2882
2883 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2884 sp->sl_base, PAGE_SIZE, 0, 1);
2885 if (mclverify) {
2886 mcache_set_pattern(MCACHE_FREE_PATTERN,
2887 (caddr_t)sp->sl_base, sp->sl_len);
2888 }
2889 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2890
2891 if (super_class == MC_BIGCL) {
2892 mbstat.m_bigclusters = m_total(MC_BIGCL);
2893 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2894 m_infree(MC_MBUF_BIGCL);
2895 }
2896
2897 VERIFY(slab_is_detached(sp));
2898 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2899
2900 /* And finally switch class */
2901 class = super_class;
2902 }
2903
2904 /* Reinsert the slab to the class's slab list */
2905 if (slab_is_detached(sp)) {
2906 slab_insert(sp, class);
2907 }
2908
2909 /* We're done; let others enter */
2910 mb_clalloc_busy = FALSE;
2911 if (mb_clalloc_waiters > 0) {
2912 mb_clalloc_waiters = 0;
2913 wakeup(mb_clalloc_waitchan);
2914 }
2915 }
2916
2917 /*
2918 * Common allocator for rudimentary objects called by the CPU cache layer
2919 * during an allocation request whenever there is no available element in the
2920 * bucket layer. It returns one or more elements from the appropriate global
2921 * freelist. If the freelist is empty, it will attempt to populate it and
2922 * retry the allocation.
2923 */
2924 static unsigned int
mbuf_slab_alloc(void * arg,mcache_obj_t *** plist,unsigned int num,int wait)2925 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2926 {
2927 mbuf_class_t class = (mbuf_class_t)arg;
2928 unsigned int need = num;
2929 mcache_obj_t **list = *plist;
2930
2931 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2932 ASSERT(need > 0);
2933
2934 lck_mtx_lock(mbuf_mlock);
2935
2936 for (;;) {
2937 if ((*list = slab_alloc(class, wait)) != NULL) {
2938 (*list)->obj_next = NULL;
2939 list = *plist = &(*list)->obj_next;
2940
2941 if (--need == 0) {
2942 /*
2943 * If the number of elements in freelist has
2944 * dropped below low watermark, asynchronously
2945 * populate the freelist now rather than doing
2946 * it later when we run out of elements.
2947 */
2948 if (!mbuf_cached_above(class, wait) &&
2949 m_infree(class) < (m_total(class) >> 5)) {
2950 (void) freelist_populate(class, 1,
2951 M_DONTWAIT);
2952 }
2953 break;
2954 }
2955 } else {
2956 VERIFY(m_infree(class) == 0 || class == MC_CL);
2957
2958 (void) freelist_populate(class, 1,
2959 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2960
2961 if (m_infree(class) > 0) {
2962 continue;
2963 }
2964
2965 /* Check if there's anything at the cache layer */
2966 if (mbuf_cached_above(class, wait)) {
2967 break;
2968 }
2969
2970 /* watchdog checkpoint */
2971 mbuf_watchdog();
2972
2973 /* We have nothing and cannot block; give up */
2974 if (wait & MCR_NOSLEEP) {
2975 if (!(wait & MCR_TRYHARD)) {
2976 m_fail_cnt(class)++;
2977 mbstat.m_drops++;
2978 break;
2979 }
2980 }
2981
2982 /*
2983 * If the freelist is still empty and the caller is
2984 * willing to be blocked, sleep on the wait channel
2985 * until an element is available. Otherwise, if
2986 * MCR_TRYHARD is set, do our best to satisfy the
2987 * request without having to go to sleep.
2988 */
2989 if (mbuf_worker_ready &&
2990 mbuf_sleep(class, need, wait)) {
2991 break;
2992 }
2993
2994 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2995 }
2996 }
2997
2998 m_alloc_cnt(class) += num - need;
2999 lck_mtx_unlock(mbuf_mlock);
3000
3001 return num - need;
3002 }
3003
3004 /*
3005 * Common de-allocator for rudimentary objects called by the CPU cache
3006 * layer when one or more elements need to be returned to the appropriate
3007 * global freelist.
3008 */
3009 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)3010 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
3011 {
3012 mbuf_class_t class = (mbuf_class_t)arg;
3013 mcache_obj_t *nlist;
3014 unsigned int num = 0;
3015 int w;
3016
3017 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3018
3019 lck_mtx_lock(mbuf_mlock);
3020
3021 for (;;) {
3022 nlist = list->obj_next;
3023 list->obj_next = NULL;
3024 slab_free(class, list);
3025 ++num;
3026 if ((list = nlist) == NULL) {
3027 break;
3028 }
3029 }
3030 m_free_cnt(class) += num;
3031
3032 if ((w = mb_waiters) > 0) {
3033 mb_waiters = 0;
3034 }
3035 if (w) {
3036 mbwdog_logger("waking up all threads");
3037 }
3038 lck_mtx_unlock(mbuf_mlock);
3039
3040 if (w != 0) {
3041 wakeup(mb_waitchan);
3042 }
3043 }
3044
3045 /*
3046 * Common auditor for rudimentary objects called by the CPU cache layer
3047 * during an allocation or free request. For the former, this is called
3048 * after the objects are obtained from either the bucket or slab layer
3049 * and before they are returned to the caller. For the latter, this is
3050 * called immediately during free and before placing the objects into
3051 * the bucket or slab layer.
3052 */
3053 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)3054 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3055 {
3056 mbuf_class_t class = (mbuf_class_t)arg;
3057 mcache_audit_t *mca;
3058
3059 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3060
3061 while (list != NULL) {
3062 lck_mtx_lock(mbuf_mlock);
3063 mca = mcl_audit_buf2mca(class, list);
3064
3065 /* Do the sanity checks */
3066 if (class == MC_MBUF) {
3067 mcl_audit_mbuf(mca, list, FALSE, alloc);
3068 ASSERT(mca->mca_uflags & MB_SCVALID);
3069 } else {
3070 mcl_audit_cluster(mca, list, m_maxsize(class),
3071 alloc, TRUE);
3072 ASSERT(!(mca->mca_uflags & MB_SCVALID));
3073 }
3074 /* Record this transaction */
3075 if (mcltrace) {
3076 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
3077 }
3078
3079 if (alloc) {
3080 mca->mca_uflags |= MB_INUSE;
3081 } else {
3082 mca->mca_uflags &= ~MB_INUSE;
3083 }
3084 /* Unpair the object (unconditionally) */
3085 mca->mca_uptr = NULL;
3086 lck_mtx_unlock(mbuf_mlock);
3087
3088 list = list->obj_next;
3089 }
3090 }
3091
3092 /*
3093 * Common notify routine for all caches. It is called by mcache when
3094 * one or more objects get freed. We use this indication to trigger
3095 * the wakeup of any sleeping threads so that they can retry their
3096 * allocation requests.
3097 */
3098 static void
mbuf_slab_notify(void * arg,u_int32_t reason)3099 mbuf_slab_notify(void *arg, u_int32_t reason)
3100 {
3101 mbuf_class_t class = (mbuf_class_t)arg;
3102 int w;
3103
3104 ASSERT(MBUF_CLASS_VALID(class));
3105
3106 if (reason != MCN_RETRYALLOC) {
3107 return;
3108 }
3109
3110 lck_mtx_lock(mbuf_mlock);
3111 if ((w = mb_waiters) > 0) {
3112 m_notified(class)++;
3113 mb_waiters = 0;
3114 }
3115 if (w) {
3116 mbwdog_logger("waking up all threads");
3117 }
3118 lck_mtx_unlock(mbuf_mlock);
3119
3120 if (w != 0) {
3121 wakeup(mb_waitchan);
3122 }
3123 }
3124
3125 /*
3126 * Obtain object(s) from the composite class's freelist.
3127 */
3128 static unsigned int
cslab_alloc(mbuf_class_t class,mcache_obj_t *** plist,unsigned int num)3129 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
3130 {
3131 unsigned int need = num;
3132 mcl_slab_t *sp, *clsp, *nsp;
3133 struct mbuf *m;
3134 mcache_obj_t **list = *plist;
3135 void *cl;
3136
3137 VERIFY(need > 0);
3138 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3139 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3140
3141 /* Get what we can from the freelist */
3142 while ((*list = m_cobjlist(class)) != NULL) {
3143 MRANGE(*list);
3144
3145 m = (struct mbuf *)*list;
3146 sp = slab_get(m);
3147 cl = m->m_ext.ext_buf;
3148 clsp = slab_get(cl);
3149 VERIFY(m->m_flags == M_EXT && cl != NULL);
3150 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
3151
3152 if (class == MC_MBUF_CL) {
3153 VERIFY(clsp->sl_refcnt >= 1 &&
3154 clsp->sl_refcnt <= NCLPG);
3155 } else {
3156 VERIFY(clsp->sl_refcnt >= 1 &&
3157 clsp->sl_refcnt <= NBCLPG);
3158 }
3159
3160 if (class == MC_MBUF_16KCL) {
3161 int k;
3162 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3163 nsp = nsp->sl_next;
3164 /* Next slab must already be present */
3165 VERIFY(nsp != NULL);
3166 VERIFY(nsp->sl_refcnt == 1);
3167 }
3168 }
3169
3170 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
3171 !MBUF_IN_MAP(m_cobjlist(class))) {
3172 slab_nextptr_panic(sp, m_cobjlist(class));
3173 /* NOTREACHED */
3174 }
3175 (*list)->obj_next = NULL;
3176 list = *plist = &(*list)->obj_next;
3177
3178 if (--need == 0) {
3179 break;
3180 }
3181 }
3182 m_infree(class) -= (num - need);
3183
3184 return num - need;
3185 }
3186
3187 /*
3188 * Place object(s) back into a composite class's freelist.
3189 */
3190 static unsigned int
cslab_free(mbuf_class_t class,mcache_obj_t * list,int purged)3191 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
3192 {
3193 mcache_obj_t *o, *tail;
3194 unsigned int num = 0;
3195 struct mbuf *m, *ms;
3196 mcache_audit_t *mca = NULL;
3197 mcache_obj_t *ref_list = NULL;
3198 mcl_slab_t *clsp, *nsp;
3199 void *cl;
3200 mbuf_class_t cl_class;
3201
3202 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3203 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3204 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3205
3206 if (class == MC_MBUF_CL) {
3207 cl_class = MC_CL;
3208 } else if (class == MC_MBUF_BIGCL) {
3209 cl_class = MC_BIGCL;
3210 } else {
3211 VERIFY(class == MC_MBUF_16KCL);
3212 cl_class = MC_16KCL;
3213 }
3214
3215 o = tail = list;
3216
3217 while ((m = ms = (struct mbuf *)o) != NULL) {
3218 mcache_obj_t *rfa, *nexto = o->obj_next;
3219
3220 /* Do the mbuf sanity checks */
3221 if (mclaudit != NULL) {
3222 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3223 if (mclverify) {
3224 mcache_audit_free_verify(mca, m, 0,
3225 m_maxsize(MC_MBUF));
3226 }
3227 ms = MCA_SAVED_MBUF_PTR(mca);
3228 }
3229
3230 /* Do the cluster sanity checks */
3231 cl = ms->m_ext.ext_buf;
3232 clsp = slab_get(cl);
3233 if (mclverify) {
3234 size_t size = m_maxsize(cl_class);
3235 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
3236 (mcache_obj_t *)cl), cl, 0, size);
3237 }
3238 VERIFY(ms->m_type == MT_FREE);
3239 VERIFY(ms->m_flags == M_EXT);
3240 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3241 if (cl_class == MC_CL) {
3242 VERIFY(clsp->sl_refcnt >= 1 &&
3243 clsp->sl_refcnt <= NCLPG);
3244 } else {
3245 VERIFY(clsp->sl_refcnt >= 1 &&
3246 clsp->sl_refcnt <= NBCLPG);
3247 }
3248 if (cl_class == MC_16KCL) {
3249 int k;
3250 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3251 nsp = nsp->sl_next;
3252 /* Next slab must already be present */
3253 VERIFY(nsp != NULL);
3254 VERIFY(nsp->sl_refcnt == 1);
3255 }
3256 }
3257
3258 /*
3259 * If we're asked to purge, restore the actual mbuf using
3260 * contents of the shadow structure (if auditing is enabled)
3261 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
3262 * about to free it and the attached cluster into their caches.
3263 */
3264 if (purged) {
3265 /* Restore constructed mbuf fields */
3266 if (mclaudit != NULL) {
3267 mcl_audit_restore_mbuf(m, mca, TRUE);
3268 }
3269
3270 MEXT_MINREF(m) = 0;
3271 MEXT_REF(m) = 0;
3272 MEXT_PREF(m) = 0;
3273 MEXT_FLAGS(m) = 0;
3274 MEXT_PRIV(m) = 0;
3275 MEXT_PMBUF(m) = NULL;
3276 MEXT_TOKEN(m) = 0;
3277
3278 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
3279 m_set_ext(m, NULL, NULL, NULL);
3280 rfa->obj_next = ref_list;
3281 ref_list = rfa;
3282
3283 m->m_type = MT_FREE;
3284 m->m_flags = m->m_len = 0;
3285 m->m_next = m->m_nextpkt = NULL;
3286
3287 /* Save mbuf fields and make auditing happy */
3288 if (mclaudit != NULL) {
3289 mcl_audit_mbuf(mca, o, FALSE, FALSE);
3290 }
3291
3292 VERIFY(m_total(class) > 0);
3293 m_total(class)--;
3294
3295 /* Free the mbuf */
3296 o->obj_next = NULL;
3297 slab_free(MC_MBUF, o);
3298
3299 /* And free the cluster */
3300 ((mcache_obj_t *)cl)->obj_next = NULL;
3301 if (class == MC_MBUF_CL) {
3302 slab_free(MC_CL, cl);
3303 } else if (class == MC_MBUF_BIGCL) {
3304 slab_free(MC_BIGCL, cl);
3305 } else {
3306 slab_free(MC_16KCL, cl);
3307 }
3308 }
3309
3310 ++num;
3311 tail = o;
3312 o = nexto;
3313 }
3314
3315 if (!purged) {
3316 tail->obj_next = m_cobjlist(class);
3317 m_cobjlist(class) = list;
3318 m_infree(class) += num;
3319 } else if (ref_list != NULL) {
3320 mcache_free_ext(ref_cache, ref_list);
3321 }
3322
3323 return num;
3324 }
3325
3326 /*
3327 * Common allocator for composite objects called by the CPU cache layer
3328 * during an allocation request whenever there is no available element in
3329 * the bucket layer. It returns one or more composite elements from the
3330 * appropriate global freelist. If the freelist is empty, it will attempt
3331 * to obtain the rudimentary objects from their caches and construct them
3332 * into composite mbuf + cluster objects.
3333 */
3334 static unsigned int
mbuf_cslab_alloc(void * arg,mcache_obj_t *** plist,unsigned int needed,int wait)3335 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
3336 int wait)
3337 {
3338 mbuf_class_t class = (mbuf_class_t)arg;
3339 mbuf_class_t cl_class = 0;
3340 unsigned int num = 0, cnum = 0, want = needed;
3341 mcache_obj_t *ref_list = NULL;
3342 mcache_obj_t *mp_list = NULL;
3343 mcache_obj_t *clp_list = NULL;
3344 mcache_obj_t **list;
3345 struct ext_ref *rfa;
3346 struct mbuf *m;
3347 void *cl;
3348
3349 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3350 ASSERT(needed > 0);
3351
3352 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3353
3354 /* There should not be any slab for this class */
3355 VERIFY(m_slab_cnt(class) == 0 &&
3356 m_slablist(class).tqh_first == NULL &&
3357 m_slablist(class).tqh_last == NULL);
3358
3359 lck_mtx_lock(mbuf_mlock);
3360
3361 /* Try using the freelist first */
3362 num = cslab_alloc(class, plist, needed);
3363 list = *plist;
3364 if (num == needed) {
3365 m_alloc_cnt(class) += num;
3366 lck_mtx_unlock(mbuf_mlock);
3367 return needed;
3368 }
3369
3370 lck_mtx_unlock(mbuf_mlock);
3371
3372 /*
3373 * We could not satisfy the request using the freelist alone;
3374 * allocate from the appropriate rudimentary caches and use
3375 * whatever we can get to construct the composite objects.
3376 */
3377 needed -= num;
3378
3379 /*
3380 * Mark these allocation requests as coming from a composite cache.
3381 * Also, if the caller is willing to be blocked, mark the request
3382 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
3383 * slab layer waiting for the individual object when one or more
3384 * of the already-constructed composite objects are available.
3385 */
3386 wait |= MCR_COMP;
3387 if (!(wait & MCR_NOSLEEP)) {
3388 wait |= MCR_FAILOK;
3389 }
3390
3391 /* allocate mbufs */
3392 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
3393 if (needed == 0) {
3394 ASSERT(mp_list == NULL);
3395 goto fail;
3396 }
3397
3398 /* allocate clusters */
3399 if (class == MC_MBUF_CL) {
3400 cl_class = MC_CL;
3401 } else if (class == MC_MBUF_BIGCL) {
3402 cl_class = MC_BIGCL;
3403 } else {
3404 VERIFY(class == MC_MBUF_16KCL);
3405 cl_class = MC_16KCL;
3406 }
3407 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
3408 if (needed == 0) {
3409 ASSERT(clp_list == NULL);
3410 goto fail;
3411 }
3412
3413 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
3414 if (needed == 0) {
3415 ASSERT(ref_list == NULL);
3416 goto fail;
3417 }
3418
3419 /*
3420 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
3421 * overs will get freed accordingly before we return to caller.
3422 */
3423 for (cnum = 0; cnum < needed; cnum++) {
3424 struct mbuf *ms;
3425
3426 m = ms = (struct mbuf *)mp_list;
3427 mp_list = mp_list->obj_next;
3428
3429 cl = clp_list;
3430 clp_list = clp_list->obj_next;
3431 ((mcache_obj_t *)cl)->obj_next = NULL;
3432
3433 rfa = (struct ext_ref *)ref_list;
3434 ref_list = ref_list->obj_next;
3435 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
3436
3437 /*
3438 * If auditing is enabled, construct the shadow mbuf
3439 * in the audit structure instead of in the actual one.
3440 * mbuf_cslab_audit() will take care of restoring the
3441 * contents after the integrity check.
3442 */
3443 if (mclaudit != NULL) {
3444 mcache_audit_t *mca, *cl_mca;
3445
3446 lck_mtx_lock(mbuf_mlock);
3447 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3448 ms = MCA_SAVED_MBUF_PTR(mca);
3449 cl_mca = mcl_audit_buf2mca(cl_class,
3450 (mcache_obj_t *)cl);
3451
3452 /*
3453 * Pair them up. Note that this is done at the time
3454 * the mbuf+cluster objects are constructed. This
3455 * information should be treated as "best effort"
3456 * debugging hint since more than one mbufs can refer
3457 * to a cluster. In that case, the cluster might not
3458 * be freed along with the mbuf it was paired with.
3459 */
3460 mca->mca_uptr = cl_mca;
3461 cl_mca->mca_uptr = mca;
3462
3463 ASSERT(mca->mca_uflags & MB_SCVALID);
3464 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
3465 lck_mtx_unlock(mbuf_mlock);
3466
3467 /* Technically, they are in the freelist */
3468 if (mclverify) {
3469 size_t size;
3470
3471 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
3472 m_maxsize(MC_MBUF));
3473
3474 if (class == MC_MBUF_CL) {
3475 size = m_maxsize(MC_CL);
3476 } else if (class == MC_MBUF_BIGCL) {
3477 size = m_maxsize(MC_BIGCL);
3478 } else {
3479 size = m_maxsize(MC_16KCL);
3480 }
3481
3482 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
3483 size);
3484 }
3485 }
3486
3487 MBUF_INIT(ms, 0, MT_FREE);
3488 if (class == MC_MBUF_16KCL) {
3489 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3490 } else if (class == MC_MBUF_BIGCL) {
3491 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3492 } else {
3493 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3494 }
3495 VERIFY(ms->m_flags == M_EXT);
3496 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3497
3498 *list = (mcache_obj_t *)m;
3499 (*list)->obj_next = NULL;
3500 list = *plist = &(*list)->obj_next;
3501 }
3502
3503 fail:
3504 /*
3505 * Free up what's left of the above.
3506 */
3507 if (mp_list != NULL) {
3508 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3509 }
3510 if (clp_list != NULL) {
3511 mcache_free_ext(m_cache(cl_class), clp_list);
3512 }
3513 if (ref_list != NULL) {
3514 mcache_free_ext(ref_cache, ref_list);
3515 }
3516
3517 lck_mtx_lock(mbuf_mlock);
3518 if (num > 0 || cnum > 0) {
3519 m_total(class) += cnum;
3520 VERIFY(m_total(class) <= m_maxlimit(class));
3521 m_alloc_cnt(class) += num + cnum;
3522 }
3523 if ((num + cnum) < want) {
3524 m_fail_cnt(class) += (want - (num + cnum));
3525 }
3526 lck_mtx_unlock(mbuf_mlock);
3527
3528 return num + cnum;
3529 }
3530
3531 /*
3532 * Common de-allocator for composite objects called by the CPU cache
3533 * layer when one or more elements need to be returned to the appropriate
3534 * global freelist.
3535 */
3536 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)3537 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
3538 {
3539 mbuf_class_t class = (mbuf_class_t)arg;
3540 unsigned int num;
3541 int w;
3542
3543 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3544
3545 lck_mtx_lock(mbuf_mlock);
3546
3547 num = cslab_free(class, list, purged);
3548 m_free_cnt(class) += num;
3549
3550 if ((w = mb_waiters) > 0) {
3551 mb_waiters = 0;
3552 }
3553 if (w) {
3554 mbwdog_logger("waking up all threads");
3555 }
3556
3557 lck_mtx_unlock(mbuf_mlock);
3558
3559 if (w != 0) {
3560 wakeup(mb_waitchan);
3561 }
3562 }
3563
3564 /*
3565 * Common auditor for composite objects called by the CPU cache layer
3566 * during an allocation or free request. For the former, this is called
3567 * after the objects are obtained from either the bucket or slab layer
3568 * and before they are returned to the caller. For the latter, this is
3569 * called immediately during free and before placing the objects into
3570 * the bucket or slab layer.
3571 */
3572 static void
mbuf_cslab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)3573 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3574 {
3575 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
3576 mcache_audit_t *mca;
3577 struct mbuf *m, *ms;
3578 mcl_slab_t *clsp, *nsp;
3579 size_t cl_size;
3580 void *cl;
3581
3582 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3583 if (class == MC_MBUF_CL) {
3584 cl_class = MC_CL;
3585 } else if (class == MC_MBUF_BIGCL) {
3586 cl_class = MC_BIGCL;
3587 } else {
3588 cl_class = MC_16KCL;
3589 }
3590 cl_size = m_maxsize(cl_class);
3591
3592 while ((m = ms = (struct mbuf *)list) != NULL) {
3593 lck_mtx_lock(mbuf_mlock);
3594 /* Do the mbuf sanity checks and record its transaction */
3595 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3596 mcl_audit_mbuf(mca, m, TRUE, alloc);
3597 if (mcltrace) {
3598 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
3599 }
3600
3601 if (alloc) {
3602 mca->mca_uflags |= MB_COMP_INUSE;
3603 } else {
3604 mca->mca_uflags &= ~MB_COMP_INUSE;
3605 }
3606
3607 /*
3608 * Use the shadow mbuf in the audit structure if we are
3609 * freeing, since the contents of the actual mbuf has been
3610 * pattern-filled by the above call to mcl_audit_mbuf().
3611 */
3612 if (!alloc && mclverify) {
3613 ms = MCA_SAVED_MBUF_PTR(mca);
3614 }
3615
3616 /* Do the cluster sanity checks and record its transaction */
3617 cl = ms->m_ext.ext_buf;
3618 clsp = slab_get(cl);
3619 VERIFY(ms->m_flags == M_EXT && cl != NULL);
3620 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3621 if (class == MC_MBUF_CL) {
3622 VERIFY(clsp->sl_refcnt >= 1 &&
3623 clsp->sl_refcnt <= NCLPG);
3624 } else {
3625 VERIFY(clsp->sl_refcnt >= 1 &&
3626 clsp->sl_refcnt <= NBCLPG);
3627 }
3628
3629 if (class == MC_MBUF_16KCL) {
3630 int k;
3631 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3632 nsp = nsp->sl_next;
3633 /* Next slab must already be present */
3634 VERIFY(nsp != NULL);
3635 VERIFY(nsp->sl_refcnt == 1);
3636 }
3637 }
3638
3639
3640 mca = mcl_audit_buf2mca(cl_class, cl);
3641 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
3642 if (mcltrace) {
3643 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
3644 }
3645
3646 if (alloc) {
3647 mca->mca_uflags |= MB_COMP_INUSE;
3648 } else {
3649 mca->mca_uflags &= ~MB_COMP_INUSE;
3650 }
3651 lck_mtx_unlock(mbuf_mlock);
3652
3653 list = list->obj_next;
3654 }
3655 }
3656
3657 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)3658 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
3659 uint64_t alloc_size, kern_return_t error)
3660 {
3661 *cnt = *cnt + 1;
3662 *ts = net_uptime();
3663 if (size) {
3664 *size = alloc_size;
3665 }
3666 switch (error) {
3667 case KERN_SUCCESS:
3668 break;
3669 case KERN_INVALID_ARGUMENT:
3670 mb_kmem_stats[0]++;
3671 break;
3672 case KERN_INVALID_ADDRESS:
3673 mb_kmem_stats[1]++;
3674 break;
3675 case KERN_RESOURCE_SHORTAGE:
3676 mb_kmem_stats[2]++;
3677 break;
3678 case KERN_NO_SPACE:
3679 mb_kmem_stats[3]++;
3680 break;
3681 case KERN_FAILURE:
3682 mb_kmem_stats[4]++;
3683 break;
3684 default:
3685 mb_kmem_stats[5]++;
3686 break;
3687 }
3688 }
3689
3690 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)3691 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
3692 {
3693 vm_offset_t addr = 0;
3694 kern_return_t kr = KERN_SUCCESS;
3695
3696 if (!physContig) {
3697 kr = kmem_alloc(mbmap, &addr, size,
3698 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3699 } else {
3700 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
3701 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3702 }
3703
3704 if (kr != KERN_SUCCESS) {
3705 addr = 0;
3706 }
3707 if (err) {
3708 *err = kr;
3709 }
3710
3711 return addr;
3712 }
3713
3714 /*
3715 * Allocate some number of mbuf clusters and place on cluster freelist.
3716 */
3717 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)3718 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3719 {
3720 int i, count = 0;
3721 vm_size_t size = 0;
3722 int numpages = 0, large_buffer;
3723 vm_offset_t page = 0;
3724 mcache_audit_t *mca_list = NULL;
3725 mcache_obj_t *con_list = NULL;
3726 mcl_slab_t *sp;
3727 mbuf_class_t class;
3728 kern_return_t error;
3729
3730 /* Set if a buffer allocation needs allocation of multiple pages */
3731 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3732 PAGE_SIZE < M16KCLBYTES);
3733 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3734 bufsize == m_maxsize(MC_16KCL));
3735
3736 VERIFY((bufsize == PAGE_SIZE) ||
3737 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3738
3739 if (bufsize == m_size(MC_BIGCL)) {
3740 class = MC_BIGCL;
3741 } else {
3742 class = MC_16KCL;
3743 }
3744
3745 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3746
3747 /*
3748 * Multiple threads may attempt to populate the cluster map one
3749 * after another. Since we drop the lock below prior to acquiring
3750 * the physical page(s), our view of the cluster map may no longer
3751 * be accurate, and we could end up over-committing the pages beyond
3752 * the maximum allowed for each class. To prevent it, this entire
3753 * operation (including the page mapping) is serialized.
3754 */
3755 while (mb_clalloc_busy) {
3756 mb_clalloc_waiters++;
3757 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3758 (PZERO - 1), "m_clalloc", NULL);
3759 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3760 }
3761
3762 /* We are busy now; tell everyone else to go away */
3763 mb_clalloc_busy = TRUE;
3764
3765 /*
3766 * Honor the caller's wish to block or not block. We have a way
3767 * to grow the pool asynchronously using the mbuf worker thread.
3768 */
3769 i = m_howmany(num, bufsize);
3770 if (i <= 0 || (wait & M_DONTWAIT)) {
3771 goto out;
3772 }
3773
3774 lck_mtx_unlock(mbuf_mlock);
3775
3776 size = round_page(i * bufsize);
3777 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3778
3779 /*
3780 * If we did ask for "n" 16KB physically contiguous chunks
3781 * and didn't get them, then please try again without this
3782 * restriction.
3783 */
3784 net_update_uptime();
3785 if (large_buffer && page == 0) {
3786 m_vm_error_stats(&mb_kmem_contig_failed,
3787 &mb_kmem_contig_failed_ts,
3788 &mb_kmem_contig_failed_size,
3789 size, error);
3790 page = kmem_mb_alloc(mb_map, size, 0, &error);
3791 }
3792
3793 if (page == 0) {
3794 m_vm_error_stats(&mb_kmem_failed,
3795 &mb_kmem_failed_ts,
3796 &mb_kmem_failed_size,
3797 size, error);
3798 #if PAGE_SIZE == 4096
3799 if (bufsize == m_maxsize(MC_BIGCL)) {
3800 #else
3801 if (bufsize >= m_maxsize(MC_BIGCL)) {
3802 #endif
3803 /* Try for 1 page if failed */
3804 size = PAGE_SIZE;
3805 page = kmem_mb_alloc(mb_map, size, 0, &error);
3806 if (page == 0) {
3807 m_vm_error_stats(&mb_kmem_one_failed,
3808 &mb_kmem_one_failed_ts,
3809 NULL, size, error);
3810 }
3811 }
3812
3813 if (page == 0) {
3814 lck_mtx_lock(mbuf_mlock);
3815 goto out;
3816 }
3817 }
3818
3819 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3820 numpages = size / PAGE_SIZE;
3821
3822 /* If auditing is enabled, allocate the audit structures now */
3823 if (mclaudit != NULL) {
3824 int needed;
3825
3826 /*
3827 * Yes, I realize this is a waste of memory for clusters
3828 * that never get transformed into mbufs, as we may end
3829 * up with NMBPG-1 unused audit structures per cluster.
3830 * But doing so tremendously simplifies the allocation
3831 * strategy, since at this point we are not holding the
3832 * mbuf lock and the caller is okay to be blocked.
3833 */
3834 if (bufsize == PAGE_SIZE) {
3835 needed = numpages * NMBPG;
3836
3837 i = mcache_alloc_ext(mcl_audit_con_cache,
3838 &con_list, needed, MCR_SLEEP);
3839
3840 VERIFY(con_list != NULL && i == needed);
3841 } else {
3842 /*
3843 * if multiple 4K pages are being used for a
3844 * 16K cluster
3845 */
3846 needed = numpages / NSLABSP16KB;
3847 }
3848
3849 i = mcache_alloc_ext(mcache_audit_cache,
3850 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3851
3852 VERIFY(mca_list != NULL && i == needed);
3853 }
3854
3855 lck_mtx_lock(mbuf_mlock);
3856
3857 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3858 ppnum_t offset =
3859 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3860 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3861
3862 /*
3863 * If there is a mapper the appropriate I/O page is
3864 * returned; zero out the page to discard its past
3865 * contents to prevent exposing leftover kernel memory.
3866 */
3867 VERIFY(offset < mcl_pages);
3868 if (mcl_paddr_base != 0) {
3869 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3870 new_page = IOMapperInsertPage(mcl_paddr_base,
3871 offset, new_page);
3872 }
3873 mcl_paddr[offset] = new_page;
3874
3875 /* Pattern-fill this fresh page */
3876 if (mclverify) {
3877 mcache_set_pattern(MCACHE_FREE_PATTERN,
3878 (caddr_t)page, PAGE_SIZE);
3879 }
3880 if (bufsize == PAGE_SIZE) {
3881 mcache_obj_t *buf;
3882 /* One for the entire page */
3883 sp = slab_get((void *)page);
3884 if (mclaudit != NULL) {
3885 mcl_audit_init((void *)page,
3886 &mca_list, &con_list,
3887 AUDIT_CONTENTS_SIZE, NMBPG);
3888 }
3889 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3890 slab_init(sp, class, SLF_MAPPED, (void *)page,
3891 (void *)page, PAGE_SIZE, 0, 1);
3892 buf = (mcache_obj_t *)page;
3893 buf->obj_next = NULL;
3894
3895 /* Insert this slab */
3896 slab_insert(sp, class);
3897
3898 /* Update stats now since slab_get drops the lock */
3899 ++m_infree(class);
3900 ++m_total(class);
3901 VERIFY(m_total(class) <= m_maxlimit(class));
3902 if (class == MC_BIGCL) {
3903 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3904 m_infree(MC_MBUF_BIGCL);
3905 mbstat.m_bigclusters = m_total(MC_BIGCL);
3906 }
3907 ++count;
3908 } else if ((bufsize > PAGE_SIZE) &&
3909 (i % NSLABSP16KB) == 0) {
3910 union m16kcluster *m16kcl = (union m16kcluster *)page;
3911 mcl_slab_t *nsp;
3912 int k;
3913
3914 /* One for the entire 16KB */
3915 sp = slab_get(m16kcl);
3916 if (mclaudit != NULL) {
3917 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3918 }
3919
3920 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3921 slab_init(sp, MC_16KCL, SLF_MAPPED,
3922 m16kcl, m16kcl, bufsize, 0, 1);
3923 m16kcl->m16kcl_next = NULL;
3924
3925 /*
3926 * 2nd-Nth page's slab is part of the first one,
3927 * where N is NSLABSP16KB.
3928 */
3929 for (k = 1; k < NSLABSP16KB; k++) {
3930 nsp = slab_get(((union mbigcluster *)page) + k);
3931 VERIFY(nsp->sl_refcnt == 0 &&
3932 nsp->sl_flags == 0);
3933 slab_init(nsp, MC_16KCL,
3934 SLF_MAPPED | SLF_PARTIAL,
3935 m16kcl, NULL, 0, 0, 0);
3936 }
3937 /* Insert this slab */
3938 slab_insert(sp, MC_16KCL);
3939
3940 /* Update stats now since slab_get drops the lock */
3941 ++m_infree(MC_16KCL);
3942 ++m_total(MC_16KCL);
3943 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3944 ++count;
3945 }
3946 }
3947 VERIFY(mca_list == NULL && con_list == NULL);
3948
3949 if (!mb_peak_newreport && mbuf_report_usage(class)) {
3950 mb_peak_newreport = TRUE;
3951 }
3952
3953 /* We're done; let others enter */
3954 mb_clalloc_busy = FALSE;
3955 if (mb_clalloc_waiters > 0) {
3956 mb_clalloc_waiters = 0;
3957 wakeup(mb_clalloc_waitchan);
3958 }
3959
3960 return count;
3961 out:
3962 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3963
3964 mtracelarge_register(size);
3965
3966 /* We're done; let others enter */
3967 mb_clalloc_busy = FALSE;
3968 if (mb_clalloc_waiters > 0) {
3969 mb_clalloc_waiters = 0;
3970 wakeup(mb_clalloc_waitchan);
3971 }
3972
3973 /*
3974 * When non-blocking we kick a thread if we have to grow the
3975 * pool or if the number of free clusters is less than requested.
3976 */
3977 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3978 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3979 m_cname(class), i);
3980 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3981 mbuf_worker_needs_wakeup = FALSE;
3982 }
3983 if (class == MC_BIGCL) {
3984 if (i > 0) {
3985 /*
3986 * Remember total number of 4KB clusters needed
3987 * at this time.
3988 */
3989 i += m_total(MC_BIGCL);
3990 if (i > m_region_expand(MC_BIGCL)) {
3991 m_region_expand(MC_BIGCL) = i;
3992 }
3993 }
3994 if (m_infree(MC_BIGCL) >= num) {
3995 return 1;
3996 }
3997 } else {
3998 if (i > 0) {
3999 /*
4000 * Remember total number of 16KB clusters needed
4001 * at this time.
4002 */
4003 i += m_total(MC_16KCL);
4004 if (i > m_region_expand(MC_16KCL)) {
4005 m_region_expand(MC_16KCL) = i;
4006 }
4007 }
4008 if (m_infree(MC_16KCL) >= num) {
4009 return 1;
4010 }
4011 }
4012 return 0;
4013 }
4014
4015 /*
4016 * Populate the global freelist of the corresponding buffer class.
4017 */
4018 static int
4019 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
4020 {
4021 mcache_obj_t *o = NULL;
4022 int i, numpages = 0, count;
4023 mbuf_class_t super_class;
4024
4025 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
4026 class == MC_16KCL);
4027
4028 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4029
4030 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
4031 PAGE_SIZE == m_maxsize(MC_16KCL));
4032
4033 if (m_maxsize(class) >= PAGE_SIZE) {
4034 return m_clalloc(num, wait, m_maxsize(class)) != 0;
4035 }
4036
4037 /*
4038 * The rest of the function will allocate pages and will slice
4039 * them up into the right size
4040 */
4041
4042 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
4043
4044 /* Currently assume that pages are 4K or 16K */
4045 if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
4046 super_class = MC_BIGCL;
4047 } else {
4048 super_class = MC_16KCL;
4049 }
4050
4051 i = m_clalloc(numpages, wait, m_maxsize(super_class));
4052
4053 /* how many objects will we cut the page into? */
4054 int numobj = PAGE_SIZE / m_maxsize(class);
4055
4056 for (count = 0; count < numpages; count++) {
4057 /* respect totals, minlimit, maxlimit */
4058 if (m_total(super_class) <= m_minlimit(super_class) ||
4059 m_total(class) >= m_maxlimit(class)) {
4060 break;
4061 }
4062
4063 if ((o = slab_alloc(super_class, wait)) == NULL) {
4064 break;
4065 }
4066
4067 struct mbuf *m = (struct mbuf *)o;
4068 union mcluster *c = (union mcluster *)o;
4069 union mbigcluster *mbc = (union mbigcluster *)o;
4070 mcl_slab_t *sp = slab_get(o);
4071 mcache_audit_t *mca = NULL;
4072
4073 /*
4074 * since one full page will be converted to MC_MBUF or
4075 * MC_CL, verify that the reference count will match that
4076 * assumption
4077 */
4078 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
4079 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
4080 /*
4081 * Make sure that the cluster is unmolested
4082 * while in freelist
4083 */
4084 if (mclverify) {
4085 mca = mcl_audit_buf2mca(super_class,
4086 (mcache_obj_t *)o);
4087 mcache_audit_free_verify(mca,
4088 (mcache_obj_t *)o, 0, m_maxsize(super_class));
4089 }
4090
4091 /* Reinitialize it as an mbuf or 2K or 4K slab */
4092 slab_init(sp, class, sp->sl_flags,
4093 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
4094
4095 VERIFY(sp->sl_head == NULL);
4096
4097 VERIFY(m_total(super_class) >= 1);
4098 m_total(super_class)--;
4099
4100 if (super_class == MC_BIGCL) {
4101 mbstat.m_bigclusters = m_total(MC_BIGCL);
4102 }
4103
4104 m_total(class) += numobj;
4105 VERIFY(m_total(class) <= m_maxlimit(class));
4106 m_infree(class) += numobj;
4107
4108 if (!mb_peak_newreport && mbuf_report_usage(class)) {
4109 mb_peak_newreport = TRUE;
4110 }
4111
4112 i = numobj;
4113 if (class == MC_MBUF) {
4114 mbstat.m_mbufs = m_total(MC_MBUF);
4115 mtype_stat_add(MT_FREE, NMBPG);
4116 while (i--) {
4117 /*
4118 * If auditing is enabled, construct the
4119 * shadow mbuf in the audit structure
4120 * instead of the actual one.
4121 * mbuf_slab_audit() will take care of
4122 * restoring the contents after the
4123 * integrity check.
4124 */
4125 if (mclaudit != NULL) {
4126 struct mbuf *ms;
4127 mca = mcl_audit_buf2mca(MC_MBUF,
4128 (mcache_obj_t *)m);
4129 ms = MCA_SAVED_MBUF_PTR(mca);
4130 ms->m_type = MT_FREE;
4131 } else {
4132 m->m_type = MT_FREE;
4133 }
4134 m->m_next = sp->sl_head;
4135 sp->sl_head = (void *)m++;
4136 }
4137 } else if (class == MC_CL) { /* MC_CL */
4138 mbstat.m_clfree =
4139 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
4140 mbstat.m_clusters = m_total(MC_CL);
4141 while (i--) {
4142 c->mcl_next = sp->sl_head;
4143 sp->sl_head = (void *)c++;
4144 }
4145 } else {
4146 VERIFY(class == MC_BIGCL);
4147 mbstat.m_bigclusters = m_total(MC_BIGCL);
4148 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
4149 m_infree(MC_MBUF_BIGCL);
4150 while (i--) {
4151 mbc->mbc_next = sp->sl_head;
4152 sp->sl_head = (void *)mbc++;
4153 }
4154 }
4155
4156 /* Insert into the mbuf or 2k or 4k slab list */
4157 slab_insert(sp, class);
4158
4159 if ((i = mb_waiters) > 0) {
4160 mb_waiters = 0;
4161 }
4162 if (i != 0) {
4163 mbwdog_logger("waking up all threads");
4164 wakeup(mb_waitchan);
4165 }
4166 }
4167 return count != 0;
4168 }
4169
4170 /*
4171 * For each class, initialize the freelist to hold m_minlimit() objects.
4172 */
4173 static void
4174 freelist_init(mbuf_class_t class)
4175 {
4176 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4177
4178 VERIFY(class == MC_CL || class == MC_BIGCL);
4179 VERIFY(m_total(class) == 0);
4180 VERIFY(m_minlimit(class) > 0);
4181
4182 while (m_total(class) < m_minlimit(class)) {
4183 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
4184 }
4185
4186 VERIFY(m_total(class) >= m_minlimit(class));
4187 }
4188
4189 /*
4190 * (Inaccurately) check if it might be worth a trip back to the
4191 * mcache layer due the availability of objects there. We'll
4192 * end up back here if there's nothing up there.
4193 */
4194 static boolean_t
4195 mbuf_cached_above(mbuf_class_t class, int wait)
4196 {
4197 switch (class) {
4198 case MC_MBUF:
4199 if (wait & MCR_COMP) {
4200 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
4201 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4202 }
4203 break;
4204
4205 case MC_CL:
4206 if (wait & MCR_COMP) {
4207 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
4208 }
4209 break;
4210
4211 case MC_BIGCL:
4212 if (wait & MCR_COMP) {
4213 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4214 }
4215 break;
4216
4217 case MC_16KCL:
4218 if (wait & MCR_COMP) {
4219 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
4220 }
4221 break;
4222
4223 case MC_MBUF_CL:
4224 case MC_MBUF_BIGCL:
4225 case MC_MBUF_16KCL:
4226 break;
4227
4228 default:
4229 VERIFY(0);
4230 /* NOTREACHED */
4231 }
4232
4233 return !mcache_bkt_isempty(m_cache(class));
4234 }
4235
4236 /*
4237 * If possible, convert constructed objects to raw ones.
4238 */
4239 static boolean_t
4240 mbuf_steal(mbuf_class_t class, unsigned int num)
4241 {
4242 mcache_obj_t *top = NULL;
4243 mcache_obj_t **list = ⊤
4244 unsigned int tot = 0;
4245
4246 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4247
4248 switch (class) {
4249 case MC_MBUF:
4250 case MC_CL:
4251 case MC_BIGCL:
4252 case MC_16KCL:
4253 return FALSE;
4254
4255 case MC_MBUF_CL:
4256 case MC_MBUF_BIGCL:
4257 case MC_MBUF_16KCL:
4258 /* Get the required number of constructed objects if possible */
4259 if (m_infree(class) > m_minlimit(class)) {
4260 tot = cslab_alloc(class, &list,
4261 MIN(num, m_infree(class)));
4262 }
4263
4264 /* And destroy them to get back the raw objects */
4265 if (top != NULL) {
4266 (void) cslab_free(class, top, 1);
4267 }
4268 break;
4269
4270 default:
4271 VERIFY(0);
4272 /* NOTREACHED */
4273 }
4274
4275 return tot == num;
4276 }
4277
/*
 * Reclaim memory on behalf of the given mbuf class.
 *
 * class: the class that is short on objects.
 * num:   number of objects needed.
 * comp:  when TRUE, spare the composite cache corresponding to
 *        `class' itself from being purged (the caller is refilling it).
 *
 * Marks related classes as purge candidates, first tries to steal raw
 * objects from their freelists, and only as a last resort drains the
 * protocol domains and purges/disables the per-CPU mcache layer.
 * Called and returns with mbuf_mlock held; the lock is dropped around
 * the drain/purge/reap operations.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class. If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes. Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	/* Drop the lock around the (potentially blocking) purge path */
	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < NELEM(mbuf_table); m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				/* Re-take the lock only to update statistics */
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	lck_mtx_lock(mbuf_mlock);
}
4377 #endif /* CONFIG_MBUF_MCACHE */
4378
4379 static inline struct mbuf *
4380 m_get_common(int wait, short type, int hdr)
4381 {
4382 struct mbuf *m;
4383
4384 #if CONFIG_MBUF_MCACHE
4385 int mcflags = MSLEEPF(wait);
4386
4387 /* Is this due to a non-blocking retry? If so, then try harder */
4388 if (mcflags & MCR_NOSLEEP) {
4389 mcflags |= MCR_TRYHARD;
4390 }
4391
4392 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
4393 #else
4394 m = mz_alloc(wait);
4395 #endif /* CONFIG_MBUF_MCACHE */
4396 if (m != NULL) {
4397 MBUF_INIT(m, hdr, type);
4398 mtype_stat_inc(type);
4399 mtype_stat_dec(MT_FREE);
4400 }
4401 return m;
4402 }
4403
4404 /*
4405 * Space allocation routines; these are also available as macros
4406 * for critical paths.
4407 */
4408 #define _M_GET(wait, type) m_get_common(wait, type, 0)
4409 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
4410 #define _M_RETRY(wait, type) _M_GET(wait, type)
4411 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
4412 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
4413 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
4414
/* Allocate a plain (non-header) mbuf of the given type. */
struct mbuf *
m_get(int wait, int type)
{
	return m_get_common(wait, type, 0);
}
4420
/* Allocate a packet-header mbuf of the given type. */
struct mbuf *
m_gethdr(int wait, int type)
{
	return m_get_common(wait, type, 1);
}
4426
/* Historical alias for m_get(); retries go through the same path. */
struct mbuf *
m_retry(int wait, int type)
{
	return m_get_common(wait, type, 0);
}
4432
/* Historical alias for m_gethdr(); retries go through the same path. */
struct mbuf *
m_retryhdr(int wait, int type)
{
	return m_get_common(wait, type, 1);
}
4438
4439 struct mbuf *
4440 m_getclr(int wait, int type)
4441 {
4442 struct mbuf *m;
4443
4444 _MGET(m, wait, type);
4445 if (m != NULL) {
4446 bzero(MTOD(m, caddr_t), MLEN);
4447 }
4448 return m;
4449 }
4450
/*
 * Drop one reference on a paired mbuf/cluster (EXTF_PAIRED).
 *
 * Returns 1 when the caller must NOT proceed to free the mbuf and
 * cluster (either outstanding paired references remain, or the pair's
 * free routine was invoked and consumed them).  Returns 0 once the
 * unpair has occurred, telling the caller to drop the reference that
 * was held on the external cluster for the paired mbuf.
 */
static int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	/* Full fence so the MEXT_PMBUF load below observes prior stores */
	os_atomic_thread_fence(seq_cst);
	if (MEXT_PMBUF(m) == m) {
		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
		if (prefcnt > 1) {
			/* Other paired references still outstanding */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired reference: run the pair's free routine */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable. atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			os_atomic_store(&MEXT_PMBUF(m), NULL, release);

			/* Reinstall the standard free routine for the size */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
4517
/*
 * Free a single mbuf, releasing packet-header metadata and any
 * attached external storage, and return the next mbuf in the chain
 * (m->m_next), which remains the caller's responsibility.
 * Panics if the mbuf is already marked MT_FREE.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* Paired mbufs have their own teardown protocol */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/*
			 * Last reference on a non-composite cluster:
			 * return the buffer and its ext_ref to their
			 * cache/zone (NULL free func means 2K cluster).
			 */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
#else
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mz_ref_free(m_get_rfa(m));
#endif /* CONFIG_MBUF_MCACHE */
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			/*
			 * Last reference on a composite mbuf+cluster:
			 * reset it and return the pair to the
			 * intermediate cache intact.
			 */
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

#if CONFIG_MBUF_MCACHE
			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
#else
			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
#endif /* CONFIG_MBUF_MCACHE */
			return n;
		}
	}

	/* Mark the bare mbuf free and return it to its cache/zone */
	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_MBUF), m);
#else
	mz_free(m);
#endif /* CONFIG_MBUF_MCACHE */

	return n;
}
4640
/*
 * Attach an externally-managed buffer to an mbuf.
 *
 * m:       optional existing mbuf; when NULL a new packet header mbuf
 *          is allocated.  Any external storage already attached to a
 *          supplied mbuf is released first.
 * extbuf/extfree/extsize/extarg: the external buffer, its free
 *          routine, size and free-routine argument.
 * wait:    blocking disposition for any allocations performed here.
 * pair:    when non-zero, pair the mbuf with the cluster (EXTF_PAIRED).
 *
 * Returns the mbuf with the buffer attached, or NULL on failure
 * (including when a supplied mbuf is already paired to a cluster).
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster. Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/*
			 * Last reference on a non-composite cluster:
			 * free the old buffer but keep the ext_ref for
			 * re-use with the new attachment below.
			 */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
#else
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
#endif /* CONFIG_MBUF_MCACHE */
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			/*
			 * Composite mbuf+cluster: return the whole pair to
			 * its intermediate cache and allocate a fresh mbuf.
			 */
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
#else
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
#endif /* CONFIG_MBUF_MCACHE */
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/* Allocate an ext_ref unless one was salvaged above */
#if CONFIG_MBUF_MCACHE
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}
#else
	if (rfa == NULL &&
	    (rfa = mz_ref_alloc(wait)) == NULL) {
		m_free(m);
		return NULL;
	}
#endif /* CONFIG_MBUF_MCACHE */

	if (!pair) {
		MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
4771
4772 /*
4773 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
4774 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
4775 */
4776 struct mbuf *
4777 m_getcl(int wait, int type, int flags)
4778 {
4779 struct mbuf *m = NULL;
4780 int hdr = (flags & M_PKTHDR);
4781
4782 #if CONFIG_MBUF_MCACHE
4783 int mcflags = MSLEEPF(wait);
4784
4785 /* Is this due to a non-blocking retry? If so, then try harder */
4786 if (mcflags & MCR_NOSLEEP) {
4787 mcflags |= MCR_TRYHARD;
4788 }
4789
4790 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4791 #else
4792 m = mz_composite_alloc(MC_MBUF_CL, wait);
4793 #endif /* CONFIG_MBUF_MCACHE */
4794 if (m != NULL) {
4795 u_int16_t flag;
4796 struct ext_ref *rfa;
4797 void *cl;
4798
4799 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4800 cl = m->m_ext.ext_buf;
4801 rfa = m_get_rfa(m);
4802
4803 ASSERT(cl != NULL && rfa != NULL);
4804 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4805
4806 flag = MEXT_FLAGS(m);
4807
4808 MBUF_INIT(m, hdr, type);
4809 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4810
4811 mtype_stat_inc(type);
4812 mtype_stat_dec(MT_FREE);
4813 }
4814 return m;
4815 }
4816
4817 /* m_mclget() add an mbuf cluster to a normal mbuf */
4818 struct mbuf *
4819 m_mclget(struct mbuf *m, int wait)
4820 {
4821 struct ext_ref *rfa = NULL;
4822
4823 #if CONFIG_MBUF_MCACHE
4824 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4825 return m;
4826 }
4827 #else
4828 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4829 return m;
4830 }
4831 #endif /* CONFIG_MBUF_MCACHE */
4832 m->m_ext.ext_buf = m_mclalloc(wait);
4833 if (m->m_ext.ext_buf != NULL) {
4834 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4835 } else {
4836 #if CONFIG_MBUF_MCACHE
4837 mcache_free(ref_cache, rfa);
4838 #else
4839 mz_ref_free(rfa);
4840 #endif /* CONFIG_MBUF_MCACHE */
4841 }
4842
4843 return m;
4844 }
4845
4846 /* Allocate an mbuf cluster */
4847 caddr_t
4848 m_mclalloc(int wait)
4849 {
4850 #if CONFIG_MBUF_MCACHE
4851 int mcflags = MSLEEPF(wait);
4852
4853 /* Is this due to a non-blocking retry? If so, then try harder */
4854 if (mcflags & MCR_NOSLEEP) {
4855 mcflags |= MCR_TRYHARD;
4856 }
4857
4858 return mcache_alloc(m_cache(MC_CL), mcflags);
4859 #else
4860 return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
4861 #endif /* CONFIG_MBUF_MCACHE */
4862 }
4863
/* Free an mbuf cluster (2K) back to its cache or zone. */
void
m_mclfree(caddr_t p)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_CL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_2K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4874
4875 /*
4876 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
4877 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4878 */
4879 int
4880 m_mclhasreference(struct mbuf *m)
4881 {
4882 if (!(m->m_flags & M_EXT)) {
4883 return 0;
4884 }
4885
4886 ASSERT(m_get_rfa(m) != NULL);
4887
4888 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4889 }
4890
4891 __private_extern__ caddr_t
4892 m_bigalloc(int wait)
4893 {
4894 #if CONFIG_MBUF_MCACHE
4895 int mcflags = MSLEEPF(wait);
4896
4897 /* Is this due to a non-blocking retry? If so, then try harder */
4898 if (mcflags & MCR_NOSLEEP) {
4899 mcflags |= MCR_TRYHARD;
4900 }
4901
4902 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4903 #else
4904 return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
4905 #endif /* CONFIG_MBUF_MCACHE */
4906 }
4907
/* Free a 4K mbuf cluster; size/arg are unused for pool clusters. */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_BIGCL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_4K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4917
4918 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
4919 __private_extern__ struct mbuf *
4920 m_mbigget(struct mbuf *m, int wait)
4921 {
4922 struct ext_ref *rfa = NULL;
4923
4924 #if CONFIG_MBUF_MCACHE
4925 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4926 return m;
4927 }
4928 #else
4929 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4930 return m;
4931 }
4932 #endif /* CONFIG_MBUF_MCACHE */
4933 m->m_ext.ext_buf = m_bigalloc(wait);
4934 if (m->m_ext.ext_buf != NULL) {
4935 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4936 } else {
4937 #if CONFIG_MBUF_MCACHE
4938 mcache_free(ref_cache, rfa);
4939 #else
4940 mz_ref_free(rfa);
4941 #endif /* CONFIG_MBUF_MCACHE */
4942 }
4943 return m;
4944 }
4945
4946 __private_extern__ caddr_t
4947 m_16kalloc(int wait)
4948 {
4949 #if CONFIG_MBUF_MCACHE
4950 int mcflags = MSLEEPF(wait);
4951
4952 /* Is this due to a non-blocking retry? If so, then try harder */
4953 if (mcflags & MCR_NOSLEEP) {
4954 mcflags |= MCR_TRYHARD;
4955 }
4956
4957 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4958 #else
4959 return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
4960 #endif /* CONFIG_MBUF_MCACHE */
4961 }
4962
/* Free a 16K mbuf cluster; size/arg are unused for pool clusters. */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_16KCL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_16K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4972
/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;

	/*
	 * Grab an ext_ref first; without one the cluster cannot be
	 * attached.  On failure the mbuf is returned unmodified.
	 */
#if CONFIG_MBUF_MCACHE
	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		return m;
	}
#else
	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}
#endif /* CONFIG_MBUF_MCACHE */
	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		/* Attach the 16KB cluster: initial ref count 1, no flags. */
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* Cluster allocation failed; release the ext_ref. */
#if CONFIG_MBUF_MCACHE
		mcache_free(ref_cache, rfa);
#else
		mz_ref_free(rfa);
#endif /* CONFIG_MBUF_MCACHE */
	}

	return m;
}
5001
5002 /*
5003 * "Move" mbuf pkthdr from "from" to "to".
5004 * "from" must have M_PKTHDR set, and "to" must be empty.
5005 */
5006 void
5007 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
5008 {
5009 VERIFY(from->m_flags & M_PKTHDR);
5010
5011 /* Check for scratch area overflow */
5012 m_redzone_verify(from);
5013
5014 if (to->m_flags & M_PKTHDR) {
5015 /* Check for scratch area overflow */
5016 m_redzone_verify(to);
5017 /* We will be taking over the tags of 'to' */
5018 m_tag_delete_chain(to);
5019 }
5020 to->m_pkthdr = from->m_pkthdr; /* especially tags */
5021 m_classifier_init(from, 0); /* purge classifier info */
5022 m_tag_init(from, 1); /* purge all tags from src */
5023 m_scratch_init(from); /* clear src scratch area */
5024 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5025 if ((to->m_flags & M_EXT) == 0) {
5026 to->m_data = to->m_pktdat;
5027 }
5028 m_redzone_init(to); /* setup red zone on dst */
5029 }
5030
5031 /*
5032 * Duplicate "from"'s mbuf pkthdr in "to".
5033 * "from" must have M_PKTHDR set, and "to" must be empty.
5034 * In particular, this does a deep copy of the packet tags.
5035 */
5036 int
5037 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
5038 {
5039 VERIFY(from->m_flags & M_PKTHDR);
5040
5041 /* Check for scratch area overflow */
5042 m_redzone_verify(from);
5043
5044 if (to->m_flags & M_PKTHDR) {
5045 /* Check for scratch area overflow */
5046 m_redzone_verify(to);
5047 /* We will be taking over the tags of 'to' */
5048 m_tag_delete_chain(to);
5049 }
5050 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5051 if ((to->m_flags & M_EXT) == 0) {
5052 to->m_data = to->m_pktdat;
5053 }
5054 to->m_pkthdr = from->m_pkthdr;
5055 /* clear TX completion flag so the callback is not called in the copy */
5056 to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
5057 m_redzone_init(to); /* setup red zone on dst */
5058 m_tag_init(to, 0); /* preserve dst static tags */
5059 return m_tag_copy_chain(to, from, how);
5060 }
5061
5062 void
5063 m_copy_pftag(struct mbuf *to, struct mbuf *from)
5064 {
5065 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
5066 #if PF_ECN
5067 m_pftag(to)->pftag_hdr = NULL;
5068 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
5069 #endif /* PF_ECN */
5070 }
5071
5072 void
5073 m_copy_necptag(struct mbuf *to, struct mbuf *from)
5074 {
5075 memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
5076 }
5077
5078 void
5079 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
5080 {
5081 VERIFY(m->m_flags & M_PKTHDR);
5082
5083 m->m_pkthdr.pkt_proto = 0;
5084 m->m_pkthdr.pkt_flowsrc = 0;
5085 m->m_pkthdr.pkt_flowid = 0;
5086 m->m_pkthdr.pkt_ext_flags = 0;
5087 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
5088 /* preserve service class and interface info for loopback packets */
5089 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
5090 (void) m_set_service_class(m, MBUF_SC_BE);
5091 }
5092 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
5093 m->m_pkthdr.pkt_ifainfo = 0;
5094 }
5095 /*
5096 * Preserve timestamp if requested
5097 */
5098 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
5099 m->m_pkthdr.pkt_timestamp = 0;
5100 }
5101 }
5102
5103 void
5104 m_copy_classifier(struct mbuf *to, struct mbuf *from)
5105 {
5106 VERIFY(to->m_flags & M_PKTHDR);
5107 VERIFY(from->m_flags & M_PKTHDR);
5108
5109 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
5110 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
5111 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
5112 to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
5113 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
5114 to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
5115 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
5116 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
5117 }
5118
5119 /*
5120 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5121 * if wantall is not set, return whatever number were available. Set up the
5122 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
5123 * are chained on the m_nextpkt field. Any packets requested beyond this
5124 * are chained onto the last packet header's m_next field. The size of
5125 * the cluster is controlled by the parameter bufsize.
5126 */
5127 __private_extern__ struct mbuf *
5128 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
5129 int wait, int wantall, size_t bufsize)
5130 {
5131 struct mbuf *m = NULL;
5132 struct mbuf **np, *top;
5133 unsigned int pnum, needed = *num_needed;
5134 #if CONFIG_MBUF_MCACHE
5135 mcache_obj_t *mp_list = NULL;
5136 int mcflags = MSLEEPF(wait);
5137 mcache_t *cp;
5138 #else
5139 zstack_t mp_list = {};
5140 mbuf_class_t class = MC_MBUF_CL;
5141 #endif /* CONFIG_MBUF_MCACHE */
5142 u_int16_t flag;
5143 struct ext_ref *rfa;
5144 void *cl;
5145
5146 ASSERT(bufsize == m_maxsize(MC_CL) ||
5147 bufsize == m_maxsize(MC_BIGCL) ||
5148 bufsize == m_maxsize(MC_16KCL));
5149
5150 /*
5151 * Caller must first check for njcl because this
5152 * routine is internal and not exposed/used via KPI.
5153 */
5154 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
5155
5156 top = NULL;
5157 np = ⊤
5158 pnum = 0;
5159
5160 /*
5161 * The caller doesn't want all the requested buffers; only some.
5162 * Try hard to get what we can, but don't block. This effectively
5163 * overrides MCR_SLEEP, since this thread will not go to sleep
5164 * if we can't get all the buffers.
5165 */
5166 #if CONFIG_MBUF_MCACHE
5167 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5168 mcflags |= MCR_TRYHARD;
5169 }
5170
5171 /* Allocate the composite mbuf + cluster elements from the cache */
5172 if (bufsize == m_maxsize(MC_CL)) {
5173 cp = m_cache(MC_MBUF_CL);
5174 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5175 cp = m_cache(MC_MBUF_BIGCL);
5176 } else {
5177 cp = m_cache(MC_MBUF_16KCL);
5178 }
5179 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
5180 #else
5181 if (!wantall || (wait & Z_NOWAIT)) {
5182 wait &= ~Z_NOWAIT;
5183 wait |= Z_NOPAGEWAIT;
5184 }
5185
5186 /* Allocate the composite mbuf + cluster elements from the cache */
5187 if (bufsize == m_maxsize(MC_CL)) {
5188 class = MC_MBUF_CL;
5189 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5190 class = MC_MBUF_BIGCL;
5191 } else {
5192 class = MC_MBUF_16KCL;
5193 }
5194 mp_list = mz_composite_alloc_n(class, needed, wait);
5195 needed = zstack_count(mp_list);
5196 #endif /* CONFIG_MBUF_MCACHE */
5197
5198 for (pnum = 0; pnum < needed; pnum++) {
5199 #if CONFIG_MBUF_MCACHE
5200 m = (struct mbuf *)mp_list;
5201 mp_list = mp_list->obj_next;
5202 #else
5203 m = zstack_pop(&mp_list);
5204 #endif /* CONFIG_MBUF_MCACHE */
5205
5206 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5207 cl = m->m_ext.ext_buf;
5208 rfa = m_get_rfa(m);
5209
5210 ASSERT(cl != NULL && rfa != NULL);
5211 VERIFY(MBUF_IS_COMPOSITE(m));
5212
5213 flag = MEXT_FLAGS(m);
5214
5215 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
5216 if (bufsize == m_maxsize(MC_16KCL)) {
5217 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5218 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5219 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5220 } else {
5221 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5222 }
5223
5224 if (num_with_pkthdrs > 0) {
5225 --num_with_pkthdrs;
5226 }
5227
5228 *np = m;
5229 if (num_with_pkthdrs > 0) {
5230 np = &m->m_nextpkt;
5231 } else {
5232 np = &m->m_next;
5233 }
5234 }
5235 #if CONFIG_MBUF_MCACHE
5236 ASSERT(pnum != *num_needed || mp_list == NULL);
5237 if (mp_list != NULL) {
5238 mcache_free_ext(cp, mp_list);
5239 }
5240 #else
5241 ASSERT(pnum != *num_needed || zstack_empty(mp_list));
5242 if (!zstack_empty(mp_list)) {
5243 mz_composite_free_n(class, mp_list);
5244 }
5245 #endif /* CONFIG_MBUF_MCACHE */
5246 if (pnum > 0) {
5247 mtype_stat_add(MT_DATA, pnum);
5248 mtype_stat_sub(MT_FREE, pnum);
5249 }
5250
5251 if (wantall && (pnum != *num_needed)) {
5252 if (top != NULL) {
5253 m_freem_list(top);
5254 }
5255 return NULL;
5256 }
5257
5258 if (pnum > *num_needed) {
5259 printf("%s: File a radar related to <rdar://10146739>. \
5260 needed = %u, pnum = %u, num_needed = %u \n",
5261 __func__, needed, pnum, *num_needed);
5262 }
5263 *num_needed = pnum;
5264
5265 return top;
5266 }
5267
5268 /*
5269 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
5270 * wantall is not set, return whatever number were available. The size of
5271 * each mbuf in the list is controlled by the parameter packetlen. Each
5272 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
5273 * in the chain is called a segment. If maxsegments is not null and the
5274 * value pointed to is not null, this specify the maximum number of segments
5275 * for a chain of mbufs. If maxsegments is zero or the value pointed to
5276 * is zero the caller does not have any restriction on the number of segments.
5277 * The actual number of segments of a mbuf chain is return in the value
5278 * pointed to by maxsegments.
5279 */
5280 __private_extern__ struct mbuf *
5281 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
5282 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
5283 {
5284 struct mbuf **np, *top, *first = NULL;
5285 size_t bufsize, r_bufsize;
5286 unsigned int num = 0;
5287 unsigned int nsegs = 0;
5288 unsigned int needed = 0, resid;
5289 #if CONFIG_MBUF_MCACHE
5290 int mcflags = MSLEEPF(wait);
5291 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
5292 mcache_t *cp = NULL, *rcp = NULL;
5293 #else
5294 zstack_t mp_list = {}, rmp_list = {};
5295 mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
5296 #endif /* CONFIG_MBUF_MCACHE */
5297
5298 if (*numlist == 0) {
5299 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
5300 return NULL;
5301 }
5302
5303 top = NULL;
5304 np = ⊤
5305
5306 if (wantsize == 0) {
5307 if (packetlen <= MINCLSIZE) {
5308 bufsize = packetlen;
5309 } else if (packetlen > m_maxsize(MC_CL)) {
5310 /* Use 4KB if jumbo cluster pool isn't available */
5311 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
5312 bufsize = m_maxsize(MC_BIGCL);
5313 } else {
5314 bufsize = m_maxsize(MC_16KCL);
5315 }
5316 } else {
5317 bufsize = m_maxsize(MC_CL);
5318 }
5319 } else if (wantsize == m_maxsize(MC_CL) ||
5320 wantsize == m_maxsize(MC_BIGCL) ||
5321 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
5322 bufsize = wantsize;
5323 } else {
5324 *numlist = 0;
5325 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
5326 return NULL;
5327 }
5328
5329 if (bufsize <= MHLEN) {
5330 nsegs = 1;
5331 } else if (bufsize <= MINCLSIZE) {
5332 if (maxsegments != NULL && *maxsegments == 1) {
5333 bufsize = m_maxsize(MC_CL);
5334 nsegs = 1;
5335 } else {
5336 nsegs = 2;
5337 }
5338 } else if (bufsize == m_maxsize(MC_16KCL)) {
5339 VERIFY(njcl > 0);
5340 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
5341 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5342 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
5343 } else {
5344 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
5345 }
5346 if (maxsegments != NULL) {
5347 if (*maxsegments && nsegs > *maxsegments) {
5348 *maxsegments = nsegs;
5349 *numlist = 0;
5350 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
5351 return NULL;
5352 }
5353 *maxsegments = nsegs;
5354 }
5355
5356 /*
5357 * The caller doesn't want all the requested buffers; only some.
5358 * Try hard to get what we can, but don't block. This effectively
5359 * overrides MCR_SLEEP, since this thread will not go to sleep
5360 * if we can't get all the buffers.
5361 */
5362 #if CONFIG_MBUF_MCACHE
5363 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5364 mcflags |= MCR_TRYHARD;
5365 }
5366 #else
5367 if (!wantall || (wait & Z_NOWAIT)) {
5368 wait &= ~Z_NOWAIT;
5369 wait |= Z_NOPAGEWAIT;
5370 }
5371 #endif /* !CONFIG_MBUF_MCACHE */
5372
5373 /*
5374 * Simple case where all elements in the lists/chains are mbufs.
5375 * Unless bufsize is greater than MHLEN, each segment chain is made
5376 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
5377 * of 2 mbufs; the second one is used for the residual data, i.e.
5378 * the remaining data that cannot fit into the first mbuf.
5379 */
5380 if (bufsize <= MINCLSIZE) {
5381 /* Allocate the elements in one shot from the mbuf cache */
5382 ASSERT(bufsize <= MHLEN || nsegs == 2);
5383 #if CONFIG_MBUF_MCACHE
5384 cp = m_cache(MC_MBUF);
5385 needed = mcache_alloc_ext(cp, &mp_list,
5386 (*numlist) * nsegs, mcflags);
5387 #else
5388 class = MC_MBUF;
5389 mp_list = mz_alloc_n((*numlist) * nsegs, wait);
5390 needed = zstack_count(mp_list);
5391 #endif /* CONFIG_MBUF_MCACHE */
5392
5393 /*
5394 * The number of elements must be even if we are to use an
5395 * mbuf (instead of a cluster) to store the residual data.
5396 * If we couldn't allocate the requested number of mbufs,
5397 * trim the number down (if it's odd) in order to avoid
5398 * creating a partial segment chain.
5399 */
5400 if (bufsize > MHLEN && (needed & 0x1)) {
5401 needed--;
5402 }
5403
5404 while (num < needed) {
5405 struct mbuf *m = NULL;
5406
5407 #if CONFIG_MBUF_MCACHE
5408 m = (struct mbuf *)mp_list;
5409 mp_list = mp_list->obj_next;
5410 #else
5411 m = zstack_pop(&mp_list);
5412 #endif /* CONFIG_MBUF_MCACHE */
5413 ASSERT(m != NULL);
5414
5415 MBUF_INIT(m, 1, MT_DATA);
5416 num++;
5417 if (bufsize > MHLEN) {
5418 /* A second mbuf for this segment chain */
5419 #if CONFIG_MBUF_MCACHE
5420 m->m_next = (struct mbuf *)mp_list;
5421 mp_list = mp_list->obj_next;
5422 #else
5423 m->m_next = zstack_pop(&mp_list);
5424 #endif /* CONFIG_MBUF_MCACHE */
5425
5426 ASSERT(m->m_next != NULL);
5427
5428 MBUF_INIT(m->m_next, 0, MT_DATA);
5429 num++;
5430 }
5431 *np = m;
5432 np = &m->m_nextpkt;
5433 }
5434 #if CONFIG_MBUF_MCACHE
5435 ASSERT(num != *numlist || mp_list == NULL);
5436 #else
5437 ASSERT(num != *numlist || zstack_empty(mp_list));
5438 #endif /* CONFIG_MBUF_MCACHE */
5439
5440 if (num > 0) {
5441 mtype_stat_add(MT_DATA, num);
5442 mtype_stat_sub(MT_FREE, num);
5443 }
5444 num /= nsegs;
5445
5446 /* We've got them all; return to caller */
5447 if (num == *numlist) {
5448 return top;
5449 }
5450
5451 goto fail;
5452 }
5453
5454 /*
5455 * Complex cases where elements are made up of one or more composite
5456 * mbufs + cluster, depending on packetlen. Each N-segment chain can
5457 * be illustrated as follows:
5458 *
5459 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
5460 *
5461 * Every composite mbuf + cluster element comes from the intermediate
5462 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
5463 * the last composite element will come from the MC_MBUF_CL cache,
5464 * unless the residual data is larger than 2KB where we use the
5465 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
5466 * data is defined as extra data beyond the first element that cannot
5467 * fit into the previous element, i.e. there is no residual data if
5468 * the chain only has 1 segment.
5469 */
5470 r_bufsize = bufsize;
5471 resid = packetlen > bufsize ? packetlen % bufsize : 0;
5472 if (resid > 0) {
5473 /* There is residual data; figure out the cluster size */
5474 if (wantsize == 0 && packetlen > MINCLSIZE) {
5475 /*
5476 * Caller didn't request that all of the segments
5477 * in the chain use the same cluster size; use the
5478 * smaller of the cluster sizes.
5479 */
5480 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
5481 r_bufsize = m_maxsize(MC_16KCL);
5482 } else if (resid > m_maxsize(MC_CL)) {
5483 r_bufsize = m_maxsize(MC_BIGCL);
5484 } else {
5485 r_bufsize = m_maxsize(MC_CL);
5486 }
5487 } else {
5488 /* Use the same cluster size as the other segments */
5489 resid = 0;
5490 }
5491 }
5492
5493 needed = *numlist;
5494 if (resid > 0) {
5495 /*
5496 * Attempt to allocate composite mbuf + cluster elements for
5497 * the residual data in each chain; record the number of such
5498 * elements that can be allocated so that we know how many
5499 * segment chains we can afford to create.
5500 */
5501 #if CONFIG_MBUF_MCACHE
5502 if (r_bufsize <= m_maxsize(MC_CL)) {
5503 rcp = m_cache(MC_MBUF_CL);
5504 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5505 rcp = m_cache(MC_MBUF_BIGCL);
5506 } else {
5507 rcp = m_cache(MC_MBUF_16KCL);
5508 }
5509 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
5510 #else
5511 if (r_bufsize <= m_maxsize(MC_CL)) {
5512 rclass = MC_MBUF_CL;
5513 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5514 rclass = MC_MBUF_BIGCL;
5515 } else {
5516 rclass = MC_MBUF_16KCL;
5517 }
5518 rmp_list = mz_composite_alloc_n(rclass, *numlist, wait);
5519 needed = zstack_count(rmp_list);
5520 #endif /* CONFIG_MBUF_MCACHE */
5521 if (needed == 0) {
5522 goto fail;
5523 }
5524
5525 /* This is temporarily reduced for calculation */
5526 ASSERT(nsegs > 1);
5527 nsegs--;
5528 }
5529
5530 /*
5531 * Attempt to allocate the rest of the composite mbuf + cluster
5532 * elements for the number of segment chains that we need.
5533 */
5534 #if CONFIG_MBUF_MCACHE
5535 if (bufsize <= m_maxsize(MC_CL)) {
5536 cp = m_cache(MC_MBUF_CL);
5537 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5538 cp = m_cache(MC_MBUF_BIGCL);
5539 } else {
5540 cp = m_cache(MC_MBUF_16KCL);
5541 }
5542 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
5543 #else
5544 if (bufsize <= m_maxsize(MC_CL)) {
5545 class = MC_MBUF_CL;
5546 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5547 class = MC_MBUF_BIGCL;
5548 } else {
5549 class = MC_MBUF_16KCL;
5550 }
5551 mp_list = mz_composite_alloc_n(class, needed * nsegs, wait);
5552 needed = zstack_count(mp_list);
5553 #endif /* CONFIG_MBUF_MCACHE */
5554
5555 /* Round it down to avoid creating a partial segment chain */
5556 needed = (needed / nsegs) * nsegs;
5557 if (needed == 0) {
5558 goto fail;
5559 }
5560
5561 if (resid > 0) {
5562 /*
5563 * We're about to construct the chain(s); take into account
5564 * the number of segments we have created above to hold the
5565 * residual data for each chain, as well as restore the
5566 * original count of segments per chain.
5567 */
5568 ASSERT(nsegs > 0);
5569 needed += needed / nsegs;
5570 nsegs++;
5571 }
5572
5573 for (;;) {
5574 struct mbuf *m = NULL;
5575 u_int16_t flag;
5576 struct ext_ref *rfa;
5577 void *cl;
5578 int pkthdr;
5579 m_ext_free_func_t m_free_func;
5580
5581 ++num;
5582
5583 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
5584 #if CONFIG_MBUF_MCACHE
5585 m = (struct mbuf *)mp_list;
5586 mp_list = mp_list->obj_next;
5587 #else
5588 m = zstack_pop(&mp_list);
5589 #endif /* CONFIG_MBUF_MCACHE */
5590 } else {
5591 #if CONFIG_MBUF_MCACHE
5592 m = (struct mbuf *)rmp_list;
5593 rmp_list = rmp_list->obj_next;
5594 #else
5595 m = zstack_pop(&rmp_list);
5596 #endif /* CONFIG_MBUF_MCACHE */
5597 }
5598 m_free_func = m_get_ext_free(m);
5599 ASSERT(m != NULL);
5600 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5601 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
5602 m_free_func == m_16kfree);
5603
5604 cl = m->m_ext.ext_buf;
5605 rfa = m_get_rfa(m);
5606
5607 ASSERT(cl != NULL && rfa != NULL);
5608 VERIFY(MBUF_IS_COMPOSITE(m));
5609
5610 flag = MEXT_FLAGS(m);
5611
5612 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
5613 if (pkthdr) {
5614 first = m;
5615 }
5616 MBUF_INIT(m, pkthdr, MT_DATA);
5617 if (m_free_func == m_16kfree) {
5618 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5619 } else if (m_free_func == m_bigfree) {
5620 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5621 } else {
5622 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5623 }
5624
5625 *np = m;
5626 if ((num % nsegs) == 0) {
5627 np = &first->m_nextpkt;
5628 } else {
5629 np = &m->m_next;
5630 }
5631
5632 if (num == needed) {
5633 break;
5634 }
5635 }
5636
5637 if (num > 0) {
5638 mtype_stat_add(MT_DATA, num);
5639 mtype_stat_sub(MT_FREE, num);
5640 }
5641
5642 num /= nsegs;
5643
5644 /* We've got them all; return to caller */
5645 if (num == *numlist) {
5646 #if CONFIG_MBUF_MCACHE
5647 ASSERT(mp_list == NULL && rmp_list == NULL);
5648 #else
5649 ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
5650 #endif /* CONFIG_MBUF_MCACHE */
5651 return top;
5652 }
5653
5654 fail:
5655 /* Free up what's left of the above */
5656 #if CONFIG_MBUF_MCACHE
5657 if (mp_list != NULL) {
5658 mcache_free_ext(cp, mp_list);
5659 }
5660 if (rmp_list != NULL) {
5661 mcache_free_ext(rcp, rmp_list);
5662 }
5663 #else
5664 if (!zstack_empty(mp_list)) {
5665 if (class == MC_MBUF) {
5666 /* No need to elide, these mbufs came from the cache. */
5667 mz_free_n(mp_list);
5668 } else {
5669 mz_composite_free_n(class, mp_list);
5670 }
5671 }
5672 if (!zstack_empty(rmp_list)) {
5673 mz_composite_free_n(rclass, rmp_list);
5674 }
5675 #endif /* CONFIG_MBUF_MCACHE */
5676 if (wantall && top != NULL) {
5677 m_freem_list(top);
5678 *numlist = 0;
5679 return NULL;
5680 }
5681 *numlist = num;
5682 return top;
5683 }
5684
5685 /*
5686 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
5687 * packets on receive ring.
5688 */
5689 __private_extern__ struct mbuf *
5690 m_getpacket_how(int wait)
5691 {
5692 unsigned int num_needed = 1;
5693
5694 return m_getpackets_internal(&num_needed, 1, wait, 1,
5695 m_maxsize(MC_CL));
5696 }
5697
5698 /*
5699 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
5700 * packets on receive ring.
5701 */
5702 struct mbuf *
5703 m_getpacket(void)
5704 {
5705 unsigned int num_needed = 1;
5706
5707 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
5708 m_maxsize(MC_CL));
5709 }
5710
5711 /*
5712 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5713 * if this can't be met, return whatever number were available. Set up the
5714 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
5715 * are chained on the m_nextpkt field. Any packets requested beyond this are
5716 * chained onto the last packet header's m_next field.
5717 */
5718 struct mbuf *
5719 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
5720 {
5721 unsigned int n = num_needed;
5722
5723 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
5724 m_maxsize(MC_CL));
5725 }
5726
5727 /*
5728 * Return a list of mbuf hdrs set up as packet hdrs chained together
5729 * on the m_nextpkt field
5730 */
5731 struct mbuf *
5732 m_getpackethdrs(int num_needed, int how)
5733 {
5734 struct mbuf *m;
5735 struct mbuf **np, *top;
5736
5737 top = NULL;
5738 np = ⊤
5739
5740 while (num_needed--) {
5741 m = _M_RETRYHDR(how, MT_DATA);
5742 if (m == NULL) {
5743 break;
5744 }
5745
5746 *np = m;
5747 np = &m->m_nextpkt;
5748 }
5749
5750 return top;
5751 }
5752
5753 /*
5754 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
5755 * for mbufs packets freed. Used by the drivers.
5756 */
5757 int
5758 m_freem_list(struct mbuf *m)
5759 {
5760 struct mbuf *nextpkt;
5761 #if CONFIG_MBUF_MCACHE
5762 mcache_obj_t *mp_list = NULL;
5763 mcache_obj_t *mcl_list = NULL;
5764 mcache_obj_t *mbc_list = NULL;
5765 mcache_obj_t *m16k_list = NULL;
5766 mcache_obj_t *m_mcl_list = NULL;
5767 mcache_obj_t *m_mbc_list = NULL;
5768 mcache_obj_t *m_m16k_list = NULL;
5769 mcache_obj_t *ref_list = NULL;
5770 #else
5771 zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
5772 m16k_list = {}, m_mcl_list = {},
5773 m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
5774 #endif /* CONFIG_MBUF_MCACHE */
5775 int pktcount = 0;
5776 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
5777
5778 while (m != NULL) {
5779 pktcount++;
5780
5781 nextpkt = m->m_nextpkt;
5782 m->m_nextpkt = NULL;
5783
5784 while (m != NULL) {
5785 struct mbuf *next = m->m_next;
5786 #if CONFIG_MBUF_MCACHE
5787 mcache_obj_t *o, *rfa;
5788 #else
5789 void *cl = NULL;
5790 #endif /* CONFIG_MBUF_MCACHE */
5791 if (m->m_type == MT_FREE) {
5792 panic("m_free: freeing an already freed mbuf");
5793 }
5794
5795 if (m->m_flags & M_PKTHDR) {
5796 /* Check for scratch area overflow */
5797 m_redzone_verify(m);
5798 /* Free the aux data and tags if there is any */
5799 m_tag_delete_chain(m);
5800 m_do_tx_compl_callback(m, NULL);
5801 }
5802
5803 if (!(m->m_flags & M_EXT)) {
5804 mt_free++;
5805 goto simple_free;
5806 }
5807
5808 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
5809 m = next;
5810 continue;
5811 }
5812
5813 mt_free++;
5814
5815 #if CONFIG_MBUF_MCACHE
5816 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
5817 #else
5818 cl = m->m_ext.ext_buf;
5819 #endif /* CONFIG_MBUF_MCACHE */
5820 /*
5821 * Make sure that we don't touch any ext_ref
5822 * member after we decrement the reference count
5823 * since that may lead to use-after-free
5824 * when we do not hold the last reference.
5825 */
5826 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
5827 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
5828 const uint16_t minref = MEXT_MINREF(m);
5829 const uint16_t refcnt = m_decref(m);
5830 if (refcnt == minref && !composite) {
5831 #if CONFIG_MBUF_MCACHE
5832 if (m_free_func == NULL) {
5833 o->obj_next = mcl_list;
5834 mcl_list = o;
5835 } else if (m_free_func == m_bigfree) {
5836 o->obj_next = mbc_list;
5837 mbc_list = o;
5838 } else if (m_free_func == m_16kfree) {
5839 o->obj_next = m16k_list;
5840 m16k_list = o;
5841 } else {
5842 (*(m_free_func))((caddr_t)o,
5843 m->m_ext.ext_size,
5844 m_get_ext_arg(m));
5845 }
5846 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
5847 rfa->obj_next = ref_list;
5848 ref_list = rfa;
5849 #else
5850 if (m_free_func == NULL) {
5851 zstack_push(&mcl_list, cl);
5852 } else if (m_free_func == m_bigfree) {
5853 zstack_push(&mbc_list, cl);
5854 } else if (m_free_func == m_16kfree) {
5855 zstack_push(&m16k_list, cl);
5856 } else {
5857 (*(m_free_func))((caddr_t)cl,
5858 m->m_ext.ext_size,
5859 m_get_ext_arg(m));
5860 }
5861 zstack_push(&ref_list, m_get_rfa(m));
5862 #endif /* CONFIG_MBUF_MCACHE */
5863 m_set_ext(m, NULL, NULL, NULL);
5864 } else if (refcnt == minref && composite) {
5865 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
5866 VERIFY(m->m_type != MT_FREE);
5867 /*
5868 * Amortize the costs of atomic operations
5869 * by doing them at the end, if possible.
5870 */
5871 if (m->m_type == MT_DATA) {
5872 mt_data++;
5873 } else if (m->m_type == MT_HEADER) {
5874 mt_header++;
5875 } else if (m->m_type == MT_SONAME) {
5876 mt_soname++;
5877 } else if (m->m_type == MT_TAG) {
5878 mt_tag++;
5879 } else {
5880 mtype_stat_dec(m->m_type);
5881 }
5882
5883 m->m_type = MT_FREE;
5884 m->m_flags = M_EXT;
5885 m->m_len = 0;
5886 m->m_next = m->m_nextpkt = NULL;
5887
5888 /*
5889 * MEXT_FLAGS is safe to access here
5890 * since we are now sure that we held
5891 * the last reference to ext_ref.
5892 */
5893 MEXT_FLAGS(m) &= ~EXTF_READONLY;
5894
5895 /* "Free" into the intermediate cache */
5896 #if CONFIG_MBUF_MCACHE
5897 o = (mcache_obj_t *)m;
5898 if (m_free_func == NULL) {
5899 o->obj_next = m_mcl_list;
5900 m_mcl_list = o;
5901 } else if (m_free_func == m_bigfree) {
5902 o->obj_next = m_mbc_list;
5903 m_mbc_list = o;
5904 } else {
5905 VERIFY(m_free_func == m_16kfree);
5906 o->obj_next = m_m16k_list;
5907 m_m16k_list = o;
5908 }
5909 #else
5910 if (m_free_func == NULL) {
5911 zstack_push(&m_mcl_list, m);
5912 } else if (m_free_func == m_bigfree) {
5913 zstack_push(&m_mbc_list, m);
5914 } else {
5915 VERIFY(m_free_func == m_16kfree);
5916 zstack_push(&m_m16k_list, m);
5917 }
5918 #endif /* CONFIG_MBUF_MCACHE */
5919 m = next;
5920 continue;
5921 }
5922 simple_free:
5923 /*
5924 * Amortize the costs of atomic operations
5925 * by doing them at the end, if possible.
5926 */
5927 if (m->m_type == MT_DATA) {
5928 mt_data++;
5929 } else if (m->m_type == MT_HEADER) {
5930 mt_header++;
5931 } else if (m->m_type == MT_SONAME) {
5932 mt_soname++;
5933 } else if (m->m_type == MT_TAG) {
5934 mt_tag++;
5935 } else if (m->m_type != MT_FREE) {
5936 mtype_stat_dec(m->m_type);
5937 }
5938
5939 m->m_type = MT_FREE;
5940 m->m_flags = m->m_len = 0;
5941 m->m_next = m->m_nextpkt = NULL;
5942
5943 #if CONFIG_MBUF_MCACHE
5944 ((mcache_obj_t *)m)->obj_next = mp_list;
5945 mp_list = (mcache_obj_t *)m;
5946 #else
5947 m_elide(m);
5948 zstack_push(&mp_list, m);
5949 #endif /* CONFIG_MBUF_MCACHE */
5950
5951 m = next;
5952 }
5953
5954 m = nextpkt;
5955 }
5956
5957 if (mt_free > 0) {
5958 mtype_stat_add(MT_FREE, mt_free);
5959 }
5960 if (mt_data > 0) {
5961 mtype_stat_sub(MT_DATA, mt_data);
5962 }
5963 if (mt_header > 0) {
5964 mtype_stat_sub(MT_HEADER, mt_header);
5965 }
5966 if (mt_soname > 0) {
5967 mtype_stat_sub(MT_SONAME, mt_soname);
5968 }
5969 if (mt_tag > 0) {
5970 mtype_stat_sub(MT_TAG, mt_tag);
5971 }
5972 #if CONFIG_MBUF_MCACHE
5973 if (mp_list != NULL) {
5974 mcache_free_ext(m_cache(MC_MBUF), mp_list);
5975 }
5976 if (mcl_list != NULL) {
5977 mcache_free_ext(m_cache(MC_CL), mcl_list);
5978 }
5979 if (mbc_list != NULL) {
5980 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
5981 }
5982 if (m16k_list != NULL) {
5983 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
5984 }
5985 if (m_mcl_list != NULL) {
5986 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
5987 }
5988 if (m_mbc_list != NULL) {
5989 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
5990 }
5991 if (m_m16k_list != NULL) {
5992 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
5993 }
5994 if (ref_list != NULL) {
5995 mcache_free_ext(ref_cache, ref_list);
5996 }
5997 #else
5998 if (!zstack_empty(mp_list)) {
5999 /* mbufs elided above. */
6000 mz_free_n(mp_list);
6001 }
6002 if (!zstack_empty(mcl_list)) {
6003 zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
6004 }
6005 if (!zstack_empty(mbc_list)) {
6006 zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
6007 }
6008 if (!zstack_empty(m16k_list)) {
6009 zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
6010 }
6011 if (!zstack_empty(m_mcl_list)) {
6012 mz_composite_free_n(MC_MBUF_CL, m_mcl_list);
6013 }
6014 if (!zstack_empty(m_mbc_list)) {
6015 mz_composite_free_n(MC_MBUF_BIGCL, m_mbc_list);
6016 }
6017 if (!zstack_empty(m_m16k_list)) {
6018 mz_composite_free_n(MC_MBUF_16KCL, m_m16k_list);
6019 }
6020 if (!zstack_empty(ref_list)) {
6021 zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
6022 }
6023 #endif /* CONFIG_MBUF_MCACHE */
6024
6025 return pktcount;
6026 }
6027
6028 void
6029 m_freem(struct mbuf *m)
6030 {
6031 while (m != NULL) {
6032 m = m_free(m);
6033 }
6034 }
6035
6036 /*
6037 * Mbuffer utility routines.
6038 */
6039 /*
6040 * Set the m_data pointer of a newly allocated mbuf to place an object of the
6041 * specified size at the end of the mbuf, longword aligned.
6042 *
6043 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
6044 * separate macros, each asserting that it was called at the proper moment.
6045 * This required callers to themselves test the storage type and call the
6046 * right one. Rather than require callers to be aware of those layout
6047 * decisions, we centralize here.
6048 */
6049 void
6050 m_align(struct mbuf *m, int len)
6051 {
6052 int adjust = 0;
6053
6054 /* At this point data must point to start */
6055 VERIFY(m->m_data == M_START(m));
6056 VERIFY(len >= 0);
6057 VERIFY(len <= M_SIZE(m));
6058 adjust = M_SIZE(m) - len;
6059 m->m_data += adjust & ~(sizeof(long) - 1);
6060 }
6061
6062 /*
6063 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
6064 * copy junk along. Does not adjust packet header length.
6065 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	/* Allocate a new head mbuf; on failure the entire chain is freed. */
	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return NULL;
	}
	/* Move the packet header (if any) from the old head to the new one. */
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
	mn->m_next = m;
	m = mn;
	/* Place the len bytes at the end of the new mbuf, longword aligned. */
	if (m->m_flags & M_PKTHDR) {
		VERIFY(len <= MHLEN);
		MH_ALIGN(m, len);
	} else {
		VERIFY(len <= MLEN);
		M_ALIGN(m, len);
	}
	m->m_len = len;
	return m;
}
6092
6093 /*
6094 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
6095 * chain, copy junk along, and adjust length.
6096 */
6097 struct mbuf *
6098 m_prepend_2(struct mbuf *m, int len, int how, int align)
6099 {
6100 if (M_LEADINGSPACE(m) >= len &&
6101 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
6102 m->m_data -= len;
6103 m->m_len += len;
6104 } else {
6105 m = m_prepend(m, len, how);
6106 }
6107 if ((m) && (m->m_flags & M_PKTHDR)) {
6108 m->m_pkthdr.len += len;
6109 }
6110 return m;
6111 }
6112
6113 /*
6114 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
6115 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
6116 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
6117 *
6118 * The last mbuf and offset accessed are passed in and adjusted on return to
6119 * avoid having to iterate over the entire mbuf chain each time.
6120 */
struct mbuf *
m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *n, *mhdr = NULL, **np;
	int off = off0, len = len0;
	struct mbuf *top;
	int copyhdr = 0;

	if (off < 0 || len < 0) {
		panic("m_copym: invalid offset %d or len %d", off, len);
	}

	/* The MUST_* header modes require a packet header to act on. */
	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
	    mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));

	if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
	    mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
		mhdr = m;
		copyhdr = 1;
	}

	/*
	 * Resume from the caller-cached position when one is supplied and
	 * the requested offset is not before it, avoiding a full rescan.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Walk forward to the mbuf containing the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;

	while (len > 0) {
		if (m == NULL) {
			/* Running off the chain is only legal for M_COPYALL. */
			if (len != M_COPYALL) {
				panic("m_copym: len != M_COPYALL");
			}
			break;
		}

		if (copyhdr) {
			n = _M_RETRYHDR(wait, m->m_type);
		} else {
			n = _M_RETRY(wait, m->m_type);
		}
		*np = n;

		if (n == NULL) {
			goto nospace;
		}

		if (copyhdr != 0) {
			/* Move or duplicate the packet header per the mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, mhdr);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, mhdr, wait) == 0) {
					goto nospace;
				}
			}
			if (len == M_COPYALL) {
				n->m_pkthdr.len -= off0;
			} else {
				n->m_pkthdr.len = len;
			}
			copyhdr = 0;
			/*
			 * There is data to copy from the packet header mbuf
			 * if it is empty or it is before the starting offset
			 */
			if (mhdr != m) {
				np = &n->m_next;
				continue;
			}
		}
		n->m_len = MIN(len, (m->m_len - off));
		if (m->m_flags & M_EXT) {
			/* Share the external storage instead of copying. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/*
			 * Limit to the capacity of the destination
			 */
			if (n->m_flags & M_PKTHDR) {
				n->m_len = MIN(n->m_len, MHLEN);
			} else {
				n->m_len = MIN(n->m_len, MLEN);
			}

			/* Sanity-check the copy stays within the mbuf. */
			if (MTOD(n, char *) + n->m_len > ((char *)n) + _MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL) {
			len -= n->m_len;
		}

		/* On the final mbuf, cache where we stopped for next time. */
		if (len == 0) {
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	return top;
nospace:
	m_freem(top);

	return NULL;
}
6246
6247
/* Convenience wrapper: copy with header-move semantics and no cached position. */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	return m_copym_mode(m, off0, len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
}
6253
6254 /*
6255 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
6256 * within this routine also.
6257 *
6258 * The last mbuf and offset accessed are passed in and adjusted on return to
6259 * avoid having to iterate over the entire mbuf chain each time.
6260 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);
	mcache_obj_t *list = NULL;
#else
	zstack_t list = {};
#endif /* CONFIG_MBUF_MCACHE */
	int copyhdr = 0;
	int type = 0;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/* Resume from the caller-cached position to avoid a full rescan. */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Advance to the mbuf containing the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}

	/* First pass: count how many mbufs the copy will need. */
	n = m;
	while (len > 0) {
		needed++;
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;       /* one extra for the leading header mbuf */
	len = len0;

#if CONFIG_MBUF_MCACHE
	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}
#else
	list = mz_alloc_n(needed, wait);
	if (zstack_count(list) != needed) {
		goto nospace;
	}
#endif /* CONFIG_MBUF_MCACHE */

	/* Second pass: peel mbufs off the preallocated list and fill them. */
	needed = 0;
	while (len > 0) {
#if CONFIG_MBUF_MCACHE
		n = (struct mbuf *)list;
		list = list->obj_next;
#else
		n = zstack_pop(&list);
#endif /* CONFIG_MBUF_MCACHE */
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);

		if (top == NULL) {
			/* The very first mbuf becomes the dedicated header. */
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			/* Move or duplicate the packet header per the mode. */
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
#if !CONFIG_MBUF_MCACHE
					m_elide(n);
#endif
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the external storage instead of copying. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			/* Sanity-check the copy stays within the mbuf. */
			if (MTOD(n, char *) + n->m_len > ((char *)n) + _MSIZE) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Cache where we stopped for the next call. */
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	/* Account for the newly initialized header and data mbufs. */
	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

#if CONFIG_MBUF_MCACHE
	ASSERT(list == NULL);
#else
	ASSERT(zstack_empty(list));
#endif /* CONFIG_MBUF_MCACHE */

	return top;

nospace:
#if CONFIG_MBUF_MCACHE
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
#else
	if (!zstack_empty(list)) {
		/* No need to elide, these mbufs came from the cache. */
		mz_free_n(list);
	}
#endif /* CONFIG_MBUF_MCACHE */
	if (top != NULL) {
		m_freem(top);
	}
	return NULL;
}
6420
6421 /*
6422 * Copy data from an mbuf chain starting "off" bytes from the beginning,
6423 * continuing for "len" bytes, into the indicated buffer.
6424 */
6425 void
6426 m_copydata(struct mbuf *m, int off, int len, void *vp)
6427 {
6428 int off0 = off, len0 = len;
6429 struct mbuf *m0 = m;
6430 unsigned count;
6431 char *cp = vp;
6432
6433 if (__improbable(off < 0 || len < 0)) {
6434 panic("%s: invalid offset %d or len %d", __func__, off, len);
6435 /* NOTREACHED */
6436 }
6437
6438 while (off > 0) {
6439 if (__improbable(m == NULL)) {
6440 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6441 __func__, m0, off0, len0);
6442 /* NOTREACHED */
6443 }
6444 if (off < m->m_len) {
6445 break;
6446 }
6447 off -= m->m_len;
6448 m = m->m_next;
6449 }
6450 while (len > 0) {
6451 if (__improbable(m == NULL)) {
6452 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6453 __func__, m0, off0, len0);
6454 /* NOTREACHED */
6455 }
6456 count = MIN(m->m_len - off, len);
6457 bcopy(MTOD(m, caddr_t) + off, cp, count);
6458 len -= count;
6459 cp += count;
6460 off = 0;
6461 m = m->m_next;
6462 }
6463 }
6464
6465 /*
6466 * Concatenate mbuf chain n to m. Both chains must be of the same type
6467 * (e.g. MT_DATA). Any m_pkthdr is not updated.
6468 */
6469 void
6470 m_cat(struct mbuf *m, struct mbuf *n)
6471 {
6472 while (m->m_next) {
6473 m = m->m_next;
6474 }
6475 while (n) {
6476 if ((m->m_flags & M_EXT) ||
6477 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
6478 /* just join the two chains */
6479 m->m_next = n;
6480 return;
6481 }
6482 /* splat the data from one into the other */
6483 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6484 (u_int)n->m_len);
6485 m->m_len += n->m_len;
6486 n = m_free(n);
6487 }
6488 }
6489
6490 void
6491 m_adj(struct mbuf *mp, int req_len)
6492 {
6493 int len = req_len;
6494 struct mbuf *m;
6495 int count;
6496
6497 if ((m = mp) == NULL) {
6498 return;
6499 }
6500 if (len >= 0) {
6501 /*
6502 * Trim from head.
6503 */
6504 while (m != NULL && len > 0) {
6505 if (m->m_len <= len) {
6506 len -= m->m_len;
6507 m->m_len = 0;
6508 m = m->m_next;
6509 } else {
6510 m->m_len -= len;
6511 m->m_data += len;
6512 len = 0;
6513 }
6514 }
6515 m = mp;
6516 if (m->m_flags & M_PKTHDR) {
6517 m->m_pkthdr.len -= (req_len - len);
6518 }
6519 } else {
6520 /*
6521 * Trim from tail. Scan the mbuf chain,
6522 * calculating its length and finding the last mbuf.
6523 * If the adjustment only affects this mbuf, then just
6524 * adjust and return. Otherwise, rescan and truncate
6525 * after the remaining size.
6526 */
6527 len = -len;
6528 count = 0;
6529 for (;;) {
6530 count += m->m_len;
6531 if (m->m_next == (struct mbuf *)0) {
6532 break;
6533 }
6534 m = m->m_next;
6535 }
6536 if (m->m_len >= len) {
6537 m->m_len -= len;
6538 m = mp;
6539 if (m->m_flags & M_PKTHDR) {
6540 m->m_pkthdr.len -= len;
6541 }
6542 return;
6543 }
6544 count -= len;
6545 if (count < 0) {
6546 count = 0;
6547 }
6548 /*
6549 * Correct length for chain is "count".
6550 * Find the mbuf with last data, adjust its length,
6551 * and toss data from remaining mbufs on chain.
6552 */
6553 m = mp;
6554 if (m->m_flags & M_PKTHDR) {
6555 m->m_pkthdr.len = count;
6556 }
6557 for (; m; m = m->m_next) {
6558 if (m->m_len >= count) {
6559 m->m_len = count;
6560 break;
6561 }
6562 count -= m->m_len;
6563 }
6564 while ((m = m->m_next)) {
6565 m->m_len = 0;
6566 }
6567 }
6568 }
6569
6570 /*
6571 * Rearange an mbuf chain so that len bytes are contiguous
6572 * and in the data area of an mbuf (so that mtod
6573 * will work for a structure of size len). Returns the resulting
6574 * mbuf chain on success, frees it and returns null on failure.
6575 * If there is room, it will add up to max_protohdr-len extra bytes to the
6576 * contiguous region in an attempt to avoid being called next time.
6577 */
6578 struct mbuf *
6579 m_pullup(struct mbuf *n, int len)
6580 {
6581 struct mbuf *m;
6582 int count;
6583 int space;
6584
6585 /* check invalid arguments */
6586 if (n == NULL) {
6587 panic("%s: n == NULL", __func__);
6588 }
6589 if (len < 0) {
6590 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
6591 __func__, len);
6592 goto bad;
6593 }
6594 if (len > MLEN) {
6595 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
6596 __func__, len);
6597 goto bad;
6598 }
6599 if ((n->m_flags & M_EXT) == 0 &&
6600 n->m_data >= &n->m_dat[MLEN]) {
6601 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
6602 __func__);
6603 goto bad;
6604 }
6605
6606 /*
6607 * If first mbuf has no cluster, and has room for len bytes
6608 * without shifting current data, pullup into it,
6609 * otherwise allocate a new mbuf to prepend to the chain.
6610 */
6611 if ((n->m_flags & M_EXT) == 0 &&
6612 len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
6613 if (n->m_len >= len) {
6614 return n;
6615 }
6616 m = n;
6617 n = n->m_next;
6618 len -= m->m_len;
6619 } else {
6620 if (len > MHLEN) {
6621 goto bad;
6622 }
6623 _MGET(m, M_DONTWAIT, n->m_type);
6624 if (m == 0) {
6625 goto bad;
6626 }
6627 m->m_len = 0;
6628 if (n->m_flags & M_PKTHDR) {
6629 M_COPY_PKTHDR(m, n);
6630 n->m_flags &= ~M_PKTHDR;
6631 }
6632 }
6633 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
6634 do {
6635 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
6636 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6637 (unsigned)count);
6638 len -= count;
6639 m->m_len += count;
6640 n->m_len -= count;
6641 space -= count;
6642 if (n->m_len != 0) {
6643 n->m_data += count;
6644 } else {
6645 n = m_free(n);
6646 }
6647 } while (len > 0 && n != NULL);
6648 if (len > 0) {
6649 (void) m_free(m);
6650 goto bad;
6651 }
6652 m->m_next = n;
6653 return m;
6654 bad:
6655 m_freem(n);
6656 return 0;
6657 }
6658
6659 /*
6660 * Like m_pullup(), except a new mbuf is always allocated, and we allow
6661 * the amount of empty space before the data in the new mbuf to be specified
6662 * (in the event that the caller expects to prepend later).
6663 */
__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	VERIFY(len >= 0 && dstoff >= 0);

	/* len plus the requested leading gap must fit in one pkthdr mbuf */
	if (len > (MHLEN - dstoff)) {
		goto bad;
	}
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL) {
		goto bad;
	}
	m->m_len = 0;
	if (n->m_flags & M_PKTHDR) {
		/* take over the packet header from the old head */
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	/* reserve dstoff bytes of leading space for later prepends */
	m->m_data += dstoff;
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		/* copy extra bytes (up to max_protohdr) to amortize future calls */
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len) {
			n->m_data += count;
		} else {
			/* source mbuf drained: free it and continue */
			n = m_free(n);
		}
	} while (len > 0 && n);
	if (len > 0) {
		/* ran out of source data before satisfying len */
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);

	return NULL;
}
6711
6712 /*
6713 * Partition an mbuf chain in two pieces, returning the tail --
6714 * all but the first len0 bytes. In case of failure, it returns NULL and
6715 * attempts to restore the chain to its original state.
6716 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	/* copyhdr=1: the tail chain gets its own packet header if m0 has one */
	return m_split0(m0, len0, wait, 1);
}
6722
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		/* len0 is beyond the end of the chain */
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		/* The tail chain gets a fresh header with the remaining length. */
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* Split falls exactly on an mbuf boundary: just unlink. */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster: both chains reference the same storage. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
6826
6827 /*
6828 * Routine to copy from device local memory into mbufs.
6829 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		/*
		 * If 'off' is non-zero, packet is trailer-encapsulated,
		 * so we have to skip the type and length fields.
		 */
		cp += off + 2 * sizeof(u_int16_t);
		totlen -= 2 * sizeof(u_int16_t);
	}
	/* The first mbuf carries the packet header. */
	_MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		return NULL;
	}
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top != NULL) {
			/* Subsequent mbufs are plain data mbufs. */
			_MGET(m, M_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return NULL;
			}
			m->m_len = MLEN;
		}
		len = MIN(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			/* Large chunk: try to attach a cluster. */
			MCLGET(m, M_DONTWAIT);
			if (m->m_flags & M_EXT) {
				m->m_len = len = MIN(len, m_maxsize(MC_CL));
			} else {
				/* give up when it's out of cluster mbufs */
				if (top != NULL) {
					m_freem(top);
				}
				m_freem(m);
				return NULL;
			}
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL &&
				    len + max_linkhdr <= m->m_len) {
					/* leave room for link-layer headers */
					m->m_data += max_linkhdr;
				}
				m->m_len = len;
			} else {
				len = m->m_len;
			}
		}
		if (copy) {
			/* Use the caller-supplied copy routine. */
			copy(cp, MTOD(m, caddr_t), (unsigned)len);
		} else {
			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
		}
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt) {
			/* wrap back to the start of the device buffer */
			cp = buf;
		}
	}
	return top;
}
6909
6910 #if CONFIG_MBUF_MCACHE
6911 #ifndef MBUF_GROWTH_NORMAL_THRESH
6912 #define MBUF_GROWTH_NORMAL_THRESH 25
6913 #endif
6914
6915 /*
6916 * Cluster freelist allocation check.
6917 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	/* Only 4K and 16K cluster pools are sized by this routine. */
	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		/* Pool utilization and kernel-map usage, in percent. */
		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	/* i is the number of buffers of 'bufsize' to add to the freelist. */
	return i;
}
7039 #endif /* CONFIG_MBUF_MCACHE */
7040 /*
7041 * Return the number of bytes in the mbuf chain, m.
7042 */
7043 unsigned int
7044 m_length(struct mbuf *m)
7045 {
7046 struct mbuf *m0;
7047 unsigned int pktlen;
7048
7049 if (m->m_flags & M_PKTHDR) {
7050 return m->m_pkthdr.len;
7051 }
7052
7053 pktlen = 0;
7054 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
7055 pktlen += m0->m_len;
7056 }
7057 return pktlen;
7058 }
7059
7060 /*
7061 * Copy data from a buffer back into the indicated mbuf chain,
7062 * starting "off" bytes from the beginning, extending the mbuf
7063 * chain if necessary.
7064 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL) {
		return;
	}

	/* On DEBUG kernels capture the result; otherwise best-effort. */
#if DEBUG
	error =
#endif /* DEBUG */
	m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	/* The chain may be extended but its head must never be replaced. */
	if (error != 0 || (m0 != NULL && origm != m0)) {
		panic("m_copyback");
	}
#endif /* DEBUG */
}
7089
7090 struct mbuf *
7091 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
7092 {
7093 int error;
7094
7095 /* don't support chain expansion */
7096 VERIFY(off + len <= m_length(m0));
7097
7098 error = m_copyback0(&m0, off, len, cp,
7099 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
7100 if (error) {
7101 /*
7102 * no way to recover from partial success.
7103 * just free the chain.
7104 */
7105 m_freem(m0);
7106 return NULL;
7107 }
7108 return m0;
7109 }
7110
7111 /*
7112 * m_makewritable: ensure the specified range writable.
7113 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

#if 0 /* M_COPYALL is large enough */
	if (len == M_COPYALL) {
		len = m_length(*mp) - off; /* XXX */
	}
#endif

	/* Copy-on-write any shared mbufs in [off, off+len), preserving data. */
	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	/* The operation must never change the chain's total length. */
	reslen = 0;
	for (n = *mp; n; n = n->m_next) {
		reslen += n->m_len;
	}
	if (origlen != reslen) {
		panic("m_makewritable: length changed");
	}
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
		panic("m_makewritable: inconsist");
	}
#endif /* DEBUG */

	return error;
}
7149
/*
 * Core of m_copyback/m_copyback_cow/m_makewritable: walk the chain to
 * "off", optionally extending it (M_COPYBACK0_EXTEND) or copy-on-writing
 * read-only mbufs (M_COPYBACK0_COW); then either copy "len" bytes from
 * "vp" (M_COPYBACK0_COPYBACK) or keep the existing bytes while replacing
 * the storage (M_COPYBACK0_PRESERVE).  Returns 0 or ENOBUFS.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* Walk to the mbuf containing "off", extending the chain if allowed. */
	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* zero-fill the gap up to "off" */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW)) {
				panic("m_copyback0: read-only");
			}
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf. copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				/* keep the first "off" bytes in place */
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* consume subsequent read-only mbufs covered by "n" */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			/* splice the writable replacement into the chain */
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	/* when the chain grew, bring the packet header length up to date */
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
7357
/*
 * Translate an mbuf/cluster kernel virtual address to a physical address;
 * returns 0 when the address has no backing mapping (mcache path).
 */
uint64_t
mcl_to_paddr(char *addr)
{
#if CONFIG_MBUF_MCACHE
	vm_offset_t base_phys;

	/* Only addresses inside the mbuf map have mcl_paddr[] entries. */
	if (!MBUF_IN_MAP(addr)) {
		return 0;
	}
	/* Look up the physical page backing this cluster address. */
	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];

	if (base_phys == 0) {
		return 0;
	}
	/* Combine the physical page with the offset within the page. */
	return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
#else
	extern addr64_t kvtophys(vm_offset_t va);

	/* Zone-backed mbufs: let the pmap layer do the translation. */
	return kvtophys((vm_offset_t)addr);
#endif /* CONFIG_MBUF_MCACHE */
}
7379
7380 /*
7381 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
7382 * And really copy the thing. That way, we don't "precompute" checksums
7383 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
7384 * small packets, don't dup into a cluster. That way received packets
7385 * don't take up too much room in the sockbuf (cf. sbspace()).
7386 */
/*
 * Deep-copy an entire mbuf chain, including the data itself (no
 * cluster sharing); see the block comment above for the rationale.
 * Returns the new chain, or NULL on allocation failure.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	if (m->m_flags & M_PKTHDR) {
		copyhdr = 1;
	}

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
					return NULL;
				}
				n->m_len = m->m_len;
				m_dup_pkthdr(n, m, how);
				bcopy(m->m_data, n->m_data, m->m_len);
				return n;
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL) {
				return NULL;
			}
			bcopy(m->m_data, n->m_data, m->m_len);
			n->m_len = m->m_len;
			return n;
		}
	}
	/* General case: walk the chain and copy each mbuf's data. */
	while (m != NULL) {
#if BLUE_DEBUG
		printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr) {
			n = _M_GETHDR(how, m->m_type);
		} else {
			n = _M_GET(how, m->m_type);
		}
		if (n == NULL) {
			goto nospace;
		}
		if (m->m_flags & M_EXT) {
			/* Pick a cluster size big enough for this mbuf's data. */
			if (m->m_len <= m_maxsize(MC_CL)) {
				MCLGET(n, how);
			} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
				n = m_mbigget(n, how);
			} else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
				n = m_m16kget(n, how);
			}
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		} else {
			VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
			    (copyhdr == 0 && m->m_len <= MLEN));
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			copyhdr = 0;
			if (!(n->m_flags & M_EXT)) {
				n->m_data = n->m_pktdat;
			}
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	return top;

nospace:
	/* Allocation failed mid-copy; free the partial duplicate. */
	m_freem(top);
	return NULL;
}
7484
7485 #define MBUF_MULTIPAGES(m) \
7486 (((m)->m_flags & M_EXT) && \
7487 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
7488 && (m)->m_len > PAGE_SIZE) || \
7489 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
7490 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
7491
/*
 * Split a single mbuf whose external buffer data spans multiple pages
 * into a chain of mbufs, each referencing at most one page of the
 * shared buffer.  "*last" is set to the final mbuf of the resulting
 * chain.  Returns NULL (and sets *last to NULL) on allocation failure.
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		/* Clamp this segment so it ends at the next page boundary. */
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = (void *)data;
		m->m_len = len;

		/* Append to the chain and track the tail for the caller. */
		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		/* Share the same external buffer; bump its reference count. */
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
7548
/*
 * Rebuild a chain so that no mbuf's data spans a page boundary,
 * expanding multi-page mbufs via m_expand().  Returns the normalized
 * chain, or NULL if an allocation failed (the chain is freed).
 */
struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		/* Detach the current mbuf before deciding how to link it. */
		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded) {
		/* Count normalized chains for statistics. */
		os_atomic_inc(&mb_normalized, relaxed);
	}
	return top;
}
7585
7586 /*
7587 * Append the specified data to the indicated mbuf chain,
7588 * Extend the mbuf chain if the new data does not fit in
7589 * existing space.
7590 *
7591 * Return 1 if able to complete the job; otherwise 0.
7592 */
7593 int
7594 m_append(struct mbuf *m0, int len, caddr_t cp)
7595 {
7596 struct mbuf *m, *n;
7597 int remainder, space;
7598
7599 for (m = m0; m->m_next != NULL; m = m->m_next) {
7600 ;
7601 }
7602 remainder = len;
7603 space = M_TRAILINGSPACE(m);
7604 if (space > 0) {
7605 /*
7606 * Copy into available space.
7607 */
7608 if (space > remainder) {
7609 space = remainder;
7610 }
7611 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
7612 m->m_len += space;
7613 cp += space;
7614 remainder -= space;
7615 }
7616 while (remainder > 0) {
7617 /*
7618 * Allocate a new mbuf; could check space
7619 * and allocate a cluster instead.
7620 */
7621 n = m_get(M_WAITOK, m->m_type);
7622 if (n == NULL) {
7623 break;
7624 }
7625 n->m_len = min(MLEN, remainder);
7626 bcopy(cp, mtod(n, caddr_t), n->m_len);
7627 cp += n->m_len;
7628 remainder -= n->m_len;
7629 m->m_next = n;
7630 m = n;
7631 }
7632 if (m0->m_flags & M_PKTHDR) {
7633 m0->m_pkthdr.len += len - remainder;
7634 }
7635 return remainder == 0;
7636 }
7637
7638 struct mbuf *
7639 m_last(struct mbuf *m)
7640 {
7641 while (m->m_next != NULL) {
7642 m = m->m_next;
7643 }
7644 return m;
7645 }
7646
7647 unsigned int
7648 m_fixhdr(struct mbuf *m0)
7649 {
7650 u_int len;
7651
7652 VERIFY(m0->m_flags & M_PKTHDR);
7653
7654 len = m_length2(m0, NULL);
7655 m0->m_pkthdr.len = len;
7656 return len;
7657 }
7658
7659 unsigned int
7660 m_length2(struct mbuf *m0, struct mbuf **last)
7661 {
7662 struct mbuf *m;
7663 u_int len;
7664
7665 len = 0;
7666 for (m = m0; m != NULL; m = m->m_next) {
7667 len += m->m_len;
7668 if (m->m_next == NULL) {
7669 break;
7670 }
7671 }
7672 if (last != NULL) {
7673 *last = m;
7674 }
7675 return len;
7676 }
7677
7678 /*
7679 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
7680 * and clusters. If allocation fails and this cannot be completed, NULL will
7681 * be returned, but the passed in chain will be unchanged. Upon success,
7682 * the original chain will be freed, and the new chain will be returned.
7683 *
7684 * If a non-packet header is passed in, the original mbuf (chain?) will
7685 * be returned unharmed.
7686 *
7687 * If offset is specfied, the first mbuf in the chain will have a leading
7688 * space of the amount stated by the "off" parameter.
7689 *
7690 * This routine requires that the m_pkthdr.header field of the original
7691 * mbuf chain is cleared by the caller.
7692 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	/* Non-pkthdr chains are returned unchanged (see block comment above). */
	if (!(m0->m_flags & M_PKTHDR)) {
		return m0;
	}

	/* The requested leading space must fit inside a header mbuf. */
	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN) {
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	} else {
		m_final = m_gethdr(how, MT_DATA);
	}

	if (m_final == NULL) {
		goto nospace;
	}

	if (off > 0) {
		/* Reserve the leading space in the first mbuf. */
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0) {
		goto nospace;
	}

	m_new = m_final;

	/* Copy the payload into as few mbufs/clusters as possible. */
	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES) {
			length = MCLBYTES;
		}
		/* The first mbuf already gave up "off" bytes of its space. */
		length -= ((m_new == m_final) ? off : 0);
		if (length < 0) {
			goto nospace;
		}

		if (m_new == NULL) {
			if (length > MLEN) {
				m_new = m_getcl(how, MT_DATA, 0);
			} else {
				m_new = m_get(how, MT_DATA);
			}
			if (m_new == NULL) {
				goto nospace;
			}
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final) {
			m_cat(m_final, m_new);
		}
		m_new = NULL;
	}
	/* Success: free the original chain and return the compacted one. */
	m_freem(m0);
	m0 = m_final;
	return m0;
nospace:
	if (m_final) {
		m_freem(m_final);
	}
	return NULL;
}
7773
/*
 * Defragment an mbuf chain with no leading offset; see m_defrag_offset.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
7779
/*
 * Change an mbuf's type to "t", keeping the per-type statistics in sync.
 */
void
m_mchtype(struct mbuf *m, int t)
{
	mtype_stat_inc(t);
	mtype_stat_dec(m->m_type);
	(m)->m_type = t;
}
7787
/*
 * Return a pointer to the mbuf's data area (function form of MTOD).
 */
void *
m_mtod(struct mbuf *m)
{
	return MTOD(m, void *);
}
7793
/*
 * Sanity-check an mbuf (function form of _MCHECK).
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
7799
7800 /*
7801 * Return a pointer to mbuf/offset of location in mbuf chain.
7802 */
7803 struct mbuf *
7804 m_getptr(struct mbuf *m, int loc, int *off)
7805 {
7806 while (loc >= 0) {
7807 /* Normal end of search. */
7808 if (m->m_len > loc) {
7809 *off = loc;
7810 return m;
7811 } else {
7812 loc -= m->m_len;
7813 if (m->m_next == NULL) {
7814 if (loc == 0) {
7815 /* Point at the end of valid data. */
7816 *off = m->m_len;
7817 return m;
7818 }
7819 return NULL;
7820 }
7821 m = m->m_next;
7822 }
7823 }
7824 return NULL;
7825 }
7826
7827 #if CONFIG_MBUF_MCACHE
7828 /*
7829 * Inform the corresponding mcache(s) that there's a waiter below.
7830 */
7831 static void
7832 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
7833 {
7834 mcache_waiter_inc(m_cache(class));
7835 if (comp) {
7836 if (class == MC_CL) {
7837 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7838 } else if (class == MC_BIGCL) {
7839 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7840 } else if (class == MC_16KCL) {
7841 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
7842 } else {
7843 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7844 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7845 }
7846 }
7847 }
7848
7849 /*
7850 * Inform the corresponding mcache(s) that there's no more waiter below.
7851 */
7852 static void
7853 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
7854 {
7855 mcache_waiter_dec(m_cache(class));
7856 if (comp) {
7857 if (class == MC_CL) {
7858 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7859 } else if (class == MC_BIGCL) {
7860 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7861 } else if (class == MC_16KCL) {
7862 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
7863 } else {
7864 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7865 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7866 }
7867 }
7868 }
7869
7870 static bool mbuf_watchdog_defunct_active = false;
7871
7872 #endif /* CONFIG_MBUF_MCACHE */
7873
7874 static uint32_t
7875 mbuf_watchdog_socket_space(struct socket *so)
7876 {
7877 uint32_t space = 0;
7878
7879 if (so == NULL) {
7880 return 0;
7881 }
7882
7883 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
7884
7885 #if INET
7886 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
7887 SOCK_PROTO(so) == IPPROTO_TCP) {
7888 space += tcp_reass_qlen_space(so);
7889 }
7890 #endif /* INET */
7891
7892 return space;
7893 }
7894
/* Results accumulated while scanning processes for socket mbuf usage. */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;           /* process consuming the most socket space */
	uint32_t top_app_space_used;    /* mbuf space attributed to top_app */
	bool non_blocking;              /* use trylock when taking per-proc fd locks */
};
7900
/*
 * Try to take a process's file-descriptor table lock without blocking.
 * Returns true if the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
7906
/*
 * proc_iterate() callback: sum the mbuf space consumed by all sockets
 * of process "p" and track the largest consumer in "args".  Returns
 * PROC_CLAIMED (retaining the proc reference) when "p" becomes the new
 * top consumer, PROC_RETURNED otherwise.
 */
static int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so = NULL;

		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* New top consumer: drop the reference to the previous one. */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
7957
7958 extern char *proc_name_address(void *p);
7959
/*
 * Thread call invoked when mbufs are nearly exhausted: find the
 * process whose sockets consume the most mbuf space and defunct all of
 * its sockets to reclaim memory.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
#if CONFIG_MBUF_MCACHE
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
#endif
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets we cannot lock without blocking. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Release the reference retained by the iterator. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
#if !CONFIG_MBUF_MCACHE
		/* Push now-unused cached elements back to the zone allocator. */
		zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
		zone_drain(zone_by_id(ZONE_ID_MBUF));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
		zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
#endif
	}
#if CONFIG_MBUF_MCACHE
	/* Allow the watchdog to panic again if exhaustion persists. */
	mbuf_watchdog_defunct_active = false;
#endif
}
8023
8024 #if !CONFIG_MBUF_MCACHE
8025 static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
8026 static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
8027 static uint32_t mbuf_exhausted_mask;
8028
8029 #define MBUF_EXHAUSTED_DRAIN_MASK (\
8030 (1u << MC_MBUF) | \
8031 (1u << MC_CL) | \
8032 (1u << MC_BIGCL) | \
8033 (1u << MC_16KCL))
8034
8035 #define MBUF_EXHAUSTED_DEFUNCT_MASK (\
8036 (1u << MC_MBUF) | \
8037 (1u << MC_MBUF_CL) | \
8038 (1u << MC_MBUF_BIGCL) | \
8039 (1u << MC_MBUF_16KCL))
8040
/*
 * Thread call: drain the composite (mbuf+cluster) zone caches to give
 * memory back when the mbuf zones are exhausted.
 */
static void
mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
	zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
	zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
}
8049
8050 static void
8051 mbuf_zone_exhausted_start(uint32_t bit)
8052 {
8053 uint64_t deadline;
8054 uint32_t mask;
8055
8056 mask = mbuf_exhausted_mask;
8057 mbuf_exhausted_mask = mask | bit;
8058
8059 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8060 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8061 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
8062 NSEC_PER_MSEC, &deadline);
8063 thread_call_enter_delayed(mbuf_drain_tcall, deadline);
8064 }
8065
8066 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8067 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8068 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
8069 NSEC_PER_MSEC, &deadline);
8070 thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
8071 }
8072 }
8073
8074 static void
8075 mbuf_zone_exhausted_end(uint32_t bit)
8076 {
8077 uint32_t mask;
8078
8079 mask = (mbuf_exhausted_mask &= ~bit);
8080
8081 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8082 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8083 thread_call_cancel(mbuf_drain_tcall);
8084 }
8085
8086 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8087 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8088 thread_call_cancel(mbuf_defunct_tcall);
8089 }
8090 }
8091
/*
 * ZONE_EXHAUSTED event handler: track which mbuf zones are exhausted
 * and arm/cancel the drain and defunct thread calls accordingly.
 */
static void
mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
{
	uint32_t bit;

	/* Ignore zones that do not back an mbuf class. */
	if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
	    zid > m_class_to_zid(MBUF_CLASS_MAX)) {
		return;
	}

	bit = 1u << m_class_from_zid(zid);

	lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);

	if (exhausted) {
		mbuf_zone_exhausted_start(bit);
	} else {
		mbuf_zone_exhausted_end(bit);
	}

	lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
}
8114 EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
8115 #endif /* !CONFIG_MBUF_MCACHE */
8116
8117 #if CONFIG_MBUF_MCACHE
8118 /*
8119 * Called during slab (blocking and non-blocking) allocation. If there
8120 * is at least one waiter, and the time since the first waiter is blocked
8121 * is greater than the watchdog timeout, panic the system.
8122 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do if nobody is waiting or the watchdog is disabled. */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Seconds since the first waiter started blocking. */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
8178
8179 /*
8180 * Called during blocking allocation. Returns TRUE if one or more objects
8181 * are available at the per-CPU caches layer and that allocation should be
8182 * retried at that level.
8183 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing? Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Non-blocking callers must never reach the sleep below. */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer. Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Record the demand so the worker thread knows how much to grow. */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
8257
/*
 * Dedicated kernel thread that replenishes the mbuf freelists when
 * mbuf_sleep() records demand via m_region_expand(), and reclaims
 * memory when cluster usage reaches its limits.  Runs forever,
 * sleeping on mbuf_worker_needs_wakeup between rounds.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Grow the 2KB cluster pool if demand was recorded. */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Grow the 4KB cluster pool if demand was recorded. */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Grow the 16KB cluster pool if demand was recorded. */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		/* Keep at least as many mbufs as clusters of any size. */
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated.  Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
8390
/*
 * Entry point for the mbuf worker thread: mark the worker as ready and
 * enter the worker loop, which never returns.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
8398
/*
 * Return the slab descriptor for the memory containing "buf",
 * permanently allocating a new slab group if this is the first buffer
 * seen in its (1 << MBSHIFT)-sized region.  Called with mbuf_mlock
 * held; may drop and re-acquire it while expanding the slabs table.
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the slab group covering this address. */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of the per-page slab within the group. */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
8462
/*
 * Initialize a slab descriptor and leave it detached from any freelist.
 */
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
	sp->sl_class = class;   /* class of buffers carved from this slab */
	sp->sl_flags = flags;
	sp->sl_base = base;     /* start of the backing memory */
	sp->sl_head = head;     /* first free chunk */
	sp->sl_len = len;       /* usable length of the backing memory */
	sp->sl_refcnt = refcnt;
	sp->sl_chunks = chunks; /* number of chunks carved from the slab */
	slab_detach(sp);
}
8476
/*
 * Insert a slab at the tail of its class freelist and mark it attached.
 */
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;

	/*
	 * If a buffer spans multiple contiguous pages then mark the
	 * follow-on slabs as attached too (clear SLF_DETACHED on each).
	 */
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL && slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
8499
/*
 * Remove a slab from its class freelist and mark it (and, for a
 * multi-page 16KB cluster, its follow-on slabs) as detached.
 */
static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	int k;
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
8519
8520 static boolean_t
8521 slab_inrange(mcl_slab_t *sp, void *buf)
8522 {
8523 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
8524 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
8525 }
8526
8527 #undef panic
8528
/*
 * Scan a slab's chunks for one whose freelist next-pointer equals
 * "addr" and validate that pointer, panicking (or audit-verifying when
 * mclverify is enabled) on inconsistency.  Used to diagnose
 * use-after-free corruption of the freelist linkage.
 */
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			/* Without auditing, only range-check the pointer. */
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			/* With auditing, let the audit layer verify it. */
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
8557
/*
 * Mark a slab as off-freelist, poisoning its list linkage so stray
 * list operations fault immediately.
 */
static void
slab_detach(mcl_slab_t *sp)
{
	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
	sp->sl_flags |= SLF_DETACHED;
}
8565
8566 static boolean_t
8567 slab_is_detached(mcl_slab_t *sp)
8568 {
8569 return (intptr_t)sp->sl_link.tqe_next == -1 &&
8570 (intptr_t)sp->sl_link.tqe_prev == -1 &&
8571 (sp->sl_flags & SLF_DETACHED);
8572 }
8573
/*
 * Set up audit structures for a page worth of buffers ("num" chunks),
 * consuming records from *mca_list and, when con_list is provided,
 * attaching a saved-contents buffer from it to each record.  Advances
 * both lists past the entries consumed.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Reset the record but preserve its chain linkage. */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Detach the consumed records from the input list. */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
8632
/*
 * Release the audit structures for `num' buffers in the page containing
 * `buf': free each attached contents buffer, clear the per-page audit
 * slots, and return the audit records (still chained off slot 0's list)
 * to the audit cache.
 */
static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	if (mclaudit[ix].cl_audit[0] != NULL) {
		/* Slot 0 heads the chain of all `num' audit records */
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents) {
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
			}
		}
		/* Return the whole chain in one call */
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}
8656
8657 /*
8658 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
8659 * the corresponding audit structure for that buffer.
8660 */
/*
 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return the
 * corresponding audit structure for that buffer: locate the page the
 * buffer lives in, then index into that page's audit array by the
 * buffer's position within the page (class-dependent granularity).
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOPG(mobj), m_idx = 0;
	unsigned char *page_addr;

	VERIFY(ix < maxclaudit);
	/* Buffer must be aligned to its class size (capped at a page) */
	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));

	page_addr = PGTOM(ix);

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page.  Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		m_idx = MBPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NMBPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		m_idx = CLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_BIGCL:
		/* 4KB clusters */
		m_idx = BCLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NBCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return mca;
}
8715
/*
 * Audit hook for mbuf alloc/free transitions.  On free (!alloc): save
 * the constructed mbuf fields into the audit record and, when mclverify
 * is on, fill the mbuf with the free pattern (preserving obj_next).
 * On alloc: verify the free pattern is intact and restore the saved
 * mbuf fields.  `composite' indicates an mbuf with a cluster attached.
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Pattern fill overwrote the freelist linkage; restore it */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
8748
/*
 * Restore the constructed mbuf fields previously saved by
 * mcl_audit_save_mbuf().  For a composite mbuf the entire saved
 * header is copied back (preserving the live m_next chain); for a
 * plain mbuf only the type field needs restoring.
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		/* Saved copy must look like a composite mbuf with cluster */
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}
8777
/*
 * Snapshot the constructed mbuf header into the audit record's saved
 * contents area, to be restored later by mcl_audit_restore_mbuf().
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
8785
/*
 * Audit hook for cluster alloc/free transitions.  On free (!alloc):
 * optionally fill the cluster with the free pattern; when `save_next'
 * is set, validate and restore obj_next (the pattern fill overwrote
 * it).  On alloc with mclverify: validate obj_next and verify the free
 * pattern was not corrupted while the cluster sat on the freelist.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Restore linkage clobbered by the pattern fill */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
8806
/*
 * Record the current thread, backtrace and timestamp in the audit
 * record's scratch area, rotating the previous capture into the
 * "p"-prefixed (previous) fields first.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Rotate the current capture into the "previous" slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Skip the first frame (this function) when saving the trace */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
8833
/*
 * Panic because a supposedly-free mbuf does not have type MT_FREE;
 * include a dump of the mbuf's audit record in the panic string.
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
8848
/*
 * Panic because a freelist next pointer was found to be out of range;
 * include a dump of the buffer's audit record in the panic string.
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
8859
8860 static void
8861 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
8862 {
8863 if (next != NULL && !MBUF_IN_MAP(next) &&
8864 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
8865 mcl_audit_verify_nextptr_panic(next, mca);
8866 }
8867 }
8868
/*
 * Scramble the bits of `h' with a word-size-dependent sequence of
 * add/xor/shift steps so that nearby inputs map to well-spread
 * outputs.  Used by hashaddr() and hashbacktrace() below.
 */
static uintptr_t
hash_mix(uintptr_t h)
{
#ifndef __LP64__
	h = h + ~(h << 15);
	h = h ^ (h >> 10);
	h = h + (h << 3);
	h = h ^ (h >> 6);
	h = h + ~(h << 11);
	h = h ^ (h >> 16);
#else
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
#endif
	return h;
}
8891
/*
 * Hash a backtrace of `depth' frames into a bucket index in
 * [0, max_size).  Sums the frame addresses, mixes the sum, and masks
 * down (assumes max_size is a power of two -- TODO confirm callers).
 */
static uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t sum = 0;
	uintptr_t mask = max_size - 1;
	uint32_t i;

	for (i = 0; i < depth; i++) {
		sum += bt[i];
	}

	sum = hash_mix(sum) & mask;

	assert(sum < max_size);

	return (uint32_t) sum;
}
8908
/*
 * Hash a single address into a bucket index in [0, max_size)
 * (assumes max_size is a power of two -- TODO confirm callers).
 */
static uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	const uintptr_t mask = max_size - 1;
	uintptr_t bucket = hash_mix(pt) & mask;

	assert(bucket < max_size);

	return (uint32_t) bucket;
}
8921
/*
 * This function turns on mbuf leak detection: read the sampling factor
 * from boot-args (a factor of 0 disables detection entirely), then
 * permanently allocate the allocation, trace and stat tables.
 */
static void
mleak_activate(void)
{
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	/* A zero sampling factor means leak detection is off */
	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	/* Permanent allocations: these tables live for the boot session */
	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
8953
/*
 * Leak-detection entry point for alloc/free events.  Frees are always
 * recorded (to clear any matching allocation record); allocations are
 * sampled at one in every mleak_sample_factor events.
 */
static void
mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
{
	int temp;

	if (mclfindleak == 0) {
		return;
	}

	if (!alloc) {
		return mleak_free(addr);
	}

	/* Sample: log only every mleak_sample_factor-th allocation */
	temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);

	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
		uintptr_t bt[MLEAK_STACK_DEPTH];
		unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
		mleak_log(bt, addr, logged, num);
	}
}
8975
8976 /*
8977 * This function records the allocation in the mleak_allocations table
8978 * and the backtrace in the mleak_traces table; if allocation slot is in use,
8979 * replace old allocation with new one if the trace slot is in use, return
8980 * (or increment refcount if same trace).
8981 */
/*
 * Record an allocation in the mleak_allocations table and its backtrace
 * in the mleak_traces table.  If the allocation slot is occupied, the
 * old record is replaced; if the trace slot holds a different trace
 * with the same hash, we bail and count a collision (or bump the
 * refcount when it is the same trace).  Returns FALSE only when the
 * tables are busy (lock contention).
 */
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		return FALSE;
	}

	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	}

	/*
	 * Store the backtrace in the traces array;
	 * Size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash! Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation.  No need to preserve
		 * because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return TRUE;
}
9064
/*
 * Clear the leak-detection records for every object in the chain
 * rooted at `addr'.  For each object whose allocation bucket matches,
 * decrement the refcount of its trace bucket (freeing the trace when
 * it drops to zero) and release the allocation bucket.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		/* Unlocked pre-check; re-checked under the lock below */
		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
9095
9096 static void
9097 mleak_sort_traces()
9098 {
9099 int i, j, k;
9100 struct mtrace *swap;
9101
9102 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9103 mleak_top_trace[i] = NULL;
9104 }
9105
9106 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
9107 if (mleak_traces[i].allocs <= 0) {
9108 continue;
9109 }
9110
9111 mleak_top_trace[j] = &mleak_traces[i];
9112 for (k = j; k > 0; k--) {
9113 if (mleak_top_trace[k]->allocs <=
9114 mleak_top_trace[k - 1]->allocs) {
9115 break;
9116 }
9117
9118 swap = mleak_top_trace[k - 1];
9119 mleak_top_trace[k - 1] = mleak_top_trace[k];
9120 mleak_top_trace[k] = swap;
9121 }
9122 j++;
9123 }
9124
9125 j--;
9126 for (; i < mleak_trace_buckets; i++) {
9127 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
9128 continue;
9129 }
9130
9131 mleak_top_trace[j] = &mleak_traces[i];
9132
9133 for (k = j; k > 0; k--) {
9134 if (mleak_top_trace[k]->allocs <=
9135 mleak_top_trace[k - 1]->allocs) {
9136 break;
9137 }
9138
9139 swap = mleak_top_trace[k - 1];
9140 mleak_top_trace[k - 1] = mleak_top_trace[k];
9141 mleak_top_trace[k] = swap;
9142 }
9143 }
9144 }
9145
9146 static void
9147 mleak_update_stats()
9148 {
9149 mleak_trace_stat_t *mltr;
9150 int i;
9151
9152 VERIFY(mleak_stat != NULL);
9153 #ifdef __LP64__
9154 VERIFY(mleak_stat->ml_isaddr64);
9155 #else
9156 VERIFY(!mleak_stat->ml_isaddr64);
9157 #endif /* !__LP64__ */
9158 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
9159
9160 mleak_sort_traces();
9161
9162 mltr = &mleak_stat->ml_trace[0];
9163 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
9164 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9165 int j;
9166
9167 if (mleak_top_trace[i] == NULL ||
9168 mleak_top_trace[i]->allocs == 0) {
9169 continue;
9170 }
9171
9172 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
9173 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
9174 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
9175 mltr->mltr_depth = mleak_top_trace[i]->depth;
9176
9177 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
9178 for (j = 0; j < mltr->mltr_depth; j++) {
9179 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
9180 }
9181
9182 mltr++;
9183 }
9184 }
9185
/*
 * Human-readable names for the MT_* mbuf type codes, used by
 * mbuf_dump() below.  The table is terminated by a NULL mt_name.
 */
static struct mbtypes {
	int mt_type;            /* MT_* type code */
	const char *mt_name;    /* description printed by mbuf_dump() */
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }
};
9207
/*
 * Advance mbuf_dump()'s output cursor `c' past the `k' bytes just
 * written and decrement the remaining space `clen'; jump to the
 * `done' label when the buffer is full.  Only usable inside
 * mbuf_dump(), which declares c, k, clen and done.
 */
#define MBUF_DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}
9214
9215 static char *
9216 mbuf_dump(void)
9217 {
9218 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
9219 totreturned = 0;
9220 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
9221 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
9222 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
9223 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
9224 uint8_t seen[256];
9225 struct mbtypes *mp;
9226 mb_class_stat_t *sp;
9227 mleak_trace_stat_t *mltr;
9228 char *c = mbuf_dump_buf;
9229 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
9230 struct mbuf_watchdog_defunct_args args = {};
9231
9232 mbuf_dump_buf[0] = '\0';
9233
9234 /* synchronize all statistics in the mbuf table */
9235 mbuf_stat_sync();
9236 mbuf_mtypes_sync(TRUE);
9237
9238 sp = &mb_stat->mbs_class[0];
9239 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
9240 u_int32_t mem;
9241
9242 if (m_class(i) == MC_MBUF) {
9243 m_mbufs = sp->mbcl_active;
9244 } else if (m_class(i) == MC_CL) {
9245 m_clfree = sp->mbcl_total - sp->mbcl_active;
9246 } else if (m_class(i) == MC_BIGCL) {
9247 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
9248 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
9249 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
9250 m_16kclusters = sp->mbcl_total;
9251 } else if (m_class(i) == MC_MBUF_CL) {
9252 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
9253 } else if (m_class(i) == MC_MBUF_BIGCL) {
9254 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
9255 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
9256 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
9257 }
9258
9259 mem = sp->mbcl_ctotal * sp->mbcl_size;
9260 totmem += mem;
9261 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
9262 sp->mbcl_size;
9263 totreturned += sp->mbcl_release_cnt;
9264 }
9265
9266 /* adjust free counts to include composite caches */
9267 m_clfree += m_mbufclfree;
9268 m_bigclfree += m_mbufbigclfree;
9269 m_16kclfree += m_mbuf16kclfree;
9270
9271 totmbufs = 0;
9272 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9273 totmbufs += mbstat.m_mtypes[mp->mt_type];
9274 }
9275 if (totmbufs > m_mbufs) {
9276 totmbufs = m_mbufs;
9277 }
9278 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
9279 MBUF_DUMP_BUF_CHK();
9280
9281 bzero(&seen, sizeof(seen));
9282 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9283 if (mbstat.m_mtypes[mp->mt_type] != 0) {
9284 seen[mp->mt_type] = 1;
9285 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
9286 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
9287 MBUF_DUMP_BUF_CHK();
9288 }
9289 }
9290 seen[MT_FREE] = 1;
9291 for (i = 0; i < nmbtypes; i++) {
9292 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
9293 k = scnprintf(c, clen, "\t%u mbufs allocated to "
9294 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
9295 MBUF_DUMP_BUF_CHK();
9296 }
9297 }
9298 if ((m_mbufs - totmbufs) > 0) {
9299 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
9300 m_mbufs - totmbufs);
9301 MBUF_DUMP_BUF_CHK();
9302 }
9303 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
9304 "%u/%u mbuf 4KB clusters in use\n",
9305 (unsigned int)(mbstat.m_clusters - m_clfree),
9306 (unsigned int)mbstat.m_clusters,
9307 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
9308 (unsigned int)mbstat.m_bigclusters);
9309 MBUF_DUMP_BUF_CHK();
9310
9311 if (njcl > 0) {
9312 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
9313 m_16kclusters - m_16kclfree, m_16kclusters,
9314 njclbytes / 1024);
9315 MBUF_DUMP_BUF_CHK();
9316 }
9317 totused = totmem - totfree;
9318 if (totmem == 0) {
9319 totpct = 0;
9320 } else if (totused < (ULONG_MAX / 100)) {
9321 totpct = (totused * 100) / totmem;
9322 } else {
9323 u_long totmem1 = totmem / 100;
9324 u_long totused1 = totused / 100;
9325 totpct = (totused1 * 100) / totmem1;
9326 }
9327 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
9328 "in use)\n", totmem / 1024, totpct);
9329 MBUF_DUMP_BUF_CHK();
9330 k = scnprintf(c, clen, "%lu KB returned to the system\n",
9331 totreturned / 1024);
9332 MBUF_DUMP_BUF_CHK();
9333
9334 net_update_uptime();
9335
9336 k = scnprintf(c, clen,
9337 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
9338 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
9339 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
9340 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
9341 mb_expand_16kcl_total);
9342 MBUF_DUMP_BUF_CHK();
9343 if (mbuf_worker_last_runtime != 0) {
9344 k = scnprintf(c, clen, "worker thread last run time: "
9345 "%llu (%llu seconds ago)\n",
9346 mbuf_worker_last_runtime,
9347 net_uptime() - mbuf_worker_last_runtime);
9348 MBUF_DUMP_BUF_CHK();
9349 }
9350 if (mbuf_drain_last_runtime != 0) {
9351 k = scnprintf(c, clen, "drain routine last run time: "
9352 "%llu (%llu seconds ago)\n",
9353 mbuf_drain_last_runtime,
9354 net_uptime() - mbuf_drain_last_runtime);
9355 MBUF_DUMP_BUF_CHK();
9356 }
9357
9358 /*
9359 * Log where the most mbufs have accumulated:
9360 * - Process socket buffers
9361 * - TCP reassembly queue
9362 * - Interface AQM queue (output) and DLIL input queue
9363 */
9364 args.non_blocking = true;
9365 proc_iterate(PROC_ALLPROCLIST,
9366 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
9367 if (args.top_app != NULL) {
9368 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
9369 args.top_app_space_used,
9370 proc_name_address(args.top_app),
9371 proc_pid(args.top_app));
9372 proc_rele(args.top_app);
9373 }
9374 MBUF_DUMP_BUF_CHK();
9375
9376 #if INET
9377 k = dump_tcp_reass_qlen(c, clen);
9378 MBUF_DUMP_BUF_CHK();
9379 #endif /* INET */
9380
9381 #if MPTCP
9382 k = dump_mptcp_reass_qlen(c, clen);
9383 MBUF_DUMP_BUF_CHK();
9384 #endif /* MPTCP */
9385
9386 #if NETWORKING
9387 k = dlil_dump_top_if_qlen(c, clen);
9388 MBUF_DUMP_BUF_CHK();
9389 #endif /* NETWORKING */
9390
9391 /* mbuf leak detection statistics */
9392 mleak_update_stats();
9393
9394 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
9395 MBUF_DUMP_BUF_CHK();
9396 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
9397 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
9398 mleak_table.mleak_sample_factor);
9399 MBUF_DUMP_BUF_CHK();
9400 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
9401 mleak_table.outstanding_allocs);
9402 MBUF_DUMP_BUF_CHK();
9403 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
9404 mleak_table.alloc_recorded, mleak_table.trace_recorded);
9405 MBUF_DUMP_BUF_CHK();
9406 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
9407 mleak_table.alloc_collisions, mleak_table.trace_collisions);
9408 MBUF_DUMP_BUF_CHK();
9409 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
9410 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
9411 MBUF_DUMP_BUF_CHK();
9412 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
9413 mleak_table.total_conflicts);
9414 MBUF_DUMP_BUF_CHK();
9415
9416 k = scnprintf(c, clen, "top %d outstanding traces:\n",
9417 mleak_stat->ml_cnt);
9418 MBUF_DUMP_BUF_CHK();
9419 for (i = 0; i < mleak_stat->ml_cnt; i++) {
9420 mltr = &mleak_stat->ml_trace[i];
9421 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
9422 "%llu hit(s), %llu collision(s)\n", (i + 1),
9423 mltr->mltr_allocs, mltr->mltr_hitcount,
9424 mltr->mltr_collisions);
9425 MBUF_DUMP_BUF_CHK();
9426 }
9427
9428 if (mleak_stat->ml_isaddr64) {
9429 k = scnprintf(c, clen, MB_LEAK_HDR_64);
9430 } else {
9431 k = scnprintf(c, clen, MB_LEAK_HDR_32);
9432 }
9433 MBUF_DUMP_BUF_CHK();
9434
9435 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
9436 k = scnprintf(c, clen, "%2d: ", (i + 1));
9437 MBUF_DUMP_BUF_CHK();
9438 for (j = 0; j < mleak_stat->ml_cnt; j++) {
9439 mltr = &mleak_stat->ml_trace[j];
9440 if (i < mltr->mltr_depth) {
9441 if (mleak_stat->ml_isaddr64) {
9442 k = scnprintf(c, clen, "0x%0llx ",
9443 (uint64_t)VM_KERNEL_UNSLIDE(
9444 mltr->mltr_addr[i]));
9445 } else {
9446 k = scnprintf(c, clen,
9447 "0x%08x ",
9448 (uint32_t)VM_KERNEL_UNSLIDE(
9449 mltr->mltr_addr[i]));
9450 }
9451 } else {
9452 if (mleak_stat->ml_isaddr64) {
9453 k = scnprintf(c, clen,
9454 MB_LEAK_SPACING_64);
9455 } else {
9456 k = scnprintf(c, clen,
9457 MB_LEAK_SPACING_32);
9458 }
9459 }
9460 MBUF_DUMP_BUF_CHK();
9461 }
9462 k = scnprintf(c, clen, "\n");
9463 MBUF_DUMP_BUF_CHK();
9464 }
9465
9466 done:
9467 return mbuf_dump_buf;
9468 }
9469
9470 #undef MBUF_DUMP_BUF_CHK
9471 #endif /* CONFIG_MBUF_MCACHE */
9472
9473 /*
9474 * Convert between a regular and a packet header mbuf. Caller is responsible
9475 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
9476 */
/*
 * Convert an mbuf to/from packet-header form.  `hdr' non-zero converts
 * to a packet header (fails with EBUSY if the mbuf already carries
 * data that would overlap the pkthdr area); zero strips the header,
 * verifying the red zone and releasing tags/completion callbacks.
 * Returns 0 on success or EBUSY.  Caller handles the M_PKTHDR flag's
 * semantics per the comment above.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
			    "m_data %llx (expected %llx), "
			    "m_len %d (expected 0)\n",
			    __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
			m->m_flags |= M_PKTHDR;
			MBUF_INIT_PKTHDR(m);
		}
	} else {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);
		m_do_tx_compl_callback(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return ret;
}
9517
/*
 * Atomically swap the external-buffer private property from `o' to
 * `n'; returns non-zero iff the compare-and-swap succeeded.
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
}
9524
/*
 * Return the external-buffer private property value.
 */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
9531
/*
 * Return non-zero if a paired mbuf's external buffer is currently
 * active (paired refcount above its minimum).  Non-paired mbufs are
 * always considered active.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
9537
/*
 * Reactivate a paired mbuf whose refcounts have dropped back to their
 * minimums: reinitialize the mbuf and its external-buffer state while
 * preserving the paired buffer, free routine and ref structure, and
 * bump MEXT_PREF to mark it active again.
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Capture the existing external-buffer state before reinit */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time.  Similar to
	 * what's done in m_clattach() for the cluster.  Bump
	 * up MEXT_PREF to indicate activation.
	 */
	MBUF_INIT(m, hdr, type);
	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
9570
/*
 * Zero the module-private scratch area of a packet-header mbuf.
 * Panics if the area is currently guarded (PKTF_PRIV_GUARDED), i.e.
 * owned by another module.
 */
void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
9587
9588 /*
9589 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
9590 * xnu that intend on utilizing the module-private area should directly
9591 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
9592 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
9593 * to handing it off to another module, respectively.
9594 */
/*
 * Return (via *p) a pointer to the module-private scratch area of a
 * packet-header mbuf and its size in bytes.  Panics if the area is
 * guarded.  Reserved for mbuf_get_driver_scratch(); see the comment
 * block above.
 */
u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

#if CONFIG_MBUF_MCACHE
	/* When tracing, record this access in the mbuf's audit scratch */
	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID) {
			mcl_audit_scratch(mca);
		}
		lck_mtx_unlock(mbuf_mlock);
	}
#endif /* CONFIG_MBUF_MCACHE */

	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
9625
/*
 * OR a crumb bit into the packet header's crumb trail (used to track
 * the code paths a packet has traversed).
 */
void
m_add_crumb(struct mbuf *m, uint16_t crumb)
{
	VERIFY(m->m_flags & M_PKTHDR);

	m->m_pkthdr.pkt_crumbs |= crumb;
}
9633
/*
 * Stamp the packet header's red zone field with this mbuf's unique
 * pattern; m_redzone_verify() checks it later to detect scratch-area
 * overflows.
 */
static void
m_redzone_init(struct mbuf *m)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/*
	 * Each mbuf has a unique red zone pattern, which is a XOR
	 * of the red zone cookie and the address of the mbuf.
	 */
	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
}
9644
/*
 * Recompute the expected red zone pattern for this mbuf and panic if
 * the stored value no longer matches (scratch-area overflow).
 */
static void
m_redzone_verify(struct mbuf *m)
{
	u_int32_t mb_redzone;

	VERIFY(m->m_flags & M_PKTHDR);

	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
	if (m->m_pkthdr.redzone != mb_redzone) {
		panic("mbuf %p redzone violation with value 0x%x "
		    "(instead of 0x%x, using cookie 0x%x)\n",
		    m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
		/* NOTREACHED */
	}
}
9660
/*
 * Install the external-buffer ref structure, free routine and argument
 * on an M_EXT mbuf.  The ext_free/ext_arg pointers are stored obscured
 * (XORed with a per-rfa token, or with the global cookie when no rfa
 * is supplied) to make them harder to forge; m_get_ext_free() and
 * m_get_ext_arg() undo the obscuring.
 */
__private_extern__ inline void
m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
    caddr_t ext_arg)
{
	VERIFY(m->m_flags & M_EXT);
	if (rfa != NULL) {
		m_set_rfa(m, rfa);
		if (ext_free != NULL) {
			/* Per-rfa token derived from its own address */
			rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
			    mb_obscure_extfree;
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			rfa->ext_token = 0;
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
	} else {
		/*
		 * If we are going to lose the cookie in ext_token by
		 * resetting the rfa, we should use the global cookie
		 * to obscure the ext_free and ext_arg pointers.
		 */
		if (ext_free != NULL) {
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)((uintptr_t)ext_arg ^
				    mb_obscure_extfree);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
		m->m_ext.ext_refflags = NULL;
	}
}
9707
9708 __private_extern__ inline struct ext_ref *
9709 m_get_rfa(struct mbuf *m)
9710 {
9711 if (m->m_ext.ext_refflags == NULL) {
9712 return NULL;
9713 } else {
9714 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
9715 }
9716 }
9717
9718 static inline void
9719 m_set_rfa(struct mbuf *m, struct ext_ref *rfa)
9720 {
9721 if (rfa != NULL) {
9722 m->m_ext.ext_refflags =
9723 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
9724 } else {
9725 m->m_ext.ext_refflags = NULL;
9726 }
9727 }
9728
9729 __private_extern__ inline m_ext_free_func_t
9730 m_get_ext_free(struct mbuf *m)
9731 {
9732 struct ext_ref *rfa;
9733 if (m->m_ext.ext_free == NULL) {
9734 return NULL;
9735 }
9736
9737 rfa = m_get_rfa(m);
9738 if (rfa == NULL) {
9739 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
9740 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9741 } else {
9742 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
9743 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9744 }
9745 }
9746
9747 __private_extern__ inline caddr_t
9748 m_get_ext_arg(struct mbuf *m)
9749 {
9750 struct ext_ref *rfa;
9751 if (m->m_ext.ext_arg == NULL) {
9752 return NULL;
9753 }
9754
9755 rfa = m_get_rfa(m);
9756 if (rfa == NULL) {
9757 return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
9758 } else {
9759 return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
9760 rfa->ext_token);
9761 }
9762 }
9763
9764 #if CONFIG_MBUF_MCACHE
9765 /*
9766 * Send a report of mbuf usage if the usage is at least 6% of max limit
9767 * or if there has been at least 3% increase since the last report.
9768 *
9769 * The values 6% and 3% are chosen so that we can do simple arithmetic
9770 * with shift operations.
9771 */
9772 static boolean_t
9773 mbuf_report_usage(mbuf_class_t cl)
9774 {
9775 /* if a report is already in progress, nothing to do */
9776 if (mb_peak_newreport) {
9777 return TRUE;
9778 }
9779
9780 if (m_total(cl) > m_peak(cl) &&
9781 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
9782 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5)) {
9783 return TRUE;
9784 }
9785 return FALSE;
9786 }
9787 #endif /* CONFIG_MBUF_MCACHE */
9788
9789 __private_extern__ void
9790 mbuf_report_peak_usage(void)
9791 {
9792 int i = 0;
9793 u_int64_t uptime;
9794 struct nstat_sysinfo_data ns_data;
9795 uint32_t memreleased = 0;
9796 static uint32_t prevmemreleased;
9797
9798 uptime = net_uptime();
9799 lck_mtx_lock(mbuf_mlock);
9800 mbuf_stat_sync();
9801 mbuf_mtypes_sync(TRUE);
9802
9803 /* Generate an initial report after 1 week of uptime */
9804 if (!mb_peak_firstreport &&
9805 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
9806 mb_peak_newreport = TRUE;
9807 mb_peak_firstreport = TRUE;
9808 }
9809
9810 if (!mb_peak_newreport) {
9811 lck_mtx_unlock(mbuf_mlock);
9812 return;
9813 }
9814
9815 /*
9816 * Since a report is being generated before 1 week,
9817 * we do not need to force another one later
9818 */
9819 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
9820 mb_peak_firstreport = TRUE;
9821 }
9822
9823 for (i = 0; i < NELEM(mbuf_table); i++) {
9824 m_peak(m_class(i)) = m_total(m_class(i));
9825 memreleased += m_release_cnt(i);
9826 }
9827 memreleased = memreleased - prevmemreleased;
9828 prevmemreleased = memreleased;
9829 mb_peak_newreport = FALSE;
9830 lck_mtx_unlock(mbuf_mlock);
9831
9832 bzero(&ns_data, sizeof(ns_data));
9833 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
9834 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
9835 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
9836 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
9837 ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
9838 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
9839 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
9840 ns_data.u.mb_stats.draincnt = mbstat.m_drain;
9841 ns_data.u.mb_stats.memreleased = memreleased;
9842 ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
9843
9844 nstat_sysinfo_send_data(&ns_data);
9845
9846 /*
9847 * Reset the floor whenever we report a new
9848 * peak to track the trend (increase peek usage
9849 * is not a leak if mbufs get released
9850 * between reports and the floor stays low)
9851 */
9852 total_sbmb_cnt_floor = total_sbmb_cnt_peak;
9853 }
9854
9855 #if CONFIG_MBUF_MCACHE
9856 /*
9857 * Simple routine to avoid taking the lock when we can't run the
9858 * mbuf drain.
9859 */
9860 static int
9861 mbuf_drain_checks(boolean_t ignore_waiters)
9862 {
9863 if (mb_drain_maxint == 0) {
9864 return 0;
9865 }
9866 if (!ignore_waiters && mb_waiters != 0) {
9867 return 0;
9868 }
9869
9870 return 1;
9871 }
9872
9873 /*
9874 * Called by the VM when there's memory pressure or when we exhausted
9875 * the 4k/16k reserved space.
9876 */
9877 static void
9878 mbuf_drain_locked(boolean_t ignore_waiters)
9879 {
9880 mbuf_class_t mc;
9881 mcl_slab_t *sp, *sp_tmp, *nsp;
9882 unsigned int num, k, interval, released = 0;
9883 unsigned long total_mem = 0, use_mem = 0;
9884 boolean_t ret, purge_caches = FALSE;
9885 ppnum_t offset;
9886 mcache_obj_t *obj;
9887 unsigned long per;
9888 static unsigned char scratch[32];
9889 static ppnum_t scratch_pa = 0;
9890
9891 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
9892 if (!mbuf_drain_checks(ignore_waiters)) {
9893 return;
9894 }
9895 if (scratch_pa == 0) {
9896 bzero(scratch, sizeof(scratch));
9897 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
9898 VERIFY(scratch_pa);
9899 } else if (mclverify) {
9900 /*
9901 * Panic if a driver wrote to our scratch memory.
9902 */
9903 for (k = 0; k < sizeof(scratch); k++) {
9904 if (scratch[k]) {
9905 panic("suspect DMA to freed address");
9906 }
9907 }
9908 }
9909 /*
9910 * Don't free memory too often as that could cause excessive
9911 * waiting times for mbufs. Purge caches if we were asked to drain
9912 * in the last 5 minutes.
9913 */
9914 if (mbuf_drain_last_runtime != 0) {
9915 interval = net_uptime() - mbuf_drain_last_runtime;
9916 if (interval <= mb_drain_maxint) {
9917 return;
9918 }
9919 if (interval <= mb_drain_maxint * 5) {
9920 purge_caches = TRUE;
9921 }
9922 }
9923 mbuf_drain_last_runtime = net_uptime();
9924 /*
9925 * Don't free any memory if we're using 60% or more.
9926 */
9927 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9928 total_mem += m_total(mc) * m_maxsize(mc);
9929 use_mem += m_active(mc) * m_maxsize(mc);
9930 }
9931 per = (use_mem * 100) / total_mem;
9932 if (per >= 60) {
9933 return;
9934 }
9935 /*
9936 * Purge all the caches. This effectively disables
9937 * caching for a few seconds, but the mbuf worker thread will
9938 * re-enable them again.
9939 */
9940 if (purge_caches == TRUE) {
9941 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9942 if (m_total(mc) < m_avgtotal(mc)) {
9943 continue;
9944 }
9945 lck_mtx_unlock(mbuf_mlock);
9946 ret = mcache_purge_cache(m_cache(mc), FALSE);
9947 lck_mtx_lock(mbuf_mlock);
9948 if (ret == TRUE) {
9949 m_purge_cnt(mc)++;
9950 }
9951 }
9952 }
9953 /*
9954 * Move the objects from the composite class freelist to
9955 * the rudimentary slabs list, but keep at least 10% of the average
9956 * total in the freelist.
9957 */
9958 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9959 while (m_cobjlist(mc) &&
9960 m_total(mc) < m_avgtotal(mc) &&
9961 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9962 obj = m_cobjlist(mc);
9963 m_cobjlist(mc) = obj->obj_next;
9964 obj->obj_next = NULL;
9965 num = cslab_free(mc, obj, 1);
9966 VERIFY(num == 1);
9967 m_free_cnt(mc)++;
9968 m_infree(mc)--;
9969 /* cslab_free() handles m_total */
9970 }
9971 }
9972 /*
9973 * Free the buffers present in the slab list up to 10% of the total
9974 * average per class.
9975 *
9976 * We walk the list backwards in an attempt to reduce fragmentation.
9977 */
9978 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
9979 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
9980 /*
9981 * Process only unused slabs occupying memory.
9982 */
9983 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
9984 sp->sl_base == NULL) {
9985 continue;
9986 }
9987 if (m_total(mc) < m_avgtotal(mc) ||
9988 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9989 break;
9990 }
9991 slab_remove(sp, mc);
9992 switch (mc) {
9993 case MC_MBUF:
9994 m_infree(mc) -= NMBPG;
9995 m_total(mc) -= NMBPG;
9996 if (mclaudit != NULL) {
9997 mcl_audit_free(sp->sl_base, NMBPG);
9998 }
9999 break;
10000 case MC_CL:
10001 m_infree(mc) -= NCLPG;
10002 m_total(mc) -= NCLPG;
10003 if (mclaudit != NULL) {
10004 mcl_audit_free(sp->sl_base, NMBPG);
10005 }
10006 break;
10007 case MC_BIGCL:
10008 {
10009 m_infree(mc) -= NBCLPG;
10010 m_total(mc) -= NBCLPG;
10011 if (mclaudit != NULL) {
10012 mcl_audit_free(sp->sl_base, NMBPG);
10013 }
10014 break;
10015 }
10016 case MC_16KCL:
10017 m_infree(mc)--;
10018 m_total(mc)--;
10019 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
10020 nsp = nsp->sl_next;
10021 VERIFY(nsp->sl_refcnt == 0 &&
10022 nsp->sl_base != NULL &&
10023 nsp->sl_len == 0);
10024 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
10025 0);
10026 nsp->sl_flags = 0;
10027 }
10028 if (mclaudit != NULL) {
10029 if (sp->sl_len == PAGE_SIZE) {
10030 mcl_audit_free(sp->sl_base,
10031 NMBPG);
10032 } else {
10033 mcl_audit_free(sp->sl_base, 1);
10034 }
10035 }
10036 break;
10037 default:
10038 /*
10039 * The composite classes have their own
10040 * freelist (m_cobjlist), so we only
10041 * process rudimentary classes here.
10042 */
10043 VERIFY(0);
10044 }
10045 m_release_cnt(mc) += m_size(mc);
10046 released += m_size(mc);
10047 VERIFY(sp->sl_base != NULL &&
10048 sp->sl_len >= PAGE_SIZE);
10049 offset = MTOPG(sp->sl_base);
10050 /*
10051 * Make sure the IOMapper points to a valid, but
10052 * bogus, address. This should prevent further DMA
10053 * accesses to freed memory.
10054 */
10055 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
10056 mcl_paddr[offset] = 0;
10057 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
10058 sp->sl_len);
10059 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
10060 sp->sl_flags = 0;
10061 }
10062 }
10063 mbstat.m_drain++;
10064 mbstat.m_bigclusters = m_total(MC_BIGCL);
10065 mbstat.m_clusters = m_total(MC_CL);
10066 mbstat.m_mbufs = m_total(MC_MBUF);
10067 mbuf_stat_sync();
10068 mbuf_mtypes_sync(TRUE);
10069 }
10070
10071 __private_extern__ void
10072 mbuf_drain(boolean_t ignore_waiters)
10073 {
10074 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
10075 if (!mbuf_drain_checks(ignore_waiters)) {
10076 return;
10077 }
10078 lck_mtx_lock(mbuf_mlock);
10079 mbuf_drain_locked(ignore_waiters);
10080 lck_mtx_unlock(mbuf_mlock);
10081 }
10082
10083
10084 static int
10085 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
10086 {
10087 #pragma unused(arg1, arg2)
10088 int val = 0, err;
10089
10090 err = sysctl_handle_int(oidp, &val, 0, req);
10091 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10092 return err;
10093 }
10094 if (val) {
10095 mbuf_drain(TRUE);
10096 }
10097
10098 return err;
10099 }
10100
10101 #if DEBUG || DEVELOPMENT
/*
 * Append a timestamped, printf-formatted record to the mbuf watchdog
 * log buffer (mbwdog_logging), allocating the buffer lazily on first
 * use.  Each record is prefixed with uptime, pid/thread and the
 * calling function:line.  Must be called with mbuf_mlock held.
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Format the caller's message first, then wrap it with metadata. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	/*
	 * When the buffer would overflow, drop the older half of the log
	 * (shift the newer half to the front) to make room.
	 */
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}
10142
10143 #endif // DEBUG || DEVELOPMENT
10144
10145 static void
10146 mtracelarge_register(size_t size)
10147 {
10148 int i;
10149 struct mtracelarge *trace;
10150 uintptr_t bt[MLEAK_STACK_DEPTH];
10151 unsigned int depth;
10152
10153 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
10154 /* Check if this entry is already on the list. */
10155 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10156 trace = &mtracelarge_table[i];
10157 if (trace->size == size && trace->depth == depth &&
10158 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
10159 return;
10160 }
10161 }
10162 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10163 trace = &mtracelarge_table[i];
10164 if (size > trace->size) {
10165 trace->depth = depth;
10166 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
10167 trace->size = size;
10168 break;
10169 }
10170 }
10171 }
10172
10173 #if DEBUG || DEVELOPMENT
10174
/*
 * sysctl handler for kern.ipc.mb_wd_dump: returns the mbuf watchdog
 * dump produced by mbuf_dump() as a string.  Lock order here is
 * ifnet head lock before mbuf_mlock; both are dropped before copying
 * the result out to userland.
 */
static int
mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
{
	char *str;

	ifnet_head_lock_shared();
	lck_mtx_lock(mbuf_mlock);

	str = mbuf_dump();

	lck_mtx_unlock(mbuf_mlock);
	ifnet_head_done();

	return sysctl_io_string(req, str, 0, 0, NULL);
}
10190
10191 #endif /* DEBUG || DEVELOPMENT */
10192 #endif /* CONFIG_MBUF_MCACHE */
10193
SYSCTL_DECL(_kern_ipc);
/* Debug/development-only knobs. */
#if DEBUG || DEVELOPMENT
#if SKYWALK && CONFIG_MBUF_MCACHE
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
#if CONFIG_MBUF_MCACHE
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif /* CONFIG_MBUF_MCACHE */
#endif /* DEBUG || DEVELOPMENT */
/* Read-only mbuf statistics exported to userland (e.g. netstat -m). */
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
/* Leak-tracking and drain controls; only present with the mcache allocator. */
#if CONFIG_MBUF_MCACHE
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
#endif /* CONFIG_MBUF_MCACHE */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
    "Percentage of when we trigger memory-pressure for an mbuf-class");
/* Advertise which allocator backs mbufs so tools can adapt. */
#if CONFIG_MBUF_MCACHE
static int mb_uses_mcache = 1;
#else
static int mb_uses_mcache = 0;
#endif /* CONFIG_MBUF_MCACHE */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_uses_mcache,
    CTLFLAG_LOCKED, &mb_uses_mcache, 0,
    "Whether mbufs use mcache");
10246