1 /*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <ptrauth.h>
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/malloc.h>
75 #include <sys/mbuf.h>
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/syslog.h>
79 #include <sys/protosw.h>
80 #include <sys/domain.h>
81 #include <sys/queue.h>
82 #include <sys/proc.h>
83 #include <sys/filedesc.h>
84 #include <sys/file_internal.h>
85
86 #include <vm/vm_kern_xnu.h>
87
88 #include <dev/random/randomdev.h>
89
90 #include <kern/kern_types.h>
91 #include <kern/simple_lock.h>
92 #include <kern/queue.h>
93 #include <kern/sched_prim.h>
94 #include <kern/backtrace.h>
95 #include <kern/percpu.h>
96 #include <kern/zalloc.h>
97
98 #include <libkern/OSDebug.h>
99 #include <libkern/libkern.h>
100
101 #include <os/log.h>
102 #include <os/ptrtools.h>
103
104 #include <IOKit/IOMapper.h>
105
106 #include <machine/limits.h>
107 #include <machine/machine_routines.h>
108
109 #if CONFIG_MBUF_MCACHE
110 #include <sys/mcache.h>
111 #endif /* CONFIG_MBUF_MCACHE */
112 #include <net/ntstat.h>
113
114 #include <net/droptap.h>
115
116 #if INET
117 extern int dump_tcp_reass_qlen(char *, int);
118 extern int tcp_reass_qlen_space(struct socket *);
119 #endif /* INET */
120
121 #if MPTCP
122 extern int dump_mptcp_reass_qlen(char *, int);
123 #endif /* MPTCP */
124
125
126 #if NETWORKING
127 extern int dlil_dump_top_if_qlen(char *, int);
128 #endif /* NETWORKING */
129
130 #if CONFIG_MBUF_MCACHE
131 /*
132 * MBUF IMPLEMENTATION NOTES.
133 *
134 * There is a total of 5 per-CPU caches:
135 *
136 * MC_MBUF:
137 * This is a cache of rudimentary objects of _MSIZE in size; each
138 * object represents an mbuf structure. This cache preserves only
139 * the m_type field of the mbuf during its transactions.
140 *
141 * MC_CL:
142 * This is a cache of rudimentary objects of MCLBYTES in size; each
143 * object represents a mcluster structure. This cache does not
144 * preserve the contents of the objects during its transactions.
145 *
146 * MC_BIGCL:
147 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
148 * object represents a mbigcluster structure. This cache does not
149 * preserve the contents of the objects during its transaction.
150 *
151 * MC_MBUF_CL:
152 * This is a cache of mbufs each having a cluster attached to it.
153 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
154 * fields of the mbuf related to the external cluster are preserved
155 * during transactions.
156 *
157 * MC_MBUF_BIGCL:
158 * This is a cache of mbufs each having a big cluster attached to it.
159 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
160 * fields of the mbuf related to the external cluster are preserved
161 * during transactions.
162 *
163 * OBJECT ALLOCATION:
164 *
165 * Allocation requests are handled first at the per-CPU (mcache) layer
166 * before falling back to the slab layer. Performance is optimal when
167 * the request is satisfied at the CPU layer because global data/lock
168 * never gets accessed. When the slab layer is entered for allocation,
169 * the slab freelist will be checked first for available objects before
170 * the VM backing store is invoked. Slab layer operations are serialized
171 * for all of the caches as the mbuf global lock is held most of the time.
172 * Allocation paths are different depending on the class of objects:
173 *
174 * a. Rudimentary object:
175 *
176 * { m_get_common(), m_clattach(), m_mclget(),
177 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
178 * composite object allocation }
179 * | ^
180 * | |
181 * | +-----------------------+
182 * v |
183 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
184 * | ^
185 * v |
186 * [CPU cache] -------> (found?) -------+
187 * | |
188 * v |
189 * mbuf_slab_alloc() |
190 * | |
191 * v |
192 * +---------> [freelist] -------> (found?) -------+
193 * | |
194 * | v
195 * | m_clalloc()
196 * | |
197 * | v
198 * +---<<---- kmem_mb_alloc()
199 *
200 * b. Composite object:
201 *
202 * { m_getpackets_internal(), m_allocpacket_internal() }
203 * | ^
204 * | |
205 * | +------ (done) ---------+
206 * v |
207 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
208 * | ^
209 * v |
210 * [CPU cache] -------> (found?) -------+
211 * | |
212 * v |
213 * mbuf_cslab_alloc() |
214 * | |
215 * v |
216 * [freelist] -------> (found?) -------+
217 * | |
218 * v |
219 * (rudimentary object) |
220 * mcache_alloc/mcache_alloc_ext() ------>>-----+
221 *
222 * Auditing notes: If auditing is enabled, buffers will be subjected to
223 * integrity checks by the audit routine. This is done by verifying their
224 * contents against DEADBEEF (free) pattern before returning them to caller.
225 * As part of this step, the routine will also record the transaction and
226 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
227 * also restore any constructed data structure fields if necessary.
228 *
229 * OBJECT DEALLOCATION:
230 *
231 * Freeing an object simply involves placing it into the CPU cache; this
232 * pollutes the cache to benefit subsequent allocations. The slab layer
233 * will only be entered if the object is to be purged out of the cache.
234 * During normal operations, this happens only when the CPU layer resizes
235 * its bucket while it's adjusting to the allocation load. Deallocation
236 * paths are different depending on the class of objects:
237 *
238 * a. Rudimentary object:
239 *
240 * { m_free(), m_freem_list(), composite object deallocation }
241 * | ^
242 * | |
243 * | +------ (done) ---------+
244 * v |
245 * mcache_free/mcache_free_ext() |
246 * | |
247 * v |
248 * mbuf_slab_audit() |
249 * | |
250 * v |
251 * [CPU cache] ---> (not purging?) -----+
252 * | |
253 * v |
254 * mbuf_slab_free() |
255 * | |
256 * v |
257 * [freelist] ----------->>------------+
258 * (objects get purged to VM only on demand)
259 *
260 * b. Composite object:
261 *
262 * { m_free(), m_freem_list() }
263 * | ^
264 * | |
265 * | +------ (done) ---------+
266 * v |
267 * mcache_free/mcache_free_ext() |
268 * | |
269 * v |
270 * mbuf_cslab_audit() |
271 * | |
272 * v |
273 * [CPU cache] ---> (not purging?) -----+
274 * | |
275 * v |
276 * mbuf_cslab_free() |
277 * | |
278 * v |
279 * [freelist] ---> (not purging?) -----+
280 * | |
281 * v |
282 * (rudimentary object) |
283 * mcache_free/mcache_free_ext() ------->>------+
284 *
285 * Auditing notes: If auditing is enabled, the audit routine will save
286 * any constructed data structure fields (if necessary) before filling the
287 * contents of the buffers with DEADBEEF (free) pattern and recording the
288 * transaction. Buffers that are freed (whether at CPU or slab layer) are
289 * expected to contain the free pattern.
290 *
291 * DEBUGGING:
292 *
293 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
294 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
295 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
296 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
297 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
298 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
299 *
300 * Each object is associated with exactly one mcache_audit_t structure that
301 * contains the information related to its last buffer transaction. Given
302 * an address of an object, the audit structure can be retrieved by finding
303 * the position of the object relevant to the base address of the cluster:
304 *
305 * +------------+ +=============+
306 * | mbuf addr | | mclaudit[i] |
307 * +------------+ +=============+
308 * | | cl_audit[0] |
309 * i = MTOBG(addr) +-------------+
310 * | +-----> | cl_audit[1] | -----> mcache_audit_t
311 * b = BGTOM(i) | +-------------+
312 * | | | ... |
313 * x = MCLIDX(b, addr) | +-------------+
314 * | | | cl_audit[7] |
315 * +-----------------+ +-------------+
316 * (e.g. x == 1)
317 *
318 * The mclaudit[] array is allocated at initialization time, but its contents
319 * get populated when the corresponding cluster is created. Because a page
320 * can be turned into NMBPG number of mbufs, we preserve enough space for the
321 * mbufs so that there is a 1-to-1 mapping between them. A page that never
322 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
323 * remaining entries unused. For 16KB cluster, only one entry from the first
324 * page is allocated and used for the entire object.
325 */
326 #else
327 /*
328 * MBUF IMPLEMENTATION NOTES (using zalloc).
329 *
330 * There are a total of 4 zones and 3 zcaches.
331 *
332 * MC_MBUF:
333 * This is a zone of rudimentary objects of _MSIZE in size; each
334 * object represents an mbuf structure. This cache preserves only
335 * the m_type field of the mbuf during its transactions.
336 *
337 * MC_CL:
338 * This is a zone of rudimentary objects of MCLBYTES in size; each
339 * object represents a mcluster structure. This cache does not
340 * preserve the contents of the objects during its transactions.
341 *
342 * MC_BIGCL:
343 * This is a zone of rudimentary objects of MBIGCLBYTES in size; each
344 * object represents a mbigcluster structure. This cache does not
345 * preserve the contents of the objects during its transaction.
346 *
347 * MC_16KCL:
348 * This is a zone of rudimentary objects of M16KCLBYTES in size; each
349 * object represents a m16kcluster structure. This cache does not
350 * preserve the contents of the objects during its transaction.
351 *
352 * MC_MBUF_CL:
353 * This is a cache of mbufs each having a cluster attached to it.
354 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
355 * fields of the mbuf related to the external cluster are preserved
356 * during transactions.
357 *
358 * MC_MBUF_BIGCL:
359 * This is a cache of mbufs each having a big cluster attached to it.
360 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
361 * fields of the mbuf related to the external cluster are preserved
362 * during transactions.
363 *
364 * MC_MBUF_16KCL:
 *      This is a cache of mbufs each having a jumbo (16KB) cluster attached to it.
366 * It is backed by MC_MBUF and MC_16KCL rudimentary caches. Several
367 * fields of the mbuf related to the external cluster are preserved
368 * during transactions.
369 *
370 * OBJECT ALLOCATION:
371 *
372 * Allocation requests are handled first at the zalloc per-CPU layer
373 * before falling back to the zalloc depot. Performance is optimal when
374 * the request is satisfied at the CPU layer. zalloc has an additional
375 * overflow layer called the depot, not pictured in the diagram below.
376 *
377 * Allocation paths are different depending on the class of objects:
378 *
379 * a. Rudimentary object:
380 *
381 * { m_get_common(), m_clattach(), m_mclget(),
382 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
383 * composite object allocation }
384 * | ^
385 * | |
386 * | +------- (done) --------+
387 * v |
388 * zalloc_flags/zalloc_n() KASAN
389 * | ^
390 * v |
391 * +----> [zalloc per-CPU cache] -----> (found?) --+
392 * | | |
393 * | v |
394 * | [zalloc recirculation layer] --> (found?) ---+
395 * | |
396 * | v
397 * +--<<-- [zone backing store]
398 *
399 * b. Composite object:
400 *
401 * { m_getpackets_internal(), m_allocpacket_internal() }
402 * | ^
403 * | |
404 * | +------ (done) ---------+
405 * v |
406 * mz_composite_alloc() KASAN
407 * | ^
408 * v |
409 * zcache_alloc_n() |
410 * | |
411 * v |
412 * [zalloc per-CPU cache] --> mark_valid() ---+
413 * | |
414 * v |
415 * [zalloc recirculation layer] -> mark_valid() -+
416 * | |
417 * v |
418 * mz_composite_build() |
419 * | |
420 * v |
421 * (rudimentary objects) |
422 * zalloc_id() ---------------->>-----+
423 *
424 * Auditing notes: If KASAN enabled, buffers will be subjected to
425 * integrity checks by the AddressSanitizer.
426 *
427 * OBJECT DEALLOCATION:
428 *
429 * Freeing an object simply involves placing it into the CPU cache; this
430 * pollutes the cache to benefit subsequent allocations. The depot
431 * will only be entered if the object is to be purged out of the cache.
432 * Objects may be purged based on the overall memory pressure or
433 * during zone garbage collection.
434 * To improve performance, objects are not zero-filled when freed
435 * as it's custom for other zalloc zones.
436 *
437 * Deallocation paths are different depending on the class of objects:
438 *
439 * a. Rudimentary object:
440 *
441 * { m_free(), m_freem_list(), composite object deallocation }
442 * | ^
443 * | |
444 * | +------ (done) ---------+
445 * v |
446 * zfree_nozero() |
447 * | |
448 * v |
449 * KASAN |
450 * | |
451 * v |
452 * [zalloc per-CPU cache] -> (not purging?) --+
453 * | |
454 * v |
455 * [zalloc recirculation layer] --->>----------+
456 *
457 *
458 * b. Composite object:
459 *
460 * { m_free(), m_freem_list() }
461 * | ^
462 * | |
463 * | +------ (done) ---------+
464 * v |
465 * mz_composite_free() |
466 * | |
467 * v |
468 * zcache_free_n() |
469 * | |
470 * v |
471 * KASAN |
472 * | |
473 * v |
474 * [zalloc per-CPU cache] -> mark_invalid() --+
475 * | |
476 * v |
477 * mz_composite_destroy() |
478 * | |
479 * v |
480 * (rudimentary object) |
481 * zfree_nozero() -------------->>------+
482 *
483 * Auditing notes: If KASAN enabled, buffers will be subjected to
484 * integrity checks by the AddressSanitizer.
485 *
486 * DEBUGGING:
487 *
488 * Debugging mbufs can be done by booting a KASAN enabled kernel.
489 */
490
491 #endif /* CONFIG_MBUF_MCACHE */
492
493 /* TODO: should be in header file */
/* kernel translator */
495 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
496 extern vm_map_t mb_map; /* special map */
497
498 #if CONFIG_MBUF_MCACHE
499 static uint32_t mb_kmem_contig_failed;
500 static uint32_t mb_kmem_failed;
501 static uint32_t mb_kmem_one_failed;
502 /* Timestamp of allocation failures. */
503 static uint64_t mb_kmem_contig_failed_ts;
504 static uint64_t mb_kmem_failed_ts;
505 static uint64_t mb_kmem_one_failed_ts;
506 static uint64_t mb_kmem_contig_failed_size;
507 static uint64_t mb_kmem_failed_size;
508 static uint32_t mb_kmem_stats[6];
509 #endif /* CONFIG_MBUF_MCACHE */
510
511 /* Global lock */
512 static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
513 static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
514 static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
515
516 #if CONFIG_MBUF_MCACHE
517 /* Back-end (common) layer */
518 static uint64_t mb_expand_cnt;
519 static uint64_t mb_expand_cl_cnt;
520 static uint64_t mb_expand_cl_total;
521 static uint64_t mb_expand_bigcl_cnt;
522 static uint64_t mb_expand_bigcl_total;
523 static uint64_t mb_expand_16kcl_cnt;
524 static uint64_t mb_expand_16kcl_total;
525 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
526 static uint32_t mbuf_worker_run_cnt;
527 static uint64_t mbuf_worker_last_runtime;
528 static uint64_t mbuf_drain_last_runtime;
529 static int mbuf_worker_ready; /* worker thread is runnable */
530 static unsigned int ncpu; /* number of CPUs */
531 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
532 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
533 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
534 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
535 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
536 unsigned int mbuf_debug; /* patchable mbuf mcache flags */
#endif /* CONFIG_MBUF_MCACHE */
538 static unsigned int mb_normalized; /* number of packets "normalized" */
539
540 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
541 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
542
/*
 * Buffer classes managed by the allocator.  The first four entries are
 * rudimentary (single-object) classes; the MC_MBUF_* entries are
 * composite classes, i.e. an mbuf with a cluster of the given size
 * attached (see the implementation notes above).
 */
typedef enum {
	MC_MBUF = 0,    /* Regular mbuf */
	MC_CL,          /* Cluster */
	MC_BIGCL,       /* Large (4KB) cluster */
	MC_16KCL,       /* Jumbo (16KB) cluster */
	MC_MBUF_CL,     /* mbuf + cluster */
	MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;
552
553 #define MBUF_CLASS_MIN MC_MBUF
554 #define MBUF_CLASS_MAX MC_MBUF_16KCL
555 #define MBUF_CLASS_LAST MC_16KCL
556 #define MBUF_CLASS_VALID(c) \
557 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
558 #define MBUF_CLASS_COMPOSITE(c) \
559 ((int)(c) > MBUF_CLASS_LAST)
560
561
562 /*
563 * mbuf specific mcache allocation request flags.
564 */
565 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
566
567 /*
568 * Per-cluster slab structure.
569 *
570 * A slab is a cluster control structure that contains one or more object
571 * chunks; the available chunks are chained in the slab's freelist (sl_head).
572 * Each time a chunk is taken out of the slab, the slab's reference count
573 * gets incremented. When all chunks have been taken out, the empty slab
574 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
575 * returned to a slab causes the slab's reference count to be decremented;
576 * it also causes the slab to be reinserted back to class's slab list, if
577 * it's not already done.
578 *
579 * Compartmentalizing of the object chunks into slabs allows us to easily
580 * merge one or more slabs together when the adjacent slabs are idle, as
581 * well as to convert or move a slab from one class to another; e.g. the
582 * mbuf cluster slab can be converted to a regular cluster slab when all
583 * mbufs in the slab have been freed.
584 *
585 * A slab may also span across multiple clusters for chunks larger than
586 * a cluster's size. In this case, only the slab of the first cluster is
587 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
588 * that they are part of the larger slab.
589 *
590 * Each slab controls a page of memory.
591 */
/* Per-cluster slab control structure; see the description above. */
typedef struct mcl_slab {
	struct mcl_slab *sl_next;       /* neighboring slab */
	u_int8_t        sl_class;       /* controlling mbuf class */
	int8_t          sl_refcnt;      /* outstanding allocations */
	int8_t          sl_chunks;      /* chunks (bufs) in this slab */
	u_int16_t       sl_flags;       /* slab flags (see below) */
	u_int16_t       sl_len;         /* slab length */
	void            *sl_base;       /* base of allocated memory */
	void            *sl_head;       /* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
} mcl_slab_t;
603
604 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
605 #define SLF_PARTIAL 0x0002 /* part of another slab */
606 #define SLF_DETACHED 0x0004 /* not in slab freelist */
607
608 /*
609 * The array of slabs are broken into groups of arrays per 1MB of kernel
610 * memory to reduce the footprint. Each group is allocated on demand
611 * whenever a new piece of memory mapped in from the VM crosses the 1MB
612 * boundary.
613 */
614 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
615
/*
 * One on-demand-allocated group of NSLABSPMB slabs (see note above).
 */
typedef struct mcl_slabg {
	mcl_slab_t      *slg_slab;      /* group of slabs */
} mcl_slabg_t;
619
620 /*
621 * Number of slabs needed to control a 16KB cluster object.
622 */
623 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
624
625 #if CONFIG_MBUF_MCACHE
626 /*
627 * Per-cluster audit structure.
628 */
typedef struct {
	/* array of audits: one entry per object chunk in the cluster
	 * (see the mclaudit[] description in the notes above) */
	mcache_audit_t **cl_audit;
} mcl_audit_t;
632
/*
 * Records the current and previous buffer transactions for an object:
 * the owning thread, a millisecond timestamp, and a pc backtrace for each.
 */
typedef struct {
	struct thread   *msa_thread;    /* thread doing transaction */
	struct thread   *msa_pthread;   /* previous transaction thread */
	uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
	uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
	uint16_t        msa_depth;      /* pc stack depth */
	uint16_t        msa_pdepth;     /* previous transaction pc stack */
	void            *msa_stack[MCACHE_STACK_DEPTH];  /* current backtrace */
	void            *msa_pstack[MCACHE_STACK_DEPTH]; /* previous backtrace */
} mcl_scratch_audit_t;
643
typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers m_hdr,
	 * pkthdr and m_ext structures. If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf gets copied into it when the mbuf
	 * is freed. This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
	 * cluster cache case). Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t        sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
	/* 8-byte-aligned record of the last two transactions on the object */
	mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
} mcl_saved_contents_t;
658
659 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
660
661 #define MCA_SAVED_MBUF_PTR(_mca) \
662 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
663 (_mca)->mca_contents)->sc_mbuf)
664 #define MCA_SAVED_MBUF_SIZE \
665 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
666 #define MCA_SAVED_SCRATCH_PTR(_mca) \
667 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
668
669 /*
670 * mbuf specific mcache audit flags
671 */
672 #define MB_INUSE 0x01 /* object has not been returned to slab */
673 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
674 #define MB_SCVALID 0x04 /* object has valid saved contents */
675
676 /*
677 * Each of the following two arrays hold up to nmbclusters elements.
678 */
679 static mcl_audit_t *mclaudit; /* array of cluster audit information */
680 static unsigned int maxclaudit; /* max # of entries in audit table */
681 static mcl_slabg_t **slabstbl; /* cluster slabs table */
682 static unsigned int maxslabgrp; /* max # of entries in slabs table */
683 static unsigned int slabgrp; /* # of entries in slabs table */
684 #endif /* CONFIG_MBUF_MCACHE */
685
686 /* Globals */
687 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
688 int njcl; /* # of clusters for jumbo sizes */
689 int njclbytes; /* size of a jumbo cluster */
690 unsigned char *mbutl; /* first mapped cluster address */
691 unsigned char *embutl; /* ending virtual address of mclusters */
692 int max_linkhdr; /* largest link-level header */
693 int max_protohdr; /* largest protocol header */
694 int max_hdr; /* largest link+protocol header */
695 int max_datalen; /* MHLEN - max_hdr */
696
697 #if CONFIG_MBUF_MCACHE
698 static boolean_t mclverify; /* debug: pattern-checking */
699 static boolean_t mcltrace; /* debug: stack tracing */
700 static boolean_t mclfindleak; /* debug: leak detection */
701 static boolean_t mclexpleak; /* debug: expose leak info to user space */
702
703 static struct timeval mb_start; /* beginning of time */
704
705 /* mbuf leak detection variables */
706 static struct mleak_table mleak_table;
707 static mleak_stat_t *mleak_stat;
708
709 #define MLEAK_STAT_SIZE(n) \
710 __builtin_offsetof(mleak_stat_t, ml_trace[n])
711
/*
 * Leak-detection hash bucket describing one outstanding allocation;
 * trace_index links it to the mtrace bucket holding its backtrace.
 */
struct mallocation {
	mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
	u_int32_t count;        /* How many objects were requested */
	u_int64_t hitcount;     /* for determining hash effectiveness */
};
718
/*
 * Leak-detection hash bucket describing a unique allocation backtrace.
 */
struct mtrace {
	u_int64_t       collisions;     /* bucket collision count */
	u_int64_t       hitcount;       /* for determining hash effectiveness */
	u_int64_t       allocs;         /* allocations recorded for this trace */
	u_int64_t       depth;          /* number of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace pcs */
};
726
727 /* Size must be a power of two for the zhash to be able to just mask off bits */
728 #define MLEAK_ALLOCATION_MAP_NUM 512
729 #define MLEAK_TRACE_MAP_NUM 256
730
731 /*
732 * Sample factor for how often to record a trace. This is overwritable
733 * by the boot-arg mleak_sample_factor.
734 */
735 #define MLEAK_SAMPLE_FACTOR 500
736
737 /*
738 * Number of top leakers recorded.
739 */
740 #define MLEAK_NUM_TRACES 5
741
742 #define MB_LEAK_SPACING_64 " "
743 #define MB_LEAK_SPACING_32 " "
744
745
746 #define MB_LEAK_HDR_32 "\n\
747 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
748 ---------- ---------- ---------- ---------- ---------- \n\
749 "
750
751 #define MB_LEAK_HDR_64 "\n\
752 trace [1] trace [2] trace [3] \
753 trace [4] trace [5] \n\
754 ------------------ ------------------ ------------------ \
755 ------------------ ------------------ \n\
756 "
757
758 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
759 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
760
761 /* Hashmaps of allocations and their corresponding traces */
762 static struct mallocation *mleak_allocations;
763 static struct mtrace *mleak_traces;
764 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
765
766 /* Lock to protect mleak tables from concurrent modification */
767 static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
768 static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
769 static lck_mtx_t *const mleak_lock = &mleak_lock_data;
770
771 /* *Failed* large allocations. */
/* Record of one *failed* large allocation (see mtracelarge_register()). */
struct mtracelarge {
	uint64_t        size;           /* size of the failed request */
	uint64_t        depth;          /* number of valid entries in addr[] */
	uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace of the failure */
};
777
778 #define MTRACELARGE_NUM_TRACES 5
779 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
780
781 static void mtracelarge_register(size_t size);
782 #endif /* CONFIG_MBUF_MCACHE */
783
784 /* Lock to protect the completion callback table */
785 static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
786 LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
787
788 extern u_int32_t high_sb_max;
789
790 /* The minimum number of objects that are allocated, to start. */
791 #define MINCL 32
792 #define MINBIGCL (MINCL >> 1)
793 #define MIN16KCL (MINCL >> 2)
794
795 /* Low watermarks (only map in pages once free counts go below) */
796 #define MBIGCL_LOWAT MINBIGCL
797 #define M16KCL_LOWAT MIN16KCL
798
/*
 * Per-class allocator state; one entry per mbuf_class_t in mbuf_table[].
 */
typedef struct {
	mbuf_class_t    mtbl_class;     /* class type */
#if CONFIG_MBUF_MCACHE
	mcache_t        *mtbl_cache;    /* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
#endif /* CONFIG_MBUF_MCACHE */
	mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
	u_int32_t       mtbl_maxsize;   /* maximum buffer size */
	int             mtbl_minlimit;  /* minimum allowed */
	int             mtbl_maxlimit;  /* maximum allowed */
	u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
	uint32_t        mtbl_avgtotal;  /* average total on iOS */
	u_int32_t       mtbl_expand;    /* worker should expand the class */
} mbuf_table_t;
814
815 #define m_class(c) mbuf_table[c].mtbl_class
816 #if CONFIG_MBUF_MCACHE
817 #define m_cache(c) mbuf_table[c].mtbl_cache
818 #define m_slablist(c) mbuf_table[c].mtbl_slablist
819 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
820 #else
821 #define m_stats(c) mbuf_table[c].mtbl_stats
822 #endif /* CONFIG_MBUF_MCACHE */
823 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
824 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
825 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
826 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
827 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
828 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
829 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
830 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
831 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
832 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
833 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
834 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
835 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
836 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
837 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
838 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
839 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
840 #define m_region_expand(c) mbuf_table[c].mtbl_expand
841
/* One entry per buffer class, indexed by mbuf_class_t. */
static mbuf_table_t mbuf_table[] = {
#if CONFIG_MBUF_MCACHE
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered by actual
	 * usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	  NULL, NULL, 0, 0, 0, 0, 3000, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	  NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	  NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	  NULL, NULL, 0, 0, 0, 0, 200, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches. Each object
	 * in the cache is an mbuf with a cluster attached to it. Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
#else
	/*
	 * zalloc configuration: only the class is set here; the remaining
	 * fields are zero-initialized and filled in elsewhere (not visible
	 * in this section of the file).
	 */
	{ .mtbl_class = MC_MBUF },
	{ .mtbl_class = MC_CL },
	{ .mtbl_class = MC_BIGCL },
	{ .mtbl_class = MC_16KCL },
	{ .mtbl_class = MC_MBUF_CL },
	{ .mtbl_class = MC_MBUF_BIGCL },
	{ .mtbl_class = MC_MBUF_16KCL },
#endif /* CONFIG_MBUF_MCACHE */
};
878
879 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
880
881 #if SKYWALK && CONFIG_MBUF_MCACHE
882 #define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
883 static unsigned int mc_threshold_scale_down_factor =
884 MC_THRESHOLD_SCALE_DOWN_FACTOR;
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
886
887 #if CONFIG_MBUF_MCACHE
888 static uint32_t
m_avgtotal(mbuf_class_t c)889 m_avgtotal(mbuf_class_t c)
890 {
891 #if SKYWALK
892 return if_is_fsw_transport_netagent_enabled() ?
893 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
894 mbuf_table[c].mtbl_avgtotal;
895 #else /* !SKYWALK */
896 return mbuf_table[c].mtbl_avgtotal;
897 #endif /* SKYWALK */
898 }
899 #endif /* CONFIG_MBUF_MCACHE */
900
901 #if CONFIG_MBUF_MCACHE
902 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
903 static int mb_waiters; /* number of waiters */
904 #endif /* CONFIG_MBUF_MCACHE */
905
906 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
907 #if CONFIG_MBUF_MCACHE
908 static struct timeval mb_wdtstart; /* watchdog start timestamp */
909 static char *mbuf_dump_buf;
910
911 #define MBUF_DUMP_BUF_SIZE 4096
912
913 /*
 * mbuf watchdog is enabled by default. It is also toggleable via the
915 * kern.ipc.mb_watchdog sysctl.
916 * Garbage collection is enabled by default on embedded platforms.
917 * mb_drain_maxint controls the amount of time to wait (in seconds) before
918 * consecutive calls to mbuf_drain().
919 */
920 static unsigned int mb_watchdog = 1;
921 #if !XNU_TARGET_OS_OSX
922 static unsigned int mb_drain_maxint = 60;
923 #else /* XNU_TARGET_OS_OSX */
924 static unsigned int mb_drain_maxint = 0;
925 #endif /* XNU_TARGET_OS_OSX */
926 #endif /* CONFIG_MBUF_MCACHE */
927 static unsigned int mb_memory_pressure_percentage = 80;
928
929 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
930 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
931
932 /* Red zone */
933 static u_int32_t mb_redzone_cookie;
934 static void m_redzone_init(struct mbuf *);
935 static void m_redzone_verify(struct mbuf *m);
936
937 static void m_set_rfa(struct mbuf *, struct ext_ref *);
938
939 #if CONFIG_MBUF_MCACHE
940 /* The following are used to serialize m_clalloc() */
941 static boolean_t mb_clalloc_busy;
942 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
943 static int mb_clalloc_waiters;
944 #endif /* CONFIG_MBUF_MCACHE */
945
946 static void mbuf_mtypes_sync(boolean_t);
947 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
948 static void mbuf_stat_sync(void);
949 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
950 #if CONFIG_MBUF_MCACHE
951 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
952 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
953 static char *mbuf_dump(void);
954 #endif /* CONFIG_MBUF_MCACHE */
955 static void mbuf_table_init(void);
956 static inline void m_incref(struct mbuf *);
957 static inline u_int16_t m_decref(struct mbuf *);
958 static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
959 #if CONFIG_MBUF_MCACHE
960 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
961 static void mbuf_worker_thread_init(void);
962 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
963 static void slab_free(mbuf_class_t, mcache_obj_t *);
964 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
965 unsigned int, int);
966 static void mbuf_slab_free(void *, mcache_obj_t *, int);
967 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
968 static void mbuf_slab_notify(void *, u_int32_t);
969 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
970 unsigned int);
971 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
972 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
973 unsigned int, int);
974 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
975 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
976 static int freelist_populate(mbuf_class_t, unsigned int, int);
977 static void freelist_init(mbuf_class_t);
978 static boolean_t mbuf_cached_above(mbuf_class_t, int);
979 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
980 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
981 static int m_howmany(int, size_t);
982 static void mbuf_worker_thread(void);
983 static void mbuf_watchdog(void);
984 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
985
986 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
987 size_t, unsigned int);
988 static void mcl_audit_free(void *, unsigned int);
989 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
990 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
991 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
992 boolean_t);
993 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
994 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
995 static void mcl_audit_scratch(mcache_audit_t *);
996 static void mcl_audit_mcheck_panic(struct mbuf *);
997 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
998
999 static void mleak_activate(void);
1000 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
1001 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
1002 static void mleak_free(mcache_obj_t *);
1003 static void mleak_sort_traces(void);
1004 static void mleak_update_stats(void);
1005
1006 static mcl_slab_t *slab_get(void *);
1007 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
1008 void *, void *, unsigned int, int, int);
1009 static void slab_insert(mcl_slab_t *, mbuf_class_t);
1010 static void slab_remove(mcl_slab_t *, mbuf_class_t);
1011 static boolean_t slab_inrange(mcl_slab_t *, void *);
1012 static void slab_nextptr_panic(mcl_slab_t *, void *);
1013 static void slab_detach(mcl_slab_t *);
1014 static boolean_t slab_is_detached(mcl_slab_t *);
1015 #else /* !CONFIG_MBUF_MCACHE */
1016 static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
1017 static struct mbuf *mz_alloc(zalloc_flags_t);
1018 static void mz_free(struct mbuf *);
1019 static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
1020 static void mz_ref_free(struct ext_ref *);
1021 static void *mz_cl_alloc(zone_id_t, zalloc_flags_t);
1022 static void mz_cl_free(zone_id_t, void *);
1023 static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
1024 static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
1025 static void mz_composite_free(mbuf_class_t, struct mbuf *);
1026 static void mz_composite_free_n(mbuf_class_t, zstack_t);
1027 static void *mz_composite_build(zone_id_t, zalloc_flags_t);
1028 static void *mz_composite_mark_valid(zone_id_t, void *);
1029 static void *mz_composite_mark_invalid(zone_id_t, void *);
1030 static void mz_composite_destroy(zone_id_t, void *);
1031
1032 ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
1033 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1034 ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
1035 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1036 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
1037 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1038 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
1039 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1040 ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
1041 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1042 static_assert(sizeof(union mcluster) == MCLBYTES);
1043 static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
1044 static_assert(sizeof(union m16kcluster) == M16KCLBYTES);
1045
1046 static const struct zone_cache_ops mz_composite_ops = {
1047 .zc_op_alloc = mz_composite_build,
1048 .zc_op_mark_valid = mz_composite_mark_valid,
1049 .zc_op_mark_invalid = mz_composite_mark_invalid,
1050 .zc_op_free = mz_composite_destroy,
1051 };
1052 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
1053 sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
1054 &mz_composite_ops);
1055 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
1056 sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
1057 &mz_composite_ops);
1058 ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
1059 sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
1060 &mz_composite_ops);
1061 static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
1062 static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
1063 static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
1064 static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
1065 static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
1066 static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
1067 static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
1068
1069 /* Converts a an mbuf class to a zalloc zone ID. */
1070 __attribute__((always_inline))
1071 static inline zone_id_t
m_class_to_zid(mbuf_class_t class)1072 m_class_to_zid(mbuf_class_t class)
1073 {
1074 return ZONE_ID_MBUF + class - MC_MBUF;
1075 }
1076
1077 __attribute__((always_inline))
1078 static inline mbuf_class_t
m_class_from_zid(zone_id_t zid)1079 m_class_from_zid(zone_id_t zid)
1080 {
1081 return MC_MBUF + zid - ZONE_ID_MBUF;
1082 }
1083
1084 static thread_call_t mbuf_defunct_tcall;
1085 static thread_call_t mbuf_drain_tcall;
1086 #endif /* CONFIG_MBUF_MCACHE */
1087
1088 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
1089 static struct mbuf *m_split0(struct mbuf *, int, int, int);
1090 #if CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT)
1091 #define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
1092 static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
1093 static char *mbwdog_logging;
1094 const unsigned mbwdog_logging_size = 4096;
1095 static size_t mbwdog_logging_used;
1096 #else
1097 #define mbwdog_logger(fmt, ...) do { } while (0)
#endif /* CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT) */
1099 #if CONFIG_MBUF_MCACHE
1100 static void mbuf_drain_locked(boolean_t);
1101 #endif /* CONFIG_MBUF_MCACHE */
1102
1103 /* flags for m_copyback0 */
1104 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
1105 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
1106 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
1107 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
1108
1109 /*
1110 * This flag is set for all mbufs that come out of and into the composite
1111 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
1112 * are marked with such a flag have clusters attached to them, and will be
1113 * treated differently when they are freed; instead of being placed back
1114 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
1115 * are placed back into the appropriate composite cache's freelist, and the
1116 * actual freeing is deferred until the composite objects are purged. At
1117 * such a time, this flag will be cleared from the mbufs and the objects
1118 * will be freed into their own separate freelists.
1119 */
1120 #define EXTF_COMPOSITE 0x1
1121
1122 /*
1123 * This flag indicates that the external cluster is read-only, i.e. it is
1124 * or was referred to by more than one mbufs. Once set, this flag is never
1125 * cleared.
1126 */
1127 #define EXTF_READONLY 0x2
1128 /*
1129 * This flag indicates that the external cluster is paired with the mbuf.
1130 * Pairing implies an external free routine defined which will be invoked
1131 * when the reference count drops to the minimum at m_free time. This
1132 * flag is never cleared.
1133 */
1134 #define EXTF_PAIRED 0x4
1135
1136 #define EXTF_MASK \
1137 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
1138
1139 #define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
1140 #define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
1141 #define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
1142 #define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
1143 #define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
1144 #define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
1145 #define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
1146 #define MBUF_IS_COMPOSITE(m) \
1147 (MEXT_REF(m) == MEXT_MINREF(m) && \
1148 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
1149 /*
1150 * This macro can be used to test if the mbuf is paired to an external
1151 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
1152 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
1153 * and thus survives calls to m_free_paired.
1154 */
1155 #define MBUF_IS_PAIRED(m) \
1156 (((m)->m_flags & M_EXT) && \
1157 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
1158 MEXT_PMBUF(m) == (m))
1159
1160 /*
1161 * Macros used to verify the integrity of the mbuf.
1162 */
1163 #if CONFIG_MBUF_MCACHE
1164 #define _MCHECK(m) { \
1165 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1166 if (mclaudit == NULL) \
1167 panic("MCHECK: m_type=%d m=%p", \
1168 (u_int16_t)(m)->m_type, m); \
1169 else \
1170 mcl_audit_mcheck_panic(m); \
1171 } \
1172 }
1173 #else
1174 #define _MCHECK(m) \
1175 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1176 panic("MCHECK: m_type=%d m=%p", \
1177 (u_int16_t)(m)->m_type, m); \
1178 }
1179 #endif /* CONFIG_MBUF_MCACHE */
1180
1181 /*
1182 * Macro version of mtod.
1183 */
1184 #define MTOD(m, t) ((t)((m)->m_data))
1185
1186 #if CONFIG_MBUF_MCACHE
1187 #define MBUF_IN_MAP(addr) \
1188 ((unsigned char *)(addr) >= mbutl && \
1189 (unsigned char *)(addr) < embutl)
1190
1191 #define MRANGE(addr) { \
1192 if (!MBUF_IN_MAP(addr)) \
1193 panic("MRANGE: address out of range 0x%p", addr); \
1194 }
1195
1196 /*
1197 * Macros to obtain page index given a base cluster address
1198 */
1199 #define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
1200 #define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
1201
1202 /*
1203 * Macro to find the mbuf index relative to a base.
1204 */
1205 #define MBPAGEIDX(c, m) \
1206 (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)
1207
1208 /*
1209 * Same thing for 2KB cluster index.
1210 */
1211 #define CLPAGEIDX(c, m) \
1212 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
1213
1214 /*
1215 * Macro to find 4KB cluster index relative to a base
1216 */
1217 #define BCLPAGEIDX(c, m) \
1218 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
1219 #endif /* CONFIG_MBUF_MCACHE */
1220
1221 /*
1222 * Macros used during mbuf and cluster initialization.
1223 */
1224 #define MBUF_INIT_PKTHDR(m) { \
1225 (m)->m_pkthdr.rcvif = NULL; \
1226 (m)->m_pkthdr.pkt_hdr = NULL; \
1227 (m)->m_pkthdr.len = 0; \
1228 (m)->m_pkthdr.csum_flags = 0; \
1229 (m)->m_pkthdr.csum_data = 0; \
1230 (m)->m_pkthdr.vlan_tag = 0; \
1231 (m)->m_pkthdr.comp_gencnt = 0; \
1232 (m)->m_pkthdr.pkt_crumbs = 0; \
1233 m_classifier_init(m, 0); \
1234 m_tag_init(m, 1); \
1235 m_scratch_init(m); \
1236 m_redzone_init(m); \
1237 }
1238
1239 #define MBUF_INIT(m, pkthdr, type) { \
1240 _MCHECK(m); \
1241 (m)->m_next = (m)->m_nextpkt = NULL; \
1242 (m)->m_len = 0; \
1243 (m)->m_type = type; \
1244 if ((pkthdr) == 0) { \
1245 (m)->m_data = (uintptr_t)(m)->m_dat; \
1246 (m)->m_flags = 0; \
1247 } else { \
1248 (m)->m_data = (uintptr_t)(m)->m_pktdat; \
1249 (m)->m_flags = M_PKTHDR; \
1250 MBUF_INIT_PKTHDR(m); \
1251 } \
1252 }
1253
1254 #define MEXT_INIT mext_init
1255
1256 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
1257 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
1258 ref, 0, flag, 0, NULL)
1259
1260 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
1261 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
1262 ref, 0, flag, 0, NULL)
1263
1264 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
1265 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
1266 ref, 0, flag, 0, NULL)
1267
1268 /*
1269 * Macro to convert BSD malloc sleep flag to mcache's
1270 */
1271 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
1272
1273 /*
1274 * The structure that holds all mbuf class statistics exportable via sysctl.
1275 * Similar to mbstat structure, the mb_stat structure is protected by the
1276 * global mbuf lock. It contains additional information about the classes
1277 * that allows for a more accurate view of the state of the allocator.
1278 */
1279 struct mb_stat *mb_stat;
1280 struct omb_stat *omb_stat; /* For backwards compatibility */
1281
1282 #define MB_STAT_SIZE(n) \
1283 __builtin_offsetof(mb_stat_t, mbs_class[n])
1284 #define OMB_STAT_SIZE(n) \
1285 __builtin_offsetof(struct omb_stat, mbs_class[n])
1286
1287 /*
1288 * The legacy structure holding all of the mbuf allocation statistics.
1289 * The actual statistics used by the kernel are stored in the mbuf_table
1290 * instead, and are updated atomically while the global mbuf lock is held.
1291 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1292 * Unlike before, the kernel no longer relies on the contents of mbstat for
1293 * its operations (e.g. cluster expansion) because the structure is exposed
1294 * to outside and could possibly be modified, therefore making it unsafe.
1295 * With the exception of the mbstat.m_mtypes array (see below), all of the
1296 * statistics are updated as they change.
1297 */
1298 struct mbstat mbstat;
1299
1300 #define MBSTAT_MTYPES_MAX \
1301 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1302
1303 /*
1304 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1305 * atomically and stored in a per-CPU structure which is lock-free; this is
1306 * done in order to avoid writing to the global mbstat data structure which
1307 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1308 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1309 * array and returned to the application. Any updates for types greater or
1310 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1311 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1312 * anything beyond that (up to type 255) is considered a corner case.
1313 */
1314 typedef struct {
1315 unsigned int cpu_mtypes[MT_MAX];
1316 } mbuf_mtypes_t;
1317
1318 static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1319
1320 #define mtype_stat_add(type, n) { \
1321 if ((unsigned)(type) < MT_MAX) { \
1322 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1323 os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \
1324 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1325 os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \
1326 } \
1327 }
1328
1329 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1330 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1331 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
1332
1333 static inline void
mext_init(struct mbuf * m,void * __sized_by (size)buf,u_int size,m_ext_free_func_t free,caddr_t free_arg,struct ext_ref * rfa,u_int16_t min,u_int16_t ref,u_int16_t pref,u_int16_t flag,u_int32_t priv,struct mbuf * pm)1334 mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size,
1335 m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa,
1336 u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag,
1337 u_int32_t priv, struct mbuf *pm)
1338 {
1339 m->m_ext.ext_buf = buf;
1340 m->m_ext.ext_size = size;
1341 m->m_data = (uintptr_t)m->m_ext.ext_buf;
1342 m->m_len = 0;
1343 m->m_flags |= M_EXT;
1344 m_set_ext(m, rfa, free, free_arg);
1345 MEXT_MINREF(m) = min;
1346 MEXT_REF(m) = ref;
1347 MEXT_PREF(m) = pref;
1348 MEXT_FLAGS(m) = flag;
1349 MEXT_PRIV(m) = priv;
1350 MEXT_PMBUF(m) = pm;
1351 }
1352
1353 static void
mbuf_mtypes_sync(boolean_t locked)1354 mbuf_mtypes_sync(boolean_t locked)
1355 {
1356 mbuf_mtypes_t mtc;
1357
1358 if (locked) {
1359 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1360 }
1361
1362 mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
1363 percpu_foreach_secondary(mtype, mbuf_mtypes) {
1364 for (int n = 0; n < MT_MAX; n++) {
1365 mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
1366 }
1367 }
1368
1369 if (!locked) {
1370 lck_mtx_lock(mbuf_mlock);
1371 }
1372 for (int n = 0; n < MT_MAX; n++) {
1373 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1374 }
1375 if (!locked) {
1376 lck_mtx_unlock(mbuf_mlock);
1377 }
1378 }
1379
/*
 * sysctl handler for kern.ipc.mbstat: refreshes and exports the legacy
 * mbstat structure.  With mcache, only the per-CPU type counters need
 * converging; with zalloc, the class statistics must be synced first,
 * all under mbuf_mlock.
 */
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

#if CONFIG_MBUF_MCACHE
	mbuf_mtypes_sync(FALSE);
#else
	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();
	mbuf_mtypes_sync(TRUE);
	lck_mtx_unlock(mbuf_mlock);
#endif

	return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
}
1396
1397 static void
mbuf_stat_sync(void)1398 mbuf_stat_sync(void)
1399 {
1400 mb_class_stat_t *sp;
1401 #if CONFIG_MBUF_MCACHE
1402 mcache_cpu_t *ccp;
1403 mcache_t *cp;
1404 int k, m, bktsize;
1405 #else
1406 int k;
1407 uint64_t drops = 0;
1408 #endif /* CONFIG_MBUF_MCACHE */
1409
1410
1411 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1412
1413 #if CONFIG_MBUF_MCACHE
1414 for (k = 0; k < NELEM(mbuf_table); k++) {
1415 cp = m_cache(k);
1416 ccp = &cp->mc_cpu[0];
1417 bktsize = ccp->cc_bktsize;
1418 sp = mbuf_table[k].mtbl_stats;
1419
1420 if (cp->mc_flags & MCF_NOCPUCACHE) {
1421 sp->mbcl_mc_state = MCS_DISABLED;
1422 } else if (cp->mc_purge_cnt > 0) {
1423 sp->mbcl_mc_state = MCS_PURGING;
1424 } else if (bktsize == 0) {
1425 sp->mbcl_mc_state = MCS_OFFLINE;
1426 } else {
1427 sp->mbcl_mc_state = MCS_ONLINE;
1428 }
1429
1430 sp->mbcl_mc_cached = 0;
1431 for (m = 0; m < ncpu; m++) {
1432 ccp = &cp->mc_cpu[m];
1433 if (ccp->cc_objs > 0) {
1434 sp->mbcl_mc_cached += ccp->cc_objs;
1435 }
1436 if (ccp->cc_pobjs > 0) {
1437 sp->mbcl_mc_cached += ccp->cc_pobjs;
1438 }
1439 }
1440 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1441 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1442 sp->mbcl_infree;
1443
1444 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1445 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1446 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1447
1448 /* Calculate total count specific to each class */
1449 sp->mbcl_ctotal = sp->mbcl_total;
1450 switch (m_class(k)) {
1451 case MC_MBUF:
1452 /* Deduct mbufs used in composite caches */
1453 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1454 m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL));
1455 break;
1456
1457 case MC_CL:
1458 /* Deduct clusters used in composite cache */
1459 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1460 break;
1461
1462 case MC_BIGCL:
1463 /* Deduct clusters used in composite cache */
1464 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1465 break;
1466
1467 case MC_16KCL:
1468 /* Deduct clusters used in composite cache */
1469 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1470 break;
1471
1472 default:
1473 break;
1474 }
1475 }
1476 #else
1477 for (k = 0; k < NELEM(mbuf_table); k++) {
1478 const zone_id_t zid = m_class_to_zid(m_class(k));
1479 const zone_t zone = zone_by_id(zid);
1480 struct zone_basic_stats stats = {};
1481
1482 sp = m_stats(k);
1483 zone_get_stats(zone, &stats);
1484 drops += stats.zbs_alloc_fail;
1485 sp->mbcl_total = stats.zbs_avail;
1486 sp->mbcl_active = stats.zbs_alloc;
1487 /*
1488 * infree is what mcache considers the freelist (uncached)
1489 * free_cnt contains all the cached/uncached elements
1490 * in a zone.
1491 */
1492 sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
1493 sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
1494 sp->mbcl_ctotal = sp->mbcl_total;
1495
1496 /* These stats are not available in zalloc. */
1497 sp->mbcl_alloc_cnt = 0;
1498 sp->mbcl_free_cnt = 0;
1499 sp->mbcl_notified = 0;
1500 sp->mbcl_purge_cnt = 0;
1501 sp->mbcl_slab_cnt = 0;
1502 sp->mbcl_release_cnt = 0;
1503
1504 /* zalloc caches are always on. */
1505 sp->mbcl_mc_state = MCS_ONLINE;
1506 sp->mbcl_mc_cached = stats.zbs_cached;
1507 /* These stats are not collected by zalloc. */
1508 sp->mbcl_mc_waiter_cnt = 0;
1509 sp->mbcl_mc_wretry_cnt = 0;
1510 sp->mbcl_mc_nwretry_cnt = 0;
1511 }
1512 /* Deduct clusters used in composite cache */
1513 m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
1514 m_total(MC_MBUF_BIGCL) -
1515 m_total(MC_MBUF_16KCL));
1516 m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
1517 m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
1518 m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
1519
1520 /* Update mbstat. */
1521 mbstat.m_mbufs = m_total(MC_MBUF);
1522 mbstat.m_clusters = m_total(MC_CL);
1523 mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
1524 mbstat.m_drops = drops;
1525 mbstat.m_bigclusters = m_total(MC_BIGCL);
1526 mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
1527 #endif /* CONFIG_MBUF_MCACHE */
1528 }
1529
/*
 * sysctl handler for kern.ipc.mb_stat: exports the per-class statistics.
 * 64-bit callers get mb_stat directly; 32-bit callers get the statistics
 * copied field-by-field into the legacy omb_stat layout.  The sync and
 * the copy both happen under mbuf_mlock for a consistent snapshot.
 */
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		/* 32-bit process: convert to the old structure layout. */
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return SYSCTL_OUT(req, statp, statsz);
}
1579
1580 #if !CONFIG_MBUF_MCACHE
1581 /*
1582 * The following functions are wrappers around mbuf
1583 * allocation for zalloc. They all have the prefix "mz"
1584 * which was chosen to avoid conflicts with the mbuf KPIs.
1585 *
1586 * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
1587 * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
1588 * fail immediately if it has to take a mutex and that
1589 * may cause packets to be dropped more frequently.
1590 * In general, the mbuf subsystem can sustain grabbing a mutex
1591 * during "non-blocking" allocation and that's the reason
1592 * why Z_NOPAGEWAIT was chosen.
1593 *
1594 * mbufs are elided (removed all pointers) before they are
1595 * returned to the cache. The exception are composite mbufs which
1596 * are re-initialized on allocation.
1597 */
1598 __attribute__((always_inline))
1599 static inline void
m_elide(struct mbuf * m)1600 m_elide(struct mbuf *m)
1601 {
1602 m->m_next = m->m_nextpkt = NULL;
1603 m->m_data = 0;
1604 memset(&m->m_ext, 0, sizeof(m->m_ext));
1605 m->m_pkthdr.rcvif = NULL;
1606 m->m_pkthdr.pkt_hdr = NULL;
1607 m->m_flags |= M_PKTHDR;
1608 m_tag_init(m, 1);
1609 m->m_pkthdr.pkt_flags = 0;
1610 m_scratch_init(m);
1611 m->m_pkthdr.redzone = 0;
1612 m->m_flags &= ~M_PKTHDR;
1613 }
1614
1615 __attribute__((always_inline))
1616 static inline struct mbuf *
mz_alloc(zalloc_flags_t flags)1617 mz_alloc(zalloc_flags_t flags)
1618 {
1619 if (flags & Z_NOWAIT) {
1620 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1621 } else if (!(flags & Z_NOPAGEWAIT)) {
1622 flags |= Z_NOFAIL;
1623 }
1624 return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
1625 }
1626
1627 __attribute__((always_inline))
1628 static inline zstack_t
mz_alloc_n(uint32_t count,zalloc_flags_t flags)1629 mz_alloc_n(uint32_t count, zalloc_flags_t flags)
1630 {
1631 if (flags & Z_NOWAIT) {
1632 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1633 } else if (!(flags & Z_NOPAGEWAIT)) {
1634 flags |= Z_NOFAIL;
1635 }
1636 return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
1637 }
1638
1639 __attribute__((always_inline))
1640 static inline void
mz_free(struct mbuf * m)1641 mz_free(struct mbuf *m)
1642 {
1643 #if KASAN
1644 zone_require(zone_by_id(ZONE_ID_MBUF), m);
1645 #endif
1646 m_elide(m);
1647 zfree_nozero(ZONE_ID_MBUF, m);
1648 }
1649
1650 __attribute__((always_inline))
1651 static inline void
mz_free_n(zstack_t list)1652 mz_free_n(zstack_t list)
1653 {
1654 /* Callers of this function have already elided the mbuf. */
1655 zfree_nozero_n(ZONE_ID_MBUF, list);
1656 }
1657
1658 __attribute__((always_inline))
1659 static inline struct ext_ref *
mz_ref_alloc(zalloc_flags_t flags)1660 mz_ref_alloc(zalloc_flags_t flags)
1661 {
1662 if (flags & Z_NOWAIT) {
1663 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1664 }
1665 return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
1666 }
1667
1668 __attribute__((always_inline))
1669 static inline void
mz_ref_free(struct ext_ref * rfa)1670 mz_ref_free(struct ext_ref *rfa)
1671 {
1672 VERIFY(rfa->minref == rfa->refcnt);
1673 #if KASAN
1674 zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1675 #endif
1676 zfree_nozero(ZONE_ID_MBUF_REF, rfa);
1677 }
1678
1679 __attribute__((always_inline))
1680 static inline void *
mz_cl_alloc(zone_id_t zid,zalloc_flags_t flags)1681 mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
1682 {
1683 if (flags & Z_NOWAIT) {
1684 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1685 } else if (!(flags & Z_NOPAGEWAIT)) {
1686 flags |= Z_NOFAIL;
1687 }
1688 return (zalloc_id)(zid, flags | Z_NOZZC);
1689 }
1690
1691 __attribute__((always_inline))
1692 static inline void
mz_cl_free(zone_id_t zid,void * cl)1693 mz_cl_free(zone_id_t zid, void *cl)
1694 {
1695 #if KASAN
1696 zone_require(zone_by_id(zid), cl);
1697 #endif
1698 zfree_nozero(zid, cl);
1699 }
1700
1701 __attribute__((always_inline))
1702 static inline zstack_t
mz_composite_alloc_n(mbuf_class_t class,unsigned int n,zalloc_flags_t flags)1703 mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
1704 {
1705 if (flags & Z_NOWAIT) {
1706 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1707 }
1708 return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
1709 &mz_composite_ops);
1710 }
1711
1712 __attribute__((always_inline))
1713 static inline struct mbuf *
mz_composite_alloc(mbuf_class_t class,zalloc_flags_t flags)1714 mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
1715 {
1716 zstack_t list = {};
1717 list = mz_composite_alloc_n(class, 1, flags);
1718 if (!zstack_empty(list)) {
1719 return zstack_pop(&list);
1720 } else {
1721 return NULL;
1722 }
1723 }
1724
/*
 * Return a zstack of composite objects of the given class to the
 * zone cache in one batched call.
 */
__attribute__((always_inline))
static inline void
mz_composite_free_n(mbuf_class_t class, zstack_t list)
{
	/* Parentheses force a direct call even if a same-named macro exists. */
	(zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
}
1731
1732 __attribute__((always_inline))
1733 static inline void
mz_composite_free(mbuf_class_t class,struct mbuf * m)1734 mz_composite_free(mbuf_class_t class, struct mbuf *m)
1735 {
1736 zstack_t list = {};
1737 zstack_push(&list, m);
1738 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1739 }
1740
1741 /* Converts composite zone ID to the cluster zone ID. */
1742 __attribute__((always_inline))
1743 static inline zone_id_t
mz_cl_zid(zone_id_t zid)1744 mz_cl_zid(zone_id_t zid)
1745 {
1746 return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
1747 }
1748
/*
 * zcache "build" callback: assemble one composite object for the zone
 * identified by zid.  A composite is an mbuf with an attached cluster
 * and ext_ref structure, marked EXTF_COMPOSITE.  The cluster zone is
 * derived from the composite zone via mz_cl_zid().
 *
 * Returns the assembled mbuf, or NULL if any of the three allocations
 * failed (partially-acquired pieces are released via the goto chain).
 */
static void *
mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct mbuf *m = NULL;
	struct ext_ref *rfa = NULL;
	void *cl = NULL;

	/* Acquire the three pieces; unwind in reverse order on failure. */
	cl = mz_cl_alloc(cl_zid, flags);
	if (__improbable(cl == NULL)) {
		goto out;
	}
	rfa = mz_ref_alloc(flags);
	if (__improbable(rfa == NULL)) {
		goto out_free_cl;
	}
	m = mz_alloc(flags);
	if (__improbable(m == NULL)) {
		goto out_free_rfa;
	}
	MBUF_INIT(m, 0, MT_FREE);
	/* Attach the cluster with the init macro matching its size class. */
	if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
		MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
		MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	} else {
		MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
	}
	VERIFY(m->m_flags == M_EXT);
	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));

	return m;
out_free_rfa:
	mz_ref_free(rfa);
out_free_cl:
	mz_cl_free(cl_zid, cl);
out:
	return NULL;
}
1788
/*
 * zcache "mark valid" callback: a cached composite is being handed out.
 * Under KASAN the cached pointers are quarantine-translated, so the
 * mbuf, its cluster, and its ext_ref must each be re-validated and the
 * mbuf's internal pointers (m_data, ext_buf, rfa) rewritten to the
 * translated addresses.  Without KASAN only the mbuf itself needs it.
 */
static void *
mz_composite_mark_valid(zone_id_t zid, void *p)
{
	struct mbuf *m = p;

	m = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF), m);
#if KASAN
	struct ext_ref *rfa = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = zcache_mark_valid(zone_by_id(cl_zid), cl);
	rfa = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
	/* Point the mbuf at the translated cluster and ref addresses. */
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	m_set_rfa(m, rfa);
#else
#pragma unused(zid)
#endif
	VERIFY(MBUF_IS_COMPOSITE(m));

	return m;
}
1812
/*
 * zcache "mark invalid" callback: a composite is going back into the
 * cache.  Mirror image of mz_composite_mark_valid(): under KASAN the
 * cluster and ext_ref are quarantine-translated and the mbuf's internal
 * pointers rewritten before the mbuf itself is invalidated.
 * Requires the composite to be fully idle (refcnt at its floor).
 */
static void *
mz_composite_mark_invalid(zone_id_t zid, void *p)
{
	struct mbuf *m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));
	/* Only an idle composite may be cached. */
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
#if KASAN
	struct ext_ref *rfa = m_get_rfa(m);
	const zone_id_t cl_zid = mz_cl_zid(zid);
	void *cl = m->m_ext.ext_buf;

	cl = zcache_mark_invalid(zone_by_id(cl_zid), cl);
	rfa = zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
	/* Store the translated addresses so mark_valid can undo this. */
	m->m_data = (uintptr_t)cl;
	m->m_ext.ext_buf = cl;
	m_set_rfa(m, rfa);
#else
#pragma unused(zid)
#endif

	return zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF), m);
}
1836
/*
 * zcache "destroy" callback: tear a composite apart and return its
 * three pieces (cluster, ext_ref, mbuf) to their respective zones.
 * All ext metadata is cleared first so the mbuf is freed in a pristine
 * MT_FREE state.
 */
static void
mz_composite_destroy(zone_id_t zid, void *p)
{
	const zone_id_t cl_zid = mz_cl_zid(zid);
	struct ext_ref *rfa = NULL;
	struct mbuf *m = p;

	VERIFY(MBUF_IS_COMPOSITE(m));

	/* Clear all external-buffer accounting before dismantling. */
	MEXT_MINREF(m) = 0;
	MEXT_REF(m) = 0;
	MEXT_PREF(m) = 0;
	MEXT_FLAGS(m) = 0;
	MEXT_PRIV(m) = 0;
	MEXT_PMBUF(m) = NULL;
	MEXT_TOKEN(m) = 0;

	/* Detach the ref first; m_set_ext clears the ext free/arg state. */
	rfa = m_get_rfa(m);
	m_set_ext(m, NULL, NULL, NULL);

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	/* Free the cluster before nulling ext_buf — order matters here. */
	mz_cl_free(cl_zid, m->m_ext.ext_buf);
	m->m_ext.ext_buf = NULL;
	mz_ref_free(rfa);
	mz_free(m);
}
1866 #endif /* !CONFIG_MBUF_MCACHE */
1867
1868 #if CONFIG_MBUF_MCACHE
/*
 * sysctl handler: export the top mbuf-leak traces (mleak_stat) to
 * userspace.  Only available when leak detection and exporting are
 * both enabled via mbuf_debug boot-args; otherwise returns ENXIO.
 * Returns whatever SYSCTL_OUT reports (0 on success).
 */
static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak) {
		return ENXIO;
	}

	/* mleak_lock protects the stats while they are refreshed and copied. */
	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return i;
}
1887
/*
 * sysctl handler: export the mleak_table summary to userspace.
 * Like mleak_top_trace_sysctl(), requires leak detection and exporting
 * to be enabled; otherwise returns ENXIO.
 * Returns whatever SYSCTL_OUT reports (0 on success).
 */
static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak) {
		return ENXIO;
	}

	/* Snapshot the table under mleak_lock so the copy is consistent. */
	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
	lck_mtx_unlock(mleak_lock);

	return i;
}
1905 #endif /* CONFIG_MBUF_MCACHE */
1906
/*
 * Atomically take an additional reference on m's external buffer.
 * Once the count rises above minref+1 the buffer is shared, so it is
 * marked EXTF_READONLY (sticky — see comment below).
 */
static inline void
m_incref(struct mbuf *m)
{
	uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);

	/* A wrap to zero would mean the 16-bit refcount overflowed. */
	VERIFY(new != 0);
	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to the
	 * minimum, to simplify code calling m_mclhasreference().
	 */
	if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
		os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
	}
}
1922
/*
 * Atomically drop one reference on m's external buffer and return the
 * new (decremented) count.  Uses acquire/release ordering so the last
 * dropper observes all prior writes before tearing the buffer down.
 */
static inline uint16_t
m_decref(struct mbuf *m)
{
	/* Must never decrement past zero. */
	VERIFY(MEXT_REF(m) != 0);

	return os_atomic_dec(&MEXT_REF(m), acq_rel);
}
1930
/*
 * Carve the global cluster pool (nmbclusters, counted in 2KB units)
 * into per-class regions and initialize each class's min/max limits,
 * object sizes, and names, plus the legacy mbstat structure.
 * Called once during mbinit().
 */
static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m, config_mbuf_jumbo = 0;

	/* Permanent (never-freed) allocations for the stats snapshots. */
	omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(struct omb_stat));

	mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
	    ZALIGN(mb_stat_t));

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++) {
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
	}

#if CONFIG_MBUF_JUMBO
	config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

	if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
		/*
		 * Set aside 1/3 of the mbuf cluster map for jumbo
		 * clusters; we do this only on platforms where jumbo
		 * cluster pool is enabled.
		 */
		njcl = nmbclusters / 3;
		njclbytes = M16KCLBYTES;
	}

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters or
	 * with 16KB pages, where it is configured to 1/3 of the pool
	 * size. On these platforms, the remaining is used for 2KB
	 * and 4KB clusters. On platforms without 16KB jumbo clusters,
	 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
	 * cluster can either be splitted into 16 mbufs, or into 2 2KB
	 * clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), NCLPG);        /* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));        /* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;                       /* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;   /* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = s * NMBPCL;                /* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
	snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);   /* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof(mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
2064
2065 static int
mbuf_get_class(struct mbuf * m)2066 mbuf_get_class(struct mbuf *m)
2067 {
2068 if (m->m_flags & M_EXT) {
2069 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
2070 m_ext_free_func_t m_free_func = m_get_ext_free(m);
2071
2072 if (m_free_func == NULL) {
2073 if (composite) {
2074 return MC_MBUF_CL;
2075 } else {
2076 return MC_CL;
2077 }
2078 } else if (m_free_func == m_bigfree) {
2079 if (composite) {
2080 return MC_MBUF_BIGCL;
2081 } else {
2082 return MC_BIGCL;
2083 }
2084 } else if (m_free_func == m_16kfree) {
2085 if (composite) {
2086 return MC_MBUF_16KCL;
2087 } else {
2088 return MC_16KCL;
2089 }
2090 }
2091 }
2092
2093 return MC_MBUF;
2094 }
2095
/*
 * Return true if the class backing m is consuming at least
 * mb_memory_pressure_percentage percent of its maximum limit.
 * Used to decide whether to shed load; reads statistics without the
 * mbuf lock, so the numbers are intentionally approximate.
 */
bool
mbuf_class_under_pressure(struct mbuf *m)
{
	int mclass = mbuf_get_class(m);

#if CONFIG_MBUF_MCACHE
	/* Fast check first: global totals only, ignoring per-CPU caches. */
	if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		/*
		 * The above computation does not include the per-CPU cached objects.
		 * As a fast-path check this is good-enough. But now we do
		 * the "slower" count of the cached objects to know exactly the
		 * number of active mbufs in use.
		 *
		 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
		 * might be slightly off but we don't try to be 100% accurate.
		 * At worst, we drop a packet that we shouldn't have dropped or
		 * we might go slightly above our memory-pressure threshold.
		 */
		mcache_t *cp = m_cache(mclass);
		mcache_cpu_t *ccp = &cp->mc_cpu[0];

		int bktsize = os_access_once(ccp->cc_bktsize);
		uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
		uint32_t cached = 0;
		int i;

		/* Sum per-CPU cached objects; negative counts mean "empty". */
		for (i = 0; i < ncpu; i++) {
			ccp = &cp->mc_cpu[i];

			int cc_objs = os_access_once(ccp->cc_objs);
			if (cc_objs > 0) {
				cached += cc_objs;
			}

			int cc_pobjs = os_access_once(ccp->cc_pobjs);
			if (cc_pobjs > 0) {
				cached += cc_pobjs;
			}
		}
		/* Add the full-bucket layer: bl_total buckets of bktsize each. */
		cached += (bl_total * bktsize);
		if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
			os_log(OS_LOG_DEFAULT,
			    "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
			    __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
			return true;
		}
	}
#else
	/*
	 * Grab the statistics from zalloc.
	 * We can't call mbuf_stat_sync() since that requires a lock.
	 */
	const zone_id_t zid = m_class_to_zid(m_class(mclass));
	const zone_t zone = zone_by_id(zid);
	struct zone_basic_stats stats = {};

	zone_get_stats(zone, &stats);
	/* In-use = available minus free; compare against the percentage cap. */
	if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
		os_log(OS_LOG_DEFAULT,
		    "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
		    __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
		return true;
	}
#endif /* CONFIG_MBUF_MCACHE */

	return false;
}
2163
#if defined(__LP64__)
/*
 * Maps installed memory size to the default mbuf pool size; consumed
 * by mbuf_default_ncl() on 64-bit kernels.
 */
typedef struct ncl_tbl {
	uint64_t nt_maxmem;     /* memory (sane) size */
	uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

/*
 * Entries must stay sorted by nt_maxmem ascending (the lookup walks
 * forward and stops at the first entry exceeding the memory size);
 * the { 0, 0 } entry terminates the table.
 */
static const ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
2179
2180 __private_extern__ unsigned int
mbuf_default_ncl(uint64_t mem)2181 mbuf_default_ncl(uint64_t mem)
2182 {
2183 #if !defined(__LP64__)
2184 unsigned int n;
2185 /*
2186 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
2187 */
2188 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
2189 n = 32768;
2190 }
2191 #else
2192 unsigned int n, i;
2193 /*
2194 * 64-bit kernel (mbuf pool size based on table).
2195 */
2196 n = ncl_table[0].nt_mbpool;
2197 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
2198 if (mem < ncl_table[i].nt_maxmem) {
2199 break;
2200 }
2201 n = ncl_table[i].nt_mbpool;
2202 }
2203 n >>= MCLSHIFT;
2204 #endif /* !__LP64__ */
2205 return n;
2206 }
2207
/*
 * One-time initialization of the mbuf subsystem: validates that the
 * public MBUF_* constants match their private counterparts, seeds the
 * red-zone and pointer-obscuring cookies, sizes the cluster pool,
 * initializes the class table, and creates the per-class caches
 * (mcache or zalloc zones depending on CONFIG_MBUF_MCACHE).
 */
__private_extern__ void
mbinit(void)
{
	unsigned int m;
#if CONFIG_MBUF_MCACHE
	unsigned int initmcl = 0;
	thread_t thread = THREAD_NULL;
#endif /* CONFIG_MBUF_MCACHE */

#if CONFIG_MBUF_MCACHE
	microuptime(&mb_start);
#endif /* CONFIG_MBUF_MCACHE */

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof(uint32_t)));

	/* pktdata needs to start at 128-bit offset! */
	_CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof(mb_redzone_cookie) ==
	    sizeof(((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
	read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
	read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
	mb_obscure_extref |= 0x3;
	/*
	 * NOTE(review): the randomized extref cookie is immediately
	 * zeroed, effectively disabling ext_ref pointer obscuring while
	 * ext_free obscuring stays active — confirm this is intentional.
	 */
	mb_obscure_extref = 0;
	mb_obscure_extfree |= 0x3;

#if CONFIG_MBUF_MCACHE
	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
#endif /* CONFIG_MBUF_MCACHE */

	if (nmbclusters == 0) {
		nmbclusters = NMBCLUSTERS;
	}

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	_CASSERT(sizeof(struct mbuf) == _MSIZE);

#if CONFIG_MBUF_MCACHE
	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
	 * mcl_slab_g_t units, each one representing a MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
	slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
	    ZALIGN(mcl_slabg_t));

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		int l;
		mcl_audit_t *mclad;
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
		mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
		    ZALIGN(mcl_audit_t));
		for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
			mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
			    ZALIGN_PTR);
		}

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_activate();

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_wait_max_cpus();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
	mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
	    ZALIGN(ppnum_t));

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);

	embutl = (mbutl + (nmbclusters * MCLBYTES));
	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL)) {
			initmcl = m_maxlimit(MC_BIGCL);
		}
	}
	if (initmcl < m_minlimit(MC_BIGCL)) {
		initmcl = m_minlimit(MC_BIGCL);
	}

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));
#else
	/*
	 * We have yet to create the non composite zones
	 * and thus we haven't asked zalloc to allocate
	 * anything yet, which means that at this point
	 * m_total() is zero. Once we create the zones and
	 * raise the reserve, m_total() will be calculated,
	 * but until then just assume that we will have
	 * at least the minium limit allocated.
	 */
	m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
	m_total(MC_CL) = m_minlimit(MC_CL);
#endif /* CONFIG_MBUF_MCACHE */

	for (m = 0; m < NELEM(mbuf_table); m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

#if CONFIG_MBUF_MCACHE
	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
	    0, 0, MCR_SLEEP);
#endif /* CONFIG_MBUF_MCACHE */

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
#if CONFIG_MBUF_MCACHE
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		/* Composite classes use the composite-slab callbacks. */
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0) {
			flags |= MCF_NOCPUCACHE;
		}

		if (!mclfindleak) {
			flags |= MCF_NOLEAKLOG;
		}

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
#else
		if (!MBUF_CLASS_COMPOSITE(m)) {
			zone_t zone = zone_by_id(m_class_to_zid(m));

			zone_set_exhaustible(zone, m_maxlimit(m), false);
			zone_raise_reserve(zone, m_minlimit(m));
			/*
			 * Pretend that we have allocated m_total() items
			 * at this point. zalloc will eventually do that
			 * but it's an async operation.
			 */
			m_total(m) = m_minlimit(m);
		}
#endif /* CONFIG_MBUF_MCACHE */
	}

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have atleast 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbufpool, cap the size of
			 * max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

#if CONFIG_MBUF_MCACHE
	/* allocate space for mbuf_dump_buf */
	mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);

	if (mbuf_debug & MCF_DEBUG) {
		printf("%s: MLEN %d, MHLEN %d\n", __func__,
		    (int)_MLEN, (int)_MHLEN);
	}
#else
	/* Watchdog thread calls, armed on demand (fire-once semantics). */
	mbuf_defunct_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_defunct,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
	mbuf_drain_tcall =
	    thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
	    NULL,
	    THREAD_CALL_PRIORITY_KERNEL,
	    THREAD_CALL_OPTIONS_ONCE);
#endif /* CONFIG_MBUF_MCACHE */
	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
2541
2542 #if CONFIG_MBUF_MCACHE
2543 /*
2544 * Obtain a slab of object(s) from the class's freelist.
2545 */
2546 static mcache_obj_t *
slab_alloc(mbuf_class_t class,int wait)2547 slab_alloc(mbuf_class_t class, int wait)
2548 {
2549 mcl_slab_t *sp;
2550 mcache_obj_t *buf;
2551
2552 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2553
2554 /* This should always be NULL for us */
2555 VERIFY(m_cobjlist(class) == NULL);
2556
2557 /*
2558 * Treat composite objects as having longer lifespan by using
2559 * a slab from the reverse direction, in hoping that this could
2560 * reduce the probability of fragmentation for slabs that hold
2561 * more than one buffer chunks (e.g. mbuf slabs). For other
2562 * slabs, this probably doesn't make much of a difference.
2563 */
2564 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
2565 && (wait & MCR_COMP)) {
2566 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
2567 } else {
2568 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
2569 }
2570
2571 if (sp == NULL) {
2572 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
2573 /* The slab list for this class is empty */
2574 return NULL;
2575 }
2576
2577 VERIFY(m_infree(class) > 0);
2578 VERIFY(!slab_is_detached(sp));
2579 VERIFY(sp->sl_class == class &&
2580 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2581 buf = sp->sl_head;
2582 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
2583 sp->sl_head = buf->obj_next;
2584 /* Increment slab reference */
2585 sp->sl_refcnt++;
2586
2587 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
2588
2589 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
2590 slab_nextptr_panic(sp, sp->sl_head);
2591 /* In case sl_head is in the map but not in the slab */
2592 VERIFY(slab_inrange(sp, sp->sl_head));
2593 /* NOTREACHED */
2594 }
2595
2596 if (mclaudit != NULL) {
2597 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2598 mca->mca_uflags = 0;
2599 /* Save contents on mbuf objects only */
2600 if (class == MC_MBUF) {
2601 mca->mca_uflags |= MB_SCVALID;
2602 }
2603 }
2604
2605 if (class == MC_CL) {
2606 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2607 /*
2608 * A 2K cluster slab can have at most NCLPG references.
2609 */
2610 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
2611 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2612 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
2613 } else if (class == MC_BIGCL) {
2614 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
2615 m_infree(MC_MBUF_BIGCL);
2616 /*
2617 * A 4K cluster slab can have NBCLPG references.
2618 */
2619 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
2620 sp->sl_len == PAGE_SIZE &&
2621 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
2622 } else if (class == MC_16KCL) {
2623 mcl_slab_t *nsp;
2624 int k;
2625
2626 --m_infree(MC_16KCL);
2627 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
2628 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2629 /*
2630 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
2631 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
2632 * most 1 reference.
2633 */
2634 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2635 nsp = nsp->sl_next;
2636 /* Next slab must already be present */
2637 VERIFY(nsp != NULL);
2638 nsp->sl_refcnt++;
2639 VERIFY(!slab_is_detached(nsp));
2640 VERIFY(nsp->sl_class == MC_16KCL &&
2641 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
2642 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
2643 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2644 nsp->sl_head == NULL);
2645 }
2646 } else {
2647 VERIFY(class == MC_MBUF);
2648 --m_infree(MC_MBUF);
2649 /*
2650 * If auditing is turned on, this check is
2651 * deferred until later in mbuf_slab_audit().
2652 */
2653 if (mclaudit == NULL) {
2654 _MCHECK((struct mbuf *)buf);
2655 }
2656 /*
2657 * Since we have incremented the reference count above,
2658 * an mbuf slab (formerly a 4KB cluster slab that was cut
2659 * up into mbufs) must have a reference count between 1
2660 * and NMBPG at this point.
2661 */
2662 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
2663 sp->sl_chunks == NMBPG &&
2664 sp->sl_len == PAGE_SIZE);
2665 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
2666 }
2667
2668 /* If empty, remove this slab from the class's freelist */
2669 if (sp->sl_head == NULL) {
2670 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
2671 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
2672 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
2673 slab_remove(sp, class);
2674 }
2675
2676 return buf;
2677 }
2678
2679 /*
2680 * Place a slab of object(s) back into a class's slab list.
2681 */
2682 static void
slab_free(mbuf_class_t class,mcache_obj_t * buf)2683 slab_free(mbuf_class_t class, mcache_obj_t *buf)
2684 {
2685 mcl_slab_t *sp;
2686 boolean_t reinit_supercl = false;
2687 mbuf_class_t super_class;
2688
2689 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2690
2691 VERIFY(class != MC_16KCL || njcl > 0);
2692 VERIFY(buf->obj_next == NULL);
2693
2694 /*
2695 * Synchronizing with m_clalloc, as it reads m_total, while we here
2696 * are modifying m_total.
2697 */
2698 while (mb_clalloc_busy) {
2699 mb_clalloc_waiters++;
2700 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2701 (PZERO - 1), "m_clalloc", NULL);
2702 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2703 }
2704
2705 /* We are busy now; tell everyone else to go away */
2706 mb_clalloc_busy = TRUE;
2707
2708 sp = slab_get(buf);
2709 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
2710 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2711
2712 /* Decrement slab reference */
2713 sp->sl_refcnt--;
2714
2715 if (class == MC_CL) {
2716 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
2717 /*
2718 * A slab that has been splitted for 2KB clusters can have
2719 * at most 1 outstanding reference at this point.
2720 */
2721 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
2722 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2723 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
2724 (slab_is_detached(sp) && sp->sl_head == NULL));
2725 } else if (class == MC_BIGCL) {
2726 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2727
2728 /* A 4KB cluster slab can have NBCLPG references at most */
2729 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
2730 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
2731 (slab_is_detached(sp) && sp->sl_head == NULL));
2732 } else if (class == MC_16KCL) {
2733 mcl_slab_t *nsp;
2734 int k;
2735 /*
2736 * A 16KB cluster takes NSLABSP16KB slabs, all must
2737 * now have 0 reference.
2738 */
2739 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
2740 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
2741 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2742 VERIFY(slab_is_detached(sp));
2743 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2744 nsp = nsp->sl_next;
2745 /* Next slab must already be present */
2746 VERIFY(nsp != NULL);
2747 nsp->sl_refcnt--;
2748 VERIFY(slab_is_detached(nsp));
2749 VERIFY(nsp->sl_class == MC_16KCL &&
2750 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
2751 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
2752 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2753 nsp->sl_head == NULL);
2754 }
2755 } else {
2756 /*
2757 * A slab that has been splitted for mbufs has at most
2758 * NMBPG reference counts. Since we have decremented
2759 * one reference above, it must now be between 0 and
2760 * NMBPG-1.
2761 */
2762 VERIFY(class == MC_MBUF);
2763 VERIFY(sp->sl_refcnt >= 0 &&
2764 sp->sl_refcnt <= (NMBPG - 1) &&
2765 sp->sl_chunks == NMBPG &&
2766 sp->sl_len == PAGE_SIZE);
2767 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2768 (slab_is_detached(sp) && sp->sl_head == NULL));
2769 }
2770
2771 /*
2772 * When auditing is enabled, ensure that the buffer still
2773 * contains the free pattern. Otherwise it got corrupted
2774 * while at the CPU cache layer.
2775 */
2776 if (mclaudit != NULL) {
2777 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2778 if (mclverify) {
2779 mcache_audit_free_verify(mca, buf, 0,
2780 m_maxsize(class));
2781 }
2782 mca->mca_uflags &= ~MB_SCVALID;
2783 }
2784
2785 if (class == MC_CL) {
2786 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2787 buf->obj_next = sp->sl_head;
2788 } else if (class == MC_BIGCL) {
2789 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2790 m_infree(MC_MBUF_BIGCL);
2791 buf->obj_next = sp->sl_head;
2792 } else if (class == MC_16KCL) {
2793 ++m_infree(MC_16KCL);
2794 } else {
2795 ++m_infree(MC_MBUF);
2796 buf->obj_next = sp->sl_head;
2797 }
2798 sp->sl_head = buf;
2799
2800 /*
2801 * If a slab has been split to either one which holds 2KB clusters,
2802 * or one which holds mbufs, turn it back to one which holds a
2803 * 4 or 16 KB cluster depending on the page size.
2804 */
2805 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2806 super_class = MC_BIGCL;
2807 } else {
2808 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2809 super_class = MC_16KCL;
2810 }
2811 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2812 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2813 m_total(super_class) < m_maxlimit(super_class)) {
2814 int i = NMBPG;
2815
2816 m_total(MC_MBUF) -= NMBPG;
2817 mbstat.m_mbufs = m_total(MC_MBUF);
2818 m_infree(MC_MBUF) -= NMBPG;
2819 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2820
2821 while (i--) {
2822 struct mbuf *m = sp->sl_head;
2823 VERIFY(m != NULL);
2824 sp->sl_head = m->m_next;
2825 m->m_next = NULL;
2826 }
2827 reinit_supercl = true;
2828 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2829 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2830 m_total(super_class) < m_maxlimit(super_class)) {
2831 int i = NCLPG;
2832
2833 m_total(MC_CL) -= NCLPG;
2834 mbstat.m_clusters = m_total(MC_CL);
2835 m_infree(MC_CL) -= NCLPG;
2836
2837 while (i--) {
2838 union mcluster *c = sp->sl_head;
2839 VERIFY(c != NULL);
2840 sp->sl_head = c->mcl_next;
2841 c->mcl_next = NULL;
2842 }
2843 reinit_supercl = true;
2844 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2845 sp->sl_refcnt == 0 &&
2846 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2847 m_total(super_class) < m_maxlimit(super_class)) {
2848 int i = NBCLPG;
2849
2850 VERIFY(super_class == MC_16KCL);
2851 m_total(MC_BIGCL) -= NBCLPG;
2852 mbstat.m_bigclusters = m_total(MC_BIGCL);
2853 m_infree(MC_BIGCL) -= NBCLPG;
2854
2855 while (i--) {
2856 union mbigcluster *bc = sp->sl_head;
2857 VERIFY(bc != NULL);
2858 sp->sl_head = bc->mbc_next;
2859 bc->mbc_next = NULL;
2860 }
2861 reinit_supercl = true;
2862 }
2863
2864 if (reinit_supercl) {
2865 VERIFY(sp->sl_head == NULL);
2866 VERIFY(m_total(class) >= m_minlimit(class));
2867 slab_remove(sp, class);
2868
2869 /* Reinitialize it as a cluster for the super class */
2870 m_total(super_class)++;
2871 m_infree(super_class)++;
2872 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2873 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2874
2875 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2876 sp->sl_base, PAGE_SIZE, 0, 1);
2877 if (mclverify) {
2878 mcache_set_pattern(MCACHE_FREE_PATTERN,
2879 (caddr_t)sp->sl_base, sp->sl_len);
2880 }
2881 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2882
2883 if (super_class == MC_BIGCL) {
2884 mbstat.m_bigclusters = m_total(MC_BIGCL);
2885 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2886 m_infree(MC_MBUF_BIGCL);
2887 }
2888
2889 VERIFY(slab_is_detached(sp));
2890 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2891
2892 /* And finally switch class */
2893 class = super_class;
2894 }
2895
2896 /* Reinsert the slab to the class's slab list */
2897 if (slab_is_detached(sp)) {
2898 slab_insert(sp, class);
2899 }
2900
2901 /* We're done; let others enter */
2902 mb_clalloc_busy = FALSE;
2903 if (mb_clalloc_waiters > 0) {
2904 mb_clalloc_waiters = 0;
2905 wakeup(mb_clalloc_waitchan);
2906 }
2907 }
2908
2909 /*
2910 * Common allocator for rudimentary objects called by the CPU cache layer
2911 * during an allocation request whenever there is no available element in the
2912 * bucket layer. It returns one or more elements from the appropriate global
2913 * freelist. If the freelist is empty, it will attempt to populate it and
2914 * retry the allocation.
2915 */
2916 static unsigned int
mbuf_slab_alloc(void * arg,mcache_obj_t *** plist,unsigned int num,int wait)2917 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2918 {
2919 mbuf_class_t class = (mbuf_class_t)arg;
2920 unsigned int need = num;
2921 mcache_obj_t **list = *plist;
2922
2923 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2924 ASSERT(need > 0);
2925
2926 lck_mtx_lock(mbuf_mlock);
2927
2928 for (;;) {
2929 if ((*list = slab_alloc(class, wait)) != NULL) {
2930 (*list)->obj_next = NULL;
2931 list = *plist = &(*list)->obj_next;
2932
2933 if (--need == 0) {
2934 /*
2935 * If the number of elements in freelist has
2936 * dropped below low watermark, asynchronously
2937 * populate the freelist now rather than doing
2938 * it later when we run out of elements.
2939 */
2940 if (!mbuf_cached_above(class, wait) &&
2941 m_infree(class) < (m_total(class) >> 5)) {
2942 (void) freelist_populate(class, 1,
2943 M_DONTWAIT);
2944 }
2945 break;
2946 }
2947 } else {
2948 VERIFY(m_infree(class) == 0 || class == MC_CL);
2949
2950 (void) freelist_populate(class, 1,
2951 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2952
2953 if (m_infree(class) > 0) {
2954 continue;
2955 }
2956
2957 /* Check if there's anything at the cache layer */
2958 if (mbuf_cached_above(class, wait)) {
2959 break;
2960 }
2961
2962 /* watchdog checkpoint */
2963 mbuf_watchdog();
2964
2965 /* We have nothing and cannot block; give up */
2966 if (wait & MCR_NOSLEEP) {
2967 if (!(wait & MCR_TRYHARD)) {
2968 m_fail_cnt(class)++;
2969 mbstat.m_drops++;
2970 break;
2971 }
2972 }
2973
2974 /*
2975 * If the freelist is still empty and the caller is
2976 * willing to be blocked, sleep on the wait channel
2977 * until an element is available. Otherwise, if
2978 * MCR_TRYHARD is set, do our best to satisfy the
2979 * request without having to go to sleep.
2980 */
2981 if (mbuf_worker_ready &&
2982 mbuf_sleep(class, need, wait)) {
2983 break;
2984 }
2985
2986 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2987 }
2988 }
2989
2990 m_alloc_cnt(class) += num - need;
2991 lck_mtx_unlock(mbuf_mlock);
2992
2993 return num - need;
2994 }
2995
2996 /*
2997 * Common de-allocator for rudimentary objects called by the CPU cache
2998 * layer when one or more elements need to be returned to the appropriate
2999 * global freelist.
3000 */
3001 static void
mbuf_slab_free(void * arg,mcache_obj_t * list,__unused int purged)3002 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
3003 {
3004 mbuf_class_t class = (mbuf_class_t)arg;
3005 mcache_obj_t *nlist;
3006 unsigned int num = 0;
3007 int w;
3008
3009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3010
3011 lck_mtx_lock(mbuf_mlock);
3012
3013 for (;;) {
3014 nlist = list->obj_next;
3015 list->obj_next = NULL;
3016 slab_free(class, list);
3017 ++num;
3018 if ((list = nlist) == NULL) {
3019 break;
3020 }
3021 }
3022 m_free_cnt(class) += num;
3023
3024 if ((w = mb_waiters) > 0) {
3025 mb_waiters = 0;
3026 }
3027 if (w) {
3028 mbwdog_logger("waking up all threads");
3029 }
3030 lck_mtx_unlock(mbuf_mlock);
3031
3032 if (w != 0) {
3033 wakeup(mb_waitchan);
3034 }
3035 }
3036
3037 /*
3038 * Common auditor for rudimentary objects called by the CPU cache layer
3039 * during an allocation or free request. For the former, this is called
3040 * after the objects are obtained from either the bucket or slab layer
3041 * and before they are returned to the caller. For the latter, this is
3042 * called immediately during free and before placing the objects into
3043 * the bucket or slab layer.
3044 */
3045 static void
mbuf_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)3046 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3047 {
3048 mbuf_class_t class = (mbuf_class_t)arg;
3049 mcache_audit_t *mca;
3050
3051 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3052
3053 while (list != NULL) {
3054 lck_mtx_lock(mbuf_mlock);
3055 mca = mcl_audit_buf2mca(class, list);
3056
3057 /* Do the sanity checks */
3058 if (class == MC_MBUF) {
3059 mcl_audit_mbuf(mca, list, FALSE, alloc);
3060 ASSERT(mca->mca_uflags & MB_SCVALID);
3061 } else {
3062 mcl_audit_cluster(mca, list, m_maxsize(class),
3063 alloc, TRUE);
3064 ASSERT(!(mca->mca_uflags & MB_SCVALID));
3065 }
3066 /* Record this transaction */
3067 if (mcltrace) {
3068 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
3069 }
3070
3071 if (alloc) {
3072 mca->mca_uflags |= MB_INUSE;
3073 } else {
3074 mca->mca_uflags &= ~MB_INUSE;
3075 }
3076 /* Unpair the object (unconditionally) */
3077 mca->mca_uptr = NULL;
3078 lck_mtx_unlock(mbuf_mlock);
3079
3080 list = list->obj_next;
3081 }
3082 }
3083
3084 /*
3085 * Common notify routine for all caches. It is called by mcache when
3086 * one or more objects get freed. We use this indication to trigger
3087 * the wakeup of any sleeping threads so that they can retry their
3088 * allocation requests.
3089 */
3090 static void
mbuf_slab_notify(void * arg,u_int32_t reason)3091 mbuf_slab_notify(void *arg, u_int32_t reason)
3092 {
3093 mbuf_class_t class = (mbuf_class_t)arg;
3094 int w;
3095
3096 ASSERT(MBUF_CLASS_VALID(class));
3097
3098 if (reason != MCN_RETRYALLOC) {
3099 return;
3100 }
3101
3102 lck_mtx_lock(mbuf_mlock);
3103 if ((w = mb_waiters) > 0) {
3104 m_notified(class)++;
3105 mb_waiters = 0;
3106 }
3107 if (w) {
3108 mbwdog_logger("waking up all threads");
3109 }
3110 lck_mtx_unlock(mbuf_mlock);
3111
3112 if (w != 0) {
3113 wakeup(mb_waitchan);
3114 }
3115 }
3116
3117 /*
3118 * Obtain object(s) from the composite class's freelist.
3119 */
3120 static unsigned int
cslab_alloc(mbuf_class_t class,mcache_obj_t *** plist,unsigned int num)3121 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
3122 {
3123 unsigned int need = num;
3124 mcl_slab_t *sp, *clsp, *nsp;
3125 struct mbuf *m;
3126 mcache_obj_t **list = *plist;
3127 void *cl;
3128
3129 VERIFY(need > 0);
3130 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3131 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3132
3133 /* Get what we can from the freelist */
3134 while ((*list = m_cobjlist(class)) != NULL) {
3135 MRANGE(*list);
3136
3137 m = (struct mbuf *)*list;
3138 sp = slab_get(m);
3139 cl = m->m_ext.ext_buf;
3140 clsp = slab_get(cl);
3141 VERIFY(m->m_flags == M_EXT && cl != NULL);
3142 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
3143
3144 if (class == MC_MBUF_CL) {
3145 VERIFY(clsp->sl_refcnt >= 1 &&
3146 clsp->sl_refcnt <= NCLPG);
3147 } else {
3148 VERIFY(clsp->sl_refcnt >= 1 &&
3149 clsp->sl_refcnt <= NBCLPG);
3150 }
3151
3152 if (class == MC_MBUF_16KCL) {
3153 int k;
3154 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3155 nsp = nsp->sl_next;
3156 /* Next slab must already be present */
3157 VERIFY(nsp != NULL);
3158 VERIFY(nsp->sl_refcnt == 1);
3159 }
3160 }
3161
3162 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
3163 !MBUF_IN_MAP(m_cobjlist(class))) {
3164 slab_nextptr_panic(sp, m_cobjlist(class));
3165 /* NOTREACHED */
3166 }
3167 (*list)->obj_next = NULL;
3168 list = *plist = &(*list)->obj_next;
3169
3170 if (--need == 0) {
3171 break;
3172 }
3173 }
3174 m_infree(class) -= (num - need);
3175
3176 return num - need;
3177 }
3178
3179 /*
3180 * Place object(s) back into a composite class's freelist.
3181 */
3182 static unsigned int
cslab_free(mbuf_class_t class,mcache_obj_t * list,int purged)3183 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
3184 {
3185 mcache_obj_t *o, *tail;
3186 unsigned int num = 0;
3187 struct mbuf *m, *ms;
3188 mcache_audit_t *mca = NULL;
3189 mcache_obj_t *ref_list = NULL;
3190 mcl_slab_t *clsp, *nsp;
3191 void *cl;
3192 mbuf_class_t cl_class;
3193
3194 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3195 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3196 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3197
3198 if (class == MC_MBUF_CL) {
3199 cl_class = MC_CL;
3200 } else if (class == MC_MBUF_BIGCL) {
3201 cl_class = MC_BIGCL;
3202 } else {
3203 VERIFY(class == MC_MBUF_16KCL);
3204 cl_class = MC_16KCL;
3205 }
3206
3207 o = tail = list;
3208
3209 while ((m = ms = (struct mbuf *)o) != NULL) {
3210 mcache_obj_t *rfa, *nexto = o->obj_next;
3211
3212 /* Do the mbuf sanity checks */
3213 if (mclaudit != NULL) {
3214 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3215 if (mclverify) {
3216 mcache_audit_free_verify(mca, m, 0,
3217 m_maxsize(MC_MBUF));
3218 }
3219 ms = MCA_SAVED_MBUF_PTR(mca);
3220 }
3221
3222 /* Do the cluster sanity checks */
3223 cl = ms->m_ext.ext_buf;
3224 clsp = slab_get(cl);
3225 if (mclverify) {
3226 size_t size = m_maxsize(cl_class);
3227 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
3228 (mcache_obj_t *)cl), cl, 0, size);
3229 }
3230 VERIFY(ms->m_type == MT_FREE);
3231 VERIFY(ms->m_flags == M_EXT);
3232 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3233 if (cl_class == MC_CL) {
3234 VERIFY(clsp->sl_refcnt >= 1 &&
3235 clsp->sl_refcnt <= NCLPG);
3236 } else {
3237 VERIFY(clsp->sl_refcnt >= 1 &&
3238 clsp->sl_refcnt <= NBCLPG);
3239 }
3240 if (cl_class == MC_16KCL) {
3241 int k;
3242 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3243 nsp = nsp->sl_next;
3244 /* Next slab must already be present */
3245 VERIFY(nsp != NULL);
3246 VERIFY(nsp->sl_refcnt == 1);
3247 }
3248 }
3249
3250 /*
3251 * If we're asked to purge, restore the actual mbuf using
3252 * contents of the shadow structure (if auditing is enabled)
3253 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
3254 * about to free it and the attached cluster into their caches.
3255 */
3256 if (purged) {
3257 /* Restore constructed mbuf fields */
3258 if (mclaudit != NULL) {
3259 mcl_audit_restore_mbuf(m, mca, TRUE);
3260 }
3261
3262 MEXT_MINREF(m) = 0;
3263 MEXT_REF(m) = 0;
3264 MEXT_PREF(m) = 0;
3265 MEXT_FLAGS(m) = 0;
3266 MEXT_PRIV(m) = 0;
3267 MEXT_PMBUF(m) = NULL;
3268 MEXT_TOKEN(m) = 0;
3269
3270 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
3271 m_set_ext(m, NULL, NULL, NULL);
3272 rfa->obj_next = ref_list;
3273 ref_list = rfa;
3274
3275 m->m_type = MT_FREE;
3276 m->m_flags = m->m_len = 0;
3277 m->m_next = m->m_nextpkt = NULL;
3278
3279 /* Save mbuf fields and make auditing happy */
3280 if (mclaudit != NULL) {
3281 mcl_audit_mbuf(mca, o, FALSE, FALSE);
3282 }
3283
3284 VERIFY(m_total(class) > 0);
3285 m_total(class)--;
3286
3287 /* Free the mbuf */
3288 o->obj_next = NULL;
3289 slab_free(MC_MBUF, o);
3290
3291 /* And free the cluster */
3292 ((mcache_obj_t *)cl)->obj_next = NULL;
3293 if (class == MC_MBUF_CL) {
3294 slab_free(MC_CL, cl);
3295 } else if (class == MC_MBUF_BIGCL) {
3296 slab_free(MC_BIGCL, cl);
3297 } else {
3298 slab_free(MC_16KCL, cl);
3299 }
3300 }
3301
3302 ++num;
3303 tail = o;
3304 o = nexto;
3305 }
3306
3307 if (!purged) {
3308 tail->obj_next = m_cobjlist(class);
3309 m_cobjlist(class) = list;
3310 m_infree(class) += num;
3311 } else if (ref_list != NULL) {
3312 mcache_free_ext(ref_cache, ref_list);
3313 }
3314
3315 return num;
3316 }
3317
3318 /*
3319 * Common allocator for composite objects called by the CPU cache layer
3320 * during an allocation request whenever there is no available element in
3321 * the bucket layer. It returns one or more composite elements from the
3322 * appropriate global freelist. If the freelist is empty, it will attempt
3323 * to obtain the rudimentary objects from their caches and construct them
3324 * into composite mbuf + cluster objects.
3325 */
3326 static unsigned int
mbuf_cslab_alloc(void * arg,mcache_obj_t *** plist,unsigned int needed,int wait)3327 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
3328 int wait)
3329 {
3330 mbuf_class_t class = (mbuf_class_t)arg;
3331 mbuf_class_t cl_class = 0;
3332 unsigned int num = 0, cnum = 0, want = needed;
3333 mcache_obj_t *ref_list = NULL;
3334 mcache_obj_t *mp_list = NULL;
3335 mcache_obj_t *clp_list = NULL;
3336 mcache_obj_t **list;
3337 struct ext_ref *rfa;
3338 struct mbuf *m;
3339 void *cl;
3340
3341 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3342 ASSERT(needed > 0);
3343
3344 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3345
3346 /* There should not be any slab for this class */
3347 VERIFY(m_slab_cnt(class) == 0 &&
3348 m_slablist(class).tqh_first == NULL &&
3349 m_slablist(class).tqh_last == NULL);
3350
3351 lck_mtx_lock(mbuf_mlock);
3352
3353 /* Try using the freelist first */
3354 num = cslab_alloc(class, plist, needed);
3355 list = *plist;
3356 if (num == needed) {
3357 m_alloc_cnt(class) += num;
3358 lck_mtx_unlock(mbuf_mlock);
3359 return needed;
3360 }
3361
3362 lck_mtx_unlock(mbuf_mlock);
3363
3364 /*
3365 * We could not satisfy the request using the freelist alone;
3366 * allocate from the appropriate rudimentary caches and use
3367 * whatever we can get to construct the composite objects.
3368 */
3369 needed -= num;
3370
3371 /*
3372 * Mark these allocation requests as coming from a composite cache.
3373 * Also, if the caller is willing to be blocked, mark the request
3374 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
3375 * slab layer waiting for the individual object when one or more
3376 * of the already-constructed composite objects are available.
3377 */
3378 wait |= MCR_COMP;
3379 if (!(wait & MCR_NOSLEEP)) {
3380 wait |= MCR_FAILOK;
3381 }
3382
3383 /* allocate mbufs */
3384 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
3385 if (needed == 0) {
3386 ASSERT(mp_list == NULL);
3387 goto fail;
3388 }
3389
3390 /* allocate clusters */
3391 if (class == MC_MBUF_CL) {
3392 cl_class = MC_CL;
3393 } else if (class == MC_MBUF_BIGCL) {
3394 cl_class = MC_BIGCL;
3395 } else {
3396 VERIFY(class == MC_MBUF_16KCL);
3397 cl_class = MC_16KCL;
3398 }
3399 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
3400 if (needed == 0) {
3401 ASSERT(clp_list == NULL);
3402 goto fail;
3403 }
3404
3405 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
3406 if (needed == 0) {
3407 ASSERT(ref_list == NULL);
3408 goto fail;
3409 }
3410
3411 /*
3412 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
3413 * overs will get freed accordingly before we return to caller.
3414 */
3415 for (cnum = 0; cnum < needed; cnum++) {
3416 struct mbuf *ms;
3417
3418 m = ms = (struct mbuf *)mp_list;
3419 mp_list = mp_list->obj_next;
3420
3421 cl = clp_list;
3422 clp_list = clp_list->obj_next;
3423 ((mcache_obj_t *)cl)->obj_next = NULL;
3424
3425 rfa = (struct ext_ref *)ref_list;
3426 ref_list = ref_list->obj_next;
3427 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
3428
3429 /*
3430 * If auditing is enabled, construct the shadow mbuf
3431 * in the audit structure instead of in the actual one.
3432 * mbuf_cslab_audit() will take care of restoring the
3433 * contents after the integrity check.
3434 */
3435 if (mclaudit != NULL) {
3436 mcache_audit_t *mca, *cl_mca;
3437
3438 lck_mtx_lock(mbuf_mlock);
3439 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3440 ms = MCA_SAVED_MBUF_PTR(mca);
3441 cl_mca = mcl_audit_buf2mca(cl_class,
3442 (mcache_obj_t *)cl);
3443
3444 /*
3445 * Pair them up. Note that this is done at the time
3446 * the mbuf+cluster objects are constructed. This
3447 * information should be treated as "best effort"
3448 * debugging hint since more than one mbufs can refer
3449 * to a cluster. In that case, the cluster might not
3450 * be freed along with the mbuf it was paired with.
3451 */
3452 mca->mca_uptr = cl_mca;
3453 cl_mca->mca_uptr = mca;
3454
3455 ASSERT(mca->mca_uflags & MB_SCVALID);
3456 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
3457 lck_mtx_unlock(mbuf_mlock);
3458
3459 /* Technically, they are in the freelist */
3460 if (mclverify) {
3461 size_t size;
3462
3463 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
3464 m_maxsize(MC_MBUF));
3465
3466 if (class == MC_MBUF_CL) {
3467 size = m_maxsize(MC_CL);
3468 } else if (class == MC_MBUF_BIGCL) {
3469 size = m_maxsize(MC_BIGCL);
3470 } else {
3471 size = m_maxsize(MC_16KCL);
3472 }
3473
3474 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
3475 size);
3476 }
3477 }
3478
3479 MBUF_INIT(ms, 0, MT_FREE);
3480 if (class == MC_MBUF_16KCL) {
3481 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3482 } else if (class == MC_MBUF_BIGCL) {
3483 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3484 } else {
3485 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3486 }
3487 VERIFY(ms->m_flags == M_EXT);
3488 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3489
3490 *list = (mcache_obj_t *)m;
3491 (*list)->obj_next = NULL;
3492 list = *plist = &(*list)->obj_next;
3493 }
3494
3495 fail:
3496 /*
3497 * Free up what's left of the above.
3498 */
3499 if (mp_list != NULL) {
3500 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3501 }
3502 if (clp_list != NULL) {
3503 mcache_free_ext(m_cache(cl_class), clp_list);
3504 }
3505 if (ref_list != NULL) {
3506 mcache_free_ext(ref_cache, ref_list);
3507 }
3508
3509 lck_mtx_lock(mbuf_mlock);
3510 if (num > 0 || cnum > 0) {
3511 m_total(class) += cnum;
3512 VERIFY(m_total(class) <= m_maxlimit(class));
3513 m_alloc_cnt(class) += num + cnum;
3514 }
3515 if ((num + cnum) < want) {
3516 m_fail_cnt(class) += (want - (num + cnum));
3517 }
3518 lck_mtx_unlock(mbuf_mlock);
3519
3520 return num + cnum;
3521 }
3522
3523 /*
3524 * Common de-allocator for composite objects called by the CPU cache
3525 * layer when one or more elements need to be returned to the appropriate
3526 * global freelist.
3527 */
3528 static void
mbuf_cslab_free(void * arg,mcache_obj_t * list,int purged)3529 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
3530 {
3531 mbuf_class_t class = (mbuf_class_t)arg;
3532 unsigned int num;
3533 int w;
3534
3535 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3536
3537 lck_mtx_lock(mbuf_mlock);
3538
3539 num = cslab_free(class, list, purged);
3540 m_free_cnt(class) += num;
3541
3542 if ((w = mb_waiters) > 0) {
3543 mb_waiters = 0;
3544 }
3545 if (w) {
3546 mbwdog_logger("waking up all threads");
3547 }
3548
3549 lck_mtx_unlock(mbuf_mlock);
3550
3551 if (w != 0) {
3552 wakeup(mb_waitchan);
3553 }
3554 }
3555
3556 /*
3557 * Common auditor for composite objects called by the CPU cache layer
3558 * during an allocation or free request. For the former, this is called
3559 * after the objects are obtained from either the bucket or slab layer
3560 * and before they are returned to the caller. For the latter, this is
3561 * called immediately during free and before placing the objects into
3562 * the bucket or slab layer.
3563 */
3564 static void
mbuf_cslab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)3565 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3566 {
3567 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
3568 mcache_audit_t *mca;
3569 struct mbuf *m, *ms;
3570 mcl_slab_t *clsp, *nsp;
3571 size_t cl_size;
3572 void *cl;
3573
3574 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3575 if (class == MC_MBUF_CL) {
3576 cl_class = MC_CL;
3577 } else if (class == MC_MBUF_BIGCL) {
3578 cl_class = MC_BIGCL;
3579 } else {
3580 cl_class = MC_16KCL;
3581 }
3582 cl_size = m_maxsize(cl_class);
3583
3584 while ((m = ms = (struct mbuf *)list) != NULL) {
3585 lck_mtx_lock(mbuf_mlock);
3586 /* Do the mbuf sanity checks and record its transaction */
3587 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3588 mcl_audit_mbuf(mca, m, TRUE, alloc);
3589 if (mcltrace) {
3590 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
3591 }
3592
3593 if (alloc) {
3594 mca->mca_uflags |= MB_COMP_INUSE;
3595 } else {
3596 mca->mca_uflags &= ~MB_COMP_INUSE;
3597 }
3598
3599 /*
3600 * Use the shadow mbuf in the audit structure if we are
3601 * freeing, since the contents of the actual mbuf has been
3602 * pattern-filled by the above call to mcl_audit_mbuf().
3603 */
3604 if (!alloc && mclverify) {
3605 ms = MCA_SAVED_MBUF_PTR(mca);
3606 }
3607
3608 /* Do the cluster sanity checks and record its transaction */
3609 cl = ms->m_ext.ext_buf;
3610 clsp = slab_get(cl);
3611 VERIFY(ms->m_flags == M_EXT && cl != NULL);
3612 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3613 if (class == MC_MBUF_CL) {
3614 VERIFY(clsp->sl_refcnt >= 1 &&
3615 clsp->sl_refcnt <= NCLPG);
3616 } else {
3617 VERIFY(clsp->sl_refcnt >= 1 &&
3618 clsp->sl_refcnt <= NBCLPG);
3619 }
3620
3621 if (class == MC_MBUF_16KCL) {
3622 int k;
3623 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3624 nsp = nsp->sl_next;
3625 /* Next slab must already be present */
3626 VERIFY(nsp != NULL);
3627 VERIFY(nsp->sl_refcnt == 1);
3628 }
3629 }
3630
3631
3632 mca = mcl_audit_buf2mca(cl_class, cl);
3633 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
3634 if (mcltrace) {
3635 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
3636 }
3637
3638 if (alloc) {
3639 mca->mca_uflags |= MB_COMP_INUSE;
3640 } else {
3641 mca->mca_uflags &= ~MB_COMP_INUSE;
3642 }
3643 lck_mtx_unlock(mbuf_mlock);
3644
3645 list = list->obj_next;
3646 }
3647 }
3648
3649 static void
m_vm_error_stats(uint32_t * cnt,uint64_t * ts,uint64_t * size,uint64_t alloc_size,kern_return_t error)3650 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
3651 uint64_t alloc_size, kern_return_t error)
3652 {
3653 *cnt = *cnt + 1;
3654 *ts = net_uptime();
3655 if (size) {
3656 *size = alloc_size;
3657 }
3658 switch (error) {
3659 case KERN_SUCCESS:
3660 break;
3661 case KERN_INVALID_ARGUMENT:
3662 mb_kmem_stats[0]++;
3663 break;
3664 case KERN_INVALID_ADDRESS:
3665 mb_kmem_stats[1]++;
3666 break;
3667 case KERN_RESOURCE_SHORTAGE:
3668 mb_kmem_stats[2]++;
3669 break;
3670 case KERN_NO_SPACE:
3671 mb_kmem_stats[3]++;
3672 break;
3673 case KERN_FAILURE:
3674 mb_kmem_stats[4]++;
3675 break;
3676 default:
3677 mb_kmem_stats[5]++;
3678 break;
3679 }
3680 }
3681
3682 static vm_offset_t
kmem_mb_alloc(vm_map_t mbmap,int size,int physContig,kern_return_t * err)3683 kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
3684 {
3685 vm_offset_t addr = 0;
3686 kern_return_t kr = KERN_SUCCESS;
3687
3688 if (!physContig) {
3689 kr = kmem_alloc(mbmap, &addr, size,
3690 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3691 } else {
3692 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
3693 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3694 }
3695
3696 if (kr != KERN_SUCCESS) {
3697 addr = 0;
3698 }
3699 if (err) {
3700 *err = kr;
3701 }
3702
3703 return addr;
3704 }
3705
3706 /*
3707 * Allocate some number of mbuf clusters and place on cluster freelist.
3708 */
3709 static int
m_clalloc(const u_int32_t num,const int wait,const u_int32_t bufsize)3710 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3711 {
3712 int i, count = 0;
3713 vm_size_t size = 0;
3714 int numpages = 0, large_buffer;
3715 vm_offset_t page = 0;
3716 mcache_audit_t *mca_list = NULL;
3717 mcache_obj_t *con_list = NULL;
3718 mcl_slab_t *sp;
3719 mbuf_class_t class;
3720 kern_return_t error;
3721
3722 /* Set if a buffer allocation needs allocation of multiple pages */
3723 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3724 PAGE_SIZE < M16KCLBYTES);
3725 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3726 bufsize == m_maxsize(MC_16KCL));
3727
3728 VERIFY((bufsize == PAGE_SIZE) ||
3729 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3730
3731 if (bufsize == m_size(MC_BIGCL)) {
3732 class = MC_BIGCL;
3733 } else {
3734 class = MC_16KCL;
3735 }
3736
3737 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3738
3739 /*
3740 * Multiple threads may attempt to populate the cluster map one
3741 * after another. Since we drop the lock below prior to acquiring
3742 * the physical page(s), our view of the cluster map may no longer
3743 * be accurate, and we could end up over-committing the pages beyond
3744 * the maximum allowed for each class. To prevent it, this entire
3745 * operation (including the page mapping) is serialized.
3746 */
3747 while (mb_clalloc_busy) {
3748 mb_clalloc_waiters++;
3749 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3750 (PZERO - 1), "m_clalloc", NULL);
3751 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3752 }
3753
3754 /* We are busy now; tell everyone else to go away */
3755 mb_clalloc_busy = TRUE;
3756
3757 /*
3758 * Honor the caller's wish to block or not block. We have a way
3759 * to grow the pool asynchronously using the mbuf worker thread.
3760 */
3761 i = m_howmany(num, bufsize);
3762 if (i <= 0 || (wait & M_DONTWAIT)) {
3763 goto out;
3764 }
3765
3766 lck_mtx_unlock(mbuf_mlock);
3767
3768 size = round_page(i * bufsize);
3769 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3770
3771 /*
3772 * If we did ask for "n" 16KB physically contiguous chunks
3773 * and didn't get them, then please try again without this
3774 * restriction.
3775 */
3776 net_update_uptime();
3777 if (large_buffer && page == 0) {
3778 m_vm_error_stats(&mb_kmem_contig_failed,
3779 &mb_kmem_contig_failed_ts,
3780 &mb_kmem_contig_failed_size,
3781 size, error);
3782 page = kmem_mb_alloc(mb_map, size, 0, &error);
3783 }
3784
3785 if (page == 0) {
3786 m_vm_error_stats(&mb_kmem_failed,
3787 &mb_kmem_failed_ts,
3788 &mb_kmem_failed_size,
3789 size, error);
3790 #if PAGE_SIZE == 4096
3791 if (bufsize == m_maxsize(MC_BIGCL)) {
3792 #else
3793 if (bufsize >= m_maxsize(MC_BIGCL)) {
3794 #endif
3795 /* Try for 1 page if failed */
3796 size = PAGE_SIZE;
3797 page = kmem_mb_alloc(mb_map, size, 0, &error);
3798 if (page == 0) {
3799 m_vm_error_stats(&mb_kmem_one_failed,
3800 &mb_kmem_one_failed_ts,
3801 NULL, size, error);
3802 }
3803 }
3804
3805 if (page == 0) {
3806 lck_mtx_lock(mbuf_mlock);
3807 goto out;
3808 }
3809 }
3810
3811 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3812 numpages = size / PAGE_SIZE;
3813
3814 /* If auditing is enabled, allocate the audit structures now */
3815 if (mclaudit != NULL) {
3816 int needed;
3817
3818 /*
3819 * Yes, I realize this is a waste of memory for clusters
3820 * that never get transformed into mbufs, as we may end
3821 * up with NMBPG-1 unused audit structures per cluster.
3822 * But doing so tremendously simplifies the allocation
3823 * strategy, since at this point we are not holding the
3824 * mbuf lock and the caller is okay to be blocked.
3825 */
3826 if (bufsize == PAGE_SIZE) {
3827 needed = numpages * NMBPG;
3828
3829 i = mcache_alloc_ext(mcl_audit_con_cache,
3830 &con_list, needed, MCR_SLEEP);
3831
3832 VERIFY(con_list != NULL && i == needed);
3833 } else {
3834 /*
3835 * if multiple 4K pages are being used for a
3836 * 16K cluster
3837 */
3838 needed = numpages / NSLABSP16KB;
3839 }
3840
3841 i = mcache_alloc_ext(mcache_audit_cache,
3842 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3843
3844 VERIFY(mca_list != NULL && i == needed);
3845 }
3846
3847 lck_mtx_lock(mbuf_mlock);
3848
3849 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3850 ppnum_t offset =
3851 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3852 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3853
3854 /*
3855 * If there is a mapper the appropriate I/O page is
3856 * returned; zero out the page to discard its past
3857 * contents to prevent exposing leftover kernel memory.
3858 */
3859 VERIFY(offset < mcl_pages);
3860 if (mcl_paddr_base != 0) {
3861 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3862 new_page = IOMapperInsertPage(mcl_paddr_base,
3863 offset, new_page);
3864 }
3865 mcl_paddr[offset] = new_page;
3866
3867 /* Pattern-fill this fresh page */
3868 if (mclverify) {
3869 mcache_set_pattern(MCACHE_FREE_PATTERN,
3870 (caddr_t)page, PAGE_SIZE);
3871 }
3872 if (bufsize == PAGE_SIZE) {
3873 mcache_obj_t *buf;
3874 /* One for the entire page */
3875 sp = slab_get((void *)page);
3876 if (mclaudit != NULL) {
3877 mcl_audit_init((void *)page,
3878 &mca_list, &con_list,
3879 AUDIT_CONTENTS_SIZE, NMBPG);
3880 }
3881 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3882 slab_init(sp, class, SLF_MAPPED, (void *)page,
3883 (void *)page, PAGE_SIZE, 0, 1);
3884 buf = (mcache_obj_t *)page;
3885 buf->obj_next = NULL;
3886
3887 /* Insert this slab */
3888 slab_insert(sp, class);
3889
3890 /* Update stats now since slab_get drops the lock */
3891 ++m_infree(class);
3892 ++m_total(class);
3893 VERIFY(m_total(class) <= m_maxlimit(class));
3894 if (class == MC_BIGCL) {
3895 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3896 m_infree(MC_MBUF_BIGCL);
3897 mbstat.m_bigclusters = m_total(MC_BIGCL);
3898 }
3899 ++count;
3900 } else if ((bufsize > PAGE_SIZE) &&
3901 (i % NSLABSP16KB) == 0) {
3902 union m16kcluster *m16kcl = (union m16kcluster *)page;
3903 mcl_slab_t *nsp;
3904 int k;
3905
3906 /* One for the entire 16KB */
3907 sp = slab_get(m16kcl);
3908 if (mclaudit != NULL) {
3909 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3910 }
3911
3912 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3913 slab_init(sp, MC_16KCL, SLF_MAPPED,
3914 m16kcl, m16kcl, bufsize, 0, 1);
3915 m16kcl->m16kcl_next = NULL;
3916
3917 /*
3918 * 2nd-Nth page's slab is part of the first one,
3919 * where N is NSLABSP16KB.
3920 */
3921 for (k = 1; k < NSLABSP16KB; k++) {
3922 nsp = slab_get(((union mbigcluster *)page) + k);
3923 VERIFY(nsp->sl_refcnt == 0 &&
3924 nsp->sl_flags == 0);
3925 slab_init(nsp, MC_16KCL,
3926 SLF_MAPPED | SLF_PARTIAL,
3927 m16kcl, NULL, 0, 0, 0);
3928 }
3929 /* Insert this slab */
3930 slab_insert(sp, MC_16KCL);
3931
3932 /* Update stats now since slab_get drops the lock */
3933 ++m_infree(MC_16KCL);
3934 ++m_total(MC_16KCL);
3935 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3936 ++count;
3937 }
3938 }
3939 VERIFY(mca_list == NULL && con_list == NULL);
3940
3941 /* We're done; let others enter */
3942 mb_clalloc_busy = FALSE;
3943 if (mb_clalloc_waiters > 0) {
3944 mb_clalloc_waiters = 0;
3945 wakeup(mb_clalloc_waitchan);
3946 }
3947
3948 return count;
3949 out:
3950 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3951
3952 mtracelarge_register(size);
3953
3954 /* We're done; let others enter */
3955 mb_clalloc_busy = FALSE;
3956 if (mb_clalloc_waiters > 0) {
3957 mb_clalloc_waiters = 0;
3958 wakeup(mb_clalloc_waitchan);
3959 }
3960
3961 /*
3962 * When non-blocking we kick a thread if we have to grow the
3963 * pool or if the number of free clusters is less than requested.
3964 */
3965 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3966 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3967 m_cname(class), i);
3968 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3969 mbuf_worker_needs_wakeup = FALSE;
3970 }
3971 if (class == MC_BIGCL) {
3972 if (i > 0) {
3973 /*
3974 * Remember total number of 4KB clusters needed
3975 * at this time.
3976 */
3977 i += m_total(MC_BIGCL);
3978 if (i > m_region_expand(MC_BIGCL)) {
3979 m_region_expand(MC_BIGCL) = i;
3980 }
3981 }
3982 if (m_infree(MC_BIGCL) >= num) {
3983 return 1;
3984 }
3985 } else {
3986 if (i > 0) {
3987 /*
3988 * Remember total number of 16KB clusters needed
3989 * at this time.
3990 */
3991 i += m_total(MC_16KCL);
3992 if (i > m_region_expand(MC_16KCL)) {
3993 m_region_expand(MC_16KCL) = i;
3994 }
3995 }
3996 if (m_infree(MC_16KCL) >= num) {
3997 return 1;
3998 }
3999 }
4000 return 0;
4001 }
4002
4003 /*
4004 * Populate the global freelist of the corresponding buffer class.
4005 */
4006 static int
4007 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
4008 {
4009 mcache_obj_t *o = NULL;
4010 int i, numpages = 0, count;
4011 mbuf_class_t super_class;
4012
4013 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
4014 class == MC_16KCL);
4015
4016 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4017
4018 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
4019 PAGE_SIZE == m_maxsize(MC_16KCL));
4020
4021 if (m_maxsize(class) >= PAGE_SIZE) {
4022 return m_clalloc(num, wait, m_maxsize(class)) != 0;
4023 }
4024
4025 /*
4026 * The rest of the function will allocate pages and will slice
4027 * them up into the right size
4028 */
4029
4030 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
4031
4032 /* Currently assume that pages are 4K or 16K */
4033 if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
4034 super_class = MC_BIGCL;
4035 } else {
4036 super_class = MC_16KCL;
4037 }
4038
4039 i = m_clalloc(numpages, wait, m_maxsize(super_class));
4040
4041 /* how many objects will we cut the page into? */
4042 int numobj = PAGE_SIZE / m_maxsize(class);
4043
4044 for (count = 0; count < numpages; count++) {
4045 /* respect totals, minlimit, maxlimit */
4046 if (m_total(super_class) <= m_minlimit(super_class) ||
4047 m_total(class) >= m_maxlimit(class)) {
4048 break;
4049 }
4050
4051 if ((o = slab_alloc(super_class, wait)) == NULL) {
4052 break;
4053 }
4054
4055 struct mbuf *m = (struct mbuf *)o;
4056 union mcluster *c = (union mcluster *)o;
4057 union mbigcluster *mbc = (union mbigcluster *)o;
4058 mcl_slab_t *sp = slab_get(o);
4059 mcache_audit_t *mca = NULL;
4060
4061 /*
4062 * since one full page will be converted to MC_MBUF or
4063 * MC_CL, verify that the reference count will match that
4064 * assumption
4065 */
4066 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
4067 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
4068 /*
4069 * Make sure that the cluster is unmolested
4070 * while in freelist
4071 */
4072 if (mclverify) {
4073 mca = mcl_audit_buf2mca(super_class,
4074 (mcache_obj_t *)o);
4075 mcache_audit_free_verify(mca,
4076 (mcache_obj_t *)o, 0, m_maxsize(super_class));
4077 }
4078
4079 /* Reinitialize it as an mbuf or 2K or 4K slab */
4080 slab_init(sp, class, sp->sl_flags,
4081 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
4082
4083 VERIFY(sp->sl_head == NULL);
4084
4085 VERIFY(m_total(super_class) >= 1);
4086 m_total(super_class)--;
4087
4088 if (super_class == MC_BIGCL) {
4089 mbstat.m_bigclusters = m_total(MC_BIGCL);
4090 }
4091
4092 m_total(class) += numobj;
4093 VERIFY(m_total(class) <= m_maxlimit(class));
4094 m_infree(class) += numobj;
4095
4096 i = numobj;
4097 if (class == MC_MBUF) {
4098 mbstat.m_mbufs = m_total(MC_MBUF);
4099 mtype_stat_add(MT_FREE, NMBPG);
4100 while (i--) {
4101 /*
4102 * If auditing is enabled, construct the
4103 * shadow mbuf in the audit structure
4104 * instead of the actual one.
4105 * mbuf_slab_audit() will take care of
4106 * restoring the contents after the
4107 * integrity check.
4108 */
4109 if (mclaudit != NULL) {
4110 struct mbuf *ms;
4111 mca = mcl_audit_buf2mca(MC_MBUF,
4112 (mcache_obj_t *)m);
4113 ms = MCA_SAVED_MBUF_PTR(mca);
4114 ms->m_type = MT_FREE;
4115 } else {
4116 m->m_type = MT_FREE;
4117 }
4118 m->m_next = sp->sl_head;
4119 sp->sl_head = (void *)m++;
4120 }
4121 } else if (class == MC_CL) { /* MC_CL */
4122 mbstat.m_clfree =
4123 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
4124 mbstat.m_clusters = m_total(MC_CL);
4125 while (i--) {
4126 c->mcl_next = sp->sl_head;
4127 sp->sl_head = (void *)c++;
4128 }
4129 } else {
4130 VERIFY(class == MC_BIGCL);
4131 mbstat.m_bigclusters = m_total(MC_BIGCL);
4132 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
4133 m_infree(MC_MBUF_BIGCL);
4134 while (i--) {
4135 mbc->mbc_next = sp->sl_head;
4136 sp->sl_head = (void *)mbc++;
4137 }
4138 }
4139
4140 /* Insert into the mbuf or 2k or 4k slab list */
4141 slab_insert(sp, class);
4142
4143 if ((i = mb_waiters) > 0) {
4144 mb_waiters = 0;
4145 }
4146 if (i != 0) {
4147 mbwdog_logger("waking up all threads");
4148 wakeup(mb_waitchan);
4149 }
4150 }
4151 return count != 0;
4152 }
4153
4154 /*
4155 * For each class, initialize the freelist to hold m_minlimit() objects.
4156 */
4157 static void
4158 freelist_init(mbuf_class_t class)
4159 {
4160 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4161
4162 VERIFY(class == MC_CL || class == MC_BIGCL);
4163 VERIFY(m_total(class) == 0);
4164 VERIFY(m_minlimit(class) > 0);
4165
4166 while (m_total(class) < m_minlimit(class)) {
4167 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
4168 }
4169
4170 VERIFY(m_total(class) >= m_minlimit(class));
4171 }
4172
4173 /*
4174 * (Inaccurately) check if it might be worth a trip back to the
4175 * mcache layer due the availability of objects there. We'll
4176 * end up back here if there's nothing up there.
4177 */
4178 static boolean_t
4179 mbuf_cached_above(mbuf_class_t class, int wait)
4180 {
4181 switch (class) {
4182 case MC_MBUF:
4183 if (wait & MCR_COMP) {
4184 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
4185 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4186 }
4187 break;
4188
4189 case MC_CL:
4190 if (wait & MCR_COMP) {
4191 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
4192 }
4193 break;
4194
4195 case MC_BIGCL:
4196 if (wait & MCR_COMP) {
4197 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4198 }
4199 break;
4200
4201 case MC_16KCL:
4202 if (wait & MCR_COMP) {
4203 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
4204 }
4205 break;
4206
4207 case MC_MBUF_CL:
4208 case MC_MBUF_BIGCL:
4209 case MC_MBUF_16KCL:
4210 break;
4211
4212 default:
4213 VERIFY(0);
4214 /* NOTREACHED */
4215 }
4216
4217 return !mcache_bkt_isempty(m_cache(class));
4218 }
4219
4220 /*
4221 * If possible, convert constructed objects to raw ones.
4222 */
4223 static boolean_t
4224 mbuf_steal(mbuf_class_t class, unsigned int num)
4225 {
4226 mcache_obj_t *top = NULL;
4227 mcache_obj_t **list = ⊤
4228 unsigned int tot = 0;
4229
4230 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4231
4232 switch (class) {
4233 case MC_MBUF:
4234 case MC_CL:
4235 case MC_BIGCL:
4236 case MC_16KCL:
4237 return FALSE;
4238
4239 case MC_MBUF_CL:
4240 case MC_MBUF_BIGCL:
4241 case MC_MBUF_16KCL:
4242 /* Get the required number of constructed objects if possible */
4243 if (m_infree(class) > m_minlimit(class)) {
4244 tot = cslab_alloc(class, &list,
4245 MIN(num, m_infree(class)));
4246 }
4247
4248 /* And destroy them to get back the raw objects */
4249 if (top != NULL) {
4250 (void) cslab_free(class, top, 1);
4251 }
4252 break;
4253
4254 default:
4255 VERIFY(0);
4256 /* NOTREACHED */
4257 }
4258
4259 return tot == num;
4260 }
4261
/*
 * Reclaim raw objects for "class" by cannibalizing related classes.
 *
 * All other related classes are marked as potential victims; objects
 * are first stolen from their freelists via mbuf_steal(), and only if
 * that falls short are the protocol domains drained and the per-CPU
 * cache layers purged as a last resort.  "comp" is TRUE when the caller
 * itself wants a composite object of this class, in which case the
 * class's own composite pool is spared from purging.
 *
 * Called with mbuf_mlock held; the lock is dropped and reacquired
 * around the drain/purge/reap operations.
 */
static void
m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
{
	int m, bmap = 0;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));

	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
	switch (class) {
	case MC_MBUF:
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_CL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		break;

	case MC_CL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_BIGCL)++;
		m_wantpurge(MC_MBUF_BIGCL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_CL)++;
		}
		break;

	case MC_BIGCL:
		m_wantpurge(MC_MBUF)++;
		m_wantpurge(MC_CL)++;
		m_wantpurge(MC_MBUF_CL)++;
		if (!comp) {
			m_wantpurge(MC_MBUF_BIGCL)++;
		}
		break;

	case MC_16KCL:
		if (!comp) {
			m_wantpurge(MC_MBUF_16KCL)++;
		}
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			m_wantpurge(m) = 0;
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num)) {
				bmap |= (1 << m);
			}
		}
	}

	/* Drop the lock: purge/drain/reap may block or take other locks */
	lck_mtx_unlock(mbuf_mlock);

	if (bmap != 0) {
		/* signal the domains to drain */
		net_drain_domains();

		/* Sigh; we have no other choices but to ask mcache to purge */
		for (m = 0; m < NELEM(mbuf_table); m++) {
			if ((bmap & (1 << m)) &&
			    mcache_purge_cache(m_cache(m), TRUE)) {
				/* Reacquire only to update the counters */
				lck_mtx_lock(mbuf_mlock);
				m_purge_cnt(m)++;
				mbstat.m_drain++;
				lck_mtx_unlock(mbuf_mlock);
			}
		}
	} else {
		/*
		 * Request mcache to reap extra elements from all of its caches;
		 * note that all reaps are serialized and happen only at a fixed
		 * interval.
		 */
		mcache_reap();
	}
	lck_mtx_lock(mbuf_mlock);
}
4361 #endif /* CONFIG_MBUF_MCACHE */
4362
/*
 * Common allocator behind m_get()/m_gethdr() and the retry variants:
 * allocate one mbuf (from the MC_MBUF mcache, or the zone allocator
 * when mcache support is compiled out), initialize it as a plain or
 * packet-header mbuf of "type", and update the type statistics.
 * Returns NULL if the allocation failed.
 */
static inline struct mbuf *
m_get_common(int wait, short type, int hdr)
{
	struct mbuf *m;

#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry? If so, then try harder */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
#else
	m = mz_alloc(wait);
#endif /* CONFIG_MBUF_MCACHE */
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		/* One more object of "type" in use, one fewer free */
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
	}
	return m;
}
4387
4388 /*
4389 * Space allocation routines; these are also available as macros
4390 * for critical paths.
4391 */
4392 #define _M_GET(wait, type) m_get_common(wait, type, 0)
4393 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
4394 #define _M_RETRY(wait, type) _M_GET(wait, type)
4395 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
4396 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
4397 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
4398
struct mbuf *
m_get(int wait, int type)
{
	/* Same as _M_GET(): allocate a plain (non-pkthdr) mbuf */
	return m_get_common(wait, type, 0);
}
4404
struct mbuf *
m_gethdr(int wait, int type)
{
	/* Same as _M_GETHDR(): allocate a packet-header mbuf */
	return m_get_common(wait, type, 1);
}
4410
struct mbuf *
m_retry(int wait, int type)
{
	/* The retry path is identical to a regular allocation */
	return m_get_common(wait, type, 0);
}
4416
struct mbuf *
m_retryhdr(int wait, int type)
{
	/* The retry path is identical to a regular pkthdr allocation */
	return m_get_common(wait, type, 1);
}
4422
4423 struct mbuf *
4424 m_getclr(int wait, int type)
4425 {
4426 struct mbuf *m;
4427
4428 _MGET(m, wait, type);
4429 if (m != NULL) {
4430 bzero(MTOD(m, caddr_t), MLEN);
4431 }
4432 return m;
4433 }
4434
/*
 * Drop one paired reference on the external cluster of a paired mbuf.
 *
 * Returns 1 when the caller must NOT free the cluster itself: either
 * other paired references remain, or the pair's free routine has been
 * invoked right here.  Returns 0 when the unpair has occurred (or had
 * already occurred on another thread) and the caller should now drop
 * its own reference on the cluster as usual.
 */
static int
m_free_paired(struct mbuf *m)
{
	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));

	/* Order the MEXT_PMBUF load against prior updates */
	os_atomic_thread_fence(seq_cst);
	if (MEXT_PMBUF(m) == m) {
		/*
		 * Paired ref count might be negative in case we lose
		 * against another thread clearing MEXT_PMBUF, in the
		 * event it occurs after the above memory barrier sync.
		 * In that case just ignore as things have been unpaired.
		 */
		int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
		if (prefcnt > 1) {
			/* Still paired; nothing else to do here */
			return 1;
		} else if (prefcnt == 1) {
			/* Last paired reference: run the pair's free routine */
			m_ext_free_func_t m_free_func = m_get_ext_free(m);
			VERIFY(m_free_func != NULL);
			(*m_free_func)(m->m_ext.ext_buf,
			    m->m_ext.ext_size, m_get_ext_arg(m));
			return 1;
		} else if (prefcnt == 0) {
			VERIFY(MBUF_IS_PAIRED(m));

			/*
			 * Restore minref to its natural value, so that
			 * the caller will be able to free the cluster
			 * as appropriate.
			 */
			MEXT_MINREF(m) = 0;

			/*
			 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
			 * as it is immutable. atomic_set_ptr also causes
			 * memory barrier sync.
			 */
			os_atomic_store(&MEXT_PMBUF(m), NULL, release);

			/* Restore the stock free routine for the cluster size */
			switch (m->m_ext.ext_size) {
			case MCLBYTES:
				m_set_ext(m, m_get_rfa(m), NULL, NULL);
				break;

			case MBIGCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
				break;

			case M16KCLBYTES:
				m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}
		}
	}

	/*
	 * Tell caller the unpair has occurred, and that the reference
	 * count on the external cluster held for the paired mbuf should
	 * now be dropped.
	 */
	return 0;
}
4501
/*
 * Free a single mbuf and any cluster attached to it, returning the
 * successor (m->m_next).  Packet-header metadata (tags, TX completion
 * callbacks) is released first; an attached cluster is freed, returned
 * to its composite cache, or merely de-referenced depending on its
 * reference count.  Panics if the mbuf is already free.
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE) {
		panic("m_free: freeing an already freed mbuf");
	}

	if (m->m_flags & M_PKTHDR) {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);

		m_do_tx_compl_callback(m, NULL);
	}

	if (m->m_flags & M_EXT) {
		/* Paired clusters take a dedicated release path */
		if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
			return n;
		}
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference to a standalone cluster: free it */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				/* Caller-supplied free routine */
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mcache_free(ref_cache, m_get_rfa(m));
#else
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				/* Caller-supplied free routine */
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
			mz_ref_free(m_get_rfa(m));
#endif /* CONFIG_MBUF_MCACHE */
			m_set_ext(m, NULL, NULL, NULL);
		} else if (refcnt == minref && composite) {
			VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Reset the mbuf but keep the cluster attached */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;
			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

#if CONFIG_MBUF_MCACHE
			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
#else
			/* "Free" into the intermediate cache */
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
#endif /* CONFIG_MBUF_MCACHE */
			return n;
		}
	}

	mtype_stat_dec(m->m_type);
	mtype_stat_inc(MT_FREE);

	/* Scrub the mbuf and return it to the mbuf allocator */
	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_MBUF), m);
#else
	mz_free(m);
#endif /* CONFIG_MBUF_MCACHE */

	return n;
}
4621
/*
 * Attach an external buffer "extbuf" (freed via "extfree"/"extarg") to
 * an mbuf.  If "m" is NULL a fresh packet-header mbuf of "type" is
 * allocated; if "m" already carries a cluster, that cluster is released
 * (or its composite pair returned to the cache) first.  When "pair" is
 * nonzero the mbuf and buffer are bound together as a paired mbuf
 * (EXTF_PAIRED).  Returns NULL on allocation failure or if "m" is
 * already paired to another cluster.
 */
__private_extern__ struct mbuf *
m_clattach(struct mbuf *m, int type, caddr_t extbuf,
    void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
    int wait, int pair)
{
	struct ext_ref *rfa = NULL;

	/*
	 * If pairing is requested and an existing mbuf is provided, reject
	 * it if it's already been paired to another cluster.  Otherwise,
	 * allocate a new one or free any existing below.
	 */
	if ((m != NULL && MBUF_IS_PAIRED(m)) ||
	    (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
		return NULL;
	}

	if (m->m_flags & M_EXT) {
		/*
		 * Make sure that we don't touch any ext_ref
		 * member after we decrement the reference count
		 * since that may lead to use-after-free
		 * when we do not hold the last reference.
		 */
		const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
		VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
		const m_ext_free_func_t m_free_func = m_get_ext_free(m);
		const uint16_t minref = MEXT_MINREF(m);
		const uint16_t refcnt = m_decref(m);

		if (refcnt == minref && !composite) {
			/* Last reference to the old standalone cluster */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_BIGCL),
				    m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mcache_free(m_cache(MC_16KCL),
				    m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
#else
			if (m_free_func == NULL) {
				mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
			} else if (m_free_func == m_bigfree) {
				mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
			} else if (m_free_func == m_16kfree) {
				mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
			} else {
				(*m_free_func)(m->m_ext.ext_buf,
				    m->m_ext.ext_size, m_get_ext_arg(m));
			}
#endif /* CONFIG_MBUF_MCACHE */
			/* Re-use the reference structure */
			rfa = m_get_rfa(m);
		} else if (refcnt == minref && composite) {
			VERIFY(m->m_type != MT_FREE);

			mtype_stat_dec(m->m_type);
			mtype_stat_inc(MT_FREE);

			/* Reset the mbuf before returning the pair */
			m->m_type = MT_FREE;
			m->m_flags = M_EXT;
			m->m_len = 0;
			m->m_next = m->m_nextpkt = NULL;

			/*
			 * MEXT_FLAGS is safe to access here
			 * since we are now sure that we held
			 * the last reference to ext_ref.
			 */
			MEXT_FLAGS(m) &= ~EXTF_READONLY;

			/* "Free" into the intermediate cache */
#if CONFIG_MBUF_MCACHE
			if (m_free_func == NULL) {
				mcache_free(m_cache(MC_MBUF_CL), m);
			} else if (m_free_func == m_bigfree) {
				mcache_free(m_cache(MC_MBUF_BIGCL), m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mcache_free(m_cache(MC_MBUF_16KCL), m);
			}
#else
			if (m_free_func == NULL) {
				mz_composite_free(MC_MBUF_CL, m);
			} else if (m_free_func == m_bigfree) {
				mz_composite_free(MC_MBUF_BIGCL, m);
			} else {
				VERIFY(m_free_func == m_16kfree);
				mz_composite_free(MC_MBUF_16KCL, m);
			}
#endif /* CONFIG_MBUF_MCACHE */
			/*
			 * Allocate a new mbuf, since we didn't divorce
			 * the composite mbuf + cluster pair above.
			 */
			if ((m = _M_GETHDR(wait, type)) == NULL) {
				return NULL;
			}
		}
	}

	/* Allocate a reference structure unless one was recycled above */
#if CONFIG_MBUF_MCACHE
	if (rfa == NULL &&
	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		m_free(m);
		return NULL;
	}
#else
	if (rfa == NULL &&
	    (rfa = mz_ref_alloc(wait)) == NULL) {
		m_free(m);
		return NULL;
	}
#endif /* CONFIG_MBUF_MCACHE */

	if (!pair) {
		MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
		    0, 1, 0, 0, 0, NULL);
	} else {
		/* Paired: the mbuf itself is the ext arg and pair anchor */
		MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
		    1, 1, 1, EXTF_PAIRED, 0, m);
	}

	return m;
}
4752
4753 /*
4754 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
4755 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
4756 */
4757 struct mbuf *
4758 m_getcl(int wait, int type, int flags)
4759 {
4760 struct mbuf *m = NULL;
4761 int hdr = (flags & M_PKTHDR);
4762
4763 #if CONFIG_MBUF_MCACHE
4764 int mcflags = MSLEEPF(wait);
4765
4766 /* Is this due to a non-blocking retry? If so, then try harder */
4767 if (mcflags & MCR_NOSLEEP) {
4768 mcflags |= MCR_TRYHARD;
4769 }
4770
4771 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4772 #else
4773 m = mz_composite_alloc(MC_MBUF_CL, wait);
4774 #endif /* CONFIG_MBUF_MCACHE */
4775 if (m != NULL) {
4776 u_int16_t flag;
4777 struct ext_ref *rfa;
4778 void *cl;
4779
4780 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4781 cl = m->m_ext.ext_buf;
4782 rfa = m_get_rfa(m);
4783
4784 ASSERT(cl != NULL && rfa != NULL);
4785 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4786
4787 flag = MEXT_FLAGS(m);
4788
4789 MBUF_INIT(m, hdr, type);
4790 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4791
4792 mtype_stat_inc(type);
4793 mtype_stat_dec(MT_FREE);
4794 }
4795 return m;
4796 }
4797
/*
 * m_mclget() add an mbuf cluster to a normal mbuf.
 *
 * Attaches a freshly allocated 2KB cluster to "m".  On failure the
 * mbuf is returned unchanged (without M_EXT); callers must check
 * m->m_ext.ext_buf / M_EXT to detect failure.
 */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;

	/* Get the reference structure first; bail early if unavailable */
#if CONFIG_MBUF_MCACHE
	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		return m;
	}
#else
	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}
#endif /* CONFIG_MBUF_MCACHE */
	m->m_ext.ext_buf = m_mclalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* No cluster; give the unused reference structure back */
#if CONFIG_MBUF_MCACHE
		mcache_free(ref_cache, rfa);
#else
		mz_ref_free(rfa);
#endif /* CONFIG_MBUF_MCACHE */
	}

	return m;
}
4826
/*
 * Allocate an mbuf cluster (2KB).  Returns NULL on failure.
 */
caddr_t
m_mclalloc(int wait)
{
#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry? If so, then try harder */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	return mcache_alloc(m_cache(MC_CL), mcflags);
#else
	return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
#endif /* CONFIG_MBUF_MCACHE */
}
4844
/*
 * Free an mbuf cluster (2KB) previously obtained via m_mclalloc().
 */
void
m_mclfree(caddr_t p)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_CL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_2K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4855
4856 /*
4857 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
4858 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4859 */
4860 int
4861 m_mclhasreference(struct mbuf *m)
4862 {
4863 if (!(m->m_flags & M_EXT)) {
4864 return 0;
4865 }
4866
4867 ASSERT(m_get_rfa(m) != NULL);
4868
4869 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4870 }
4871
/*
 * Allocate a 4KB mbuf cluster.  Returns NULL on failure.
 */
__private_extern__ caddr_t
m_bigalloc(int wait)
{
#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry? If so, then try harder */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	return mcache_alloc(m_cache(MC_BIGCL), mcflags);
#else
	return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
#endif /* CONFIG_MBUF_MCACHE */
}
4888
/*
 * Free a 4KB mbuf cluster; also used as the ext_free routine for
 * 4KB-backed mbufs ("size" and "arg" are part of that signature but
 * unused here).
 */
__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_BIGCL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_4K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4898
/*
 * m_mbigget() add an 4KB mbuf cluster to a normal mbuf.
 *
 * On failure "m" is returned unchanged (without M_EXT); callers must
 * check m->m_ext.ext_buf / M_EXT to detect failure.
 */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;

	/* Get the reference structure first; bail early if unavailable */
#if CONFIG_MBUF_MCACHE
	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		return m;
	}
#else
	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}
#endif /* CONFIG_MBUF_MCACHE */
	m->m_ext.ext_buf = m_bigalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* No cluster; give the unused reference structure back */
#if CONFIG_MBUF_MCACHE
		mcache_free(ref_cache, rfa);
#else
		mz_ref_free(rfa);
#endif /* CONFIG_MBUF_MCACHE */
	}
	return m;
}
4926
/*
 * Allocate a 16KB mbuf cluster.  Returns NULL on failure.
 */
__private_extern__ caddr_t
m_16kalloc(int wait)
{
#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry? If so, then try harder */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	return mcache_alloc(m_cache(MC_16KCL), mcflags);
#else
	return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
#endif /* CONFIG_MBUF_MCACHE */
}
4943
/*
 * Free a 16KB mbuf cluster; also used as the ext_free routine for
 * 16KB-backed mbufs ("size" and "arg" are part of that signature but
 * unused here).
 */
__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
#if CONFIG_MBUF_MCACHE
	mcache_free(m_cache(MC_16KCL), p);
#else
	mz_cl_free(ZONE_ID_CLUSTER_16K, p);
#endif /* CONFIG_MBUF_MCACHE */
}
4953
/*
 * m_m16kget() add a 16KB mbuf cluster to a normal mbuf.
 *
 * On failure "m" is returned unchanged (without M_EXT); callers must
 * check m->m_ext.ext_buf / M_EXT to detect failure.
 */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa = NULL;

	/* Get the reference structure first; bail early if unavailable */
#if CONFIG_MBUF_MCACHE
	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
		return m;
	}
#else
	if ((rfa = mz_ref_alloc(wait)) == NULL) {
		return m;
	}
#endif /* CONFIG_MBUF_MCACHE */
	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		/* No cluster; give the unused reference structure back */
#if CONFIG_MBUF_MCACHE
		mcache_free(ref_cache, rfa);
#else
		mz_ref_free(rfa);
#endif /* CONFIG_MBUF_MCACHE */
	}

	return m;
}
4982
4983 /*
4984 * "Move" mbuf pkthdr from "from" to "to".
4985 * "from" must have M_PKTHDR set, and "to" must be empty.
4986 */
4987 void
4988 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4989 {
4990 VERIFY(from->m_flags & M_PKTHDR);
4991
4992 /* Check for scratch area overflow */
4993 m_redzone_verify(from);
4994
4995 if (to->m_flags & M_PKTHDR) {
4996 /* Check for scratch area overflow */
4997 m_redzone_verify(to);
4998 /* We will be taking over the tags of 'to' */
4999 m_tag_delete_chain(to);
5000 }
5001 to->m_pkthdr = from->m_pkthdr; /* especially tags */
5002 m_classifier_init(from, 0); /* purge classifier info */
5003 m_tag_init(from, 1); /* purge all tags from src */
5004 m_scratch_init(from); /* clear src scratch area */
5005 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5006 if ((to->m_flags & M_EXT) == 0) {
5007 to->m_data = (uintptr_t)to->m_pktdat;
5008 }
5009 m_redzone_init(to); /* setup red zone on dst */
5010 }
5011
5012 /*
5013 * Duplicate "from"'s mbuf pkthdr in "to".
5014 * "from" must have M_PKTHDR set, and "to" must be empty.
5015 * In particular, this does a deep copy of the packet tags.
5016 */
5017 int
5018 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
5019 {
5020 VERIFY(from->m_flags & M_PKTHDR);
5021
5022 /* Check for scratch area overflow */
5023 m_redzone_verify(from);
5024
5025 if (to->m_flags & M_PKTHDR) {
5026 /* Check for scratch area overflow */
5027 m_redzone_verify(to);
5028 /* We will be taking over the tags of 'to' */
5029 m_tag_delete_chain(to);
5030 }
5031 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5032 if ((to->m_flags & M_EXT) == 0) {
5033 to->m_data = (uintptr_t)to->m_pktdat;
5034 }
5035 to->m_pkthdr = from->m_pkthdr;
5036 /* clear TX completion flag so the callback is not called in the copy */
5037 to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
5038 m_redzone_init(to); /* setup red zone on dst */
5039 m_tag_init(to, 0); /* preserve dst static tags */
5040 return m_tag_copy_chain(to, from, how);
5041 }
5042
5043 void
5044 m_copy_pftag(struct mbuf *to, struct mbuf *from)
5045 {
5046 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
5047 #if PF_ECN
5048 m_pftag(to)->pftag_hdr = NULL;
5049 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
5050 #endif /* PF_ECN */
5051 }
5052
5053 void
5054 m_copy_necptag(struct mbuf *to, struct mbuf *from)
5055 {
5056 memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
5057 }
5058
5059 void
5060 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
5061 {
5062 VERIFY(m->m_flags & M_PKTHDR);
5063
5064 m->m_pkthdr.pkt_proto = 0;
5065 m->m_pkthdr.pkt_flowsrc = 0;
5066 m->m_pkthdr.pkt_flowid = 0;
5067 m->m_pkthdr.pkt_ext_flags = 0;
5068 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
5069 /* preserve service class and interface info for loopback packets */
5070 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
5071 (void) m_set_service_class(m, MBUF_SC_BE);
5072 }
5073 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
5074 m->m_pkthdr.pkt_ifainfo = 0;
5075 }
5076 /*
5077 * Preserve timestamp if requested
5078 */
5079 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
5080 m->m_pkthdr.pkt_timestamp = 0;
5081 }
5082 }
5083
5084 void
5085 m_copy_classifier(struct mbuf *to, struct mbuf *from)
5086 {
5087 VERIFY(to->m_flags & M_PKTHDR);
5088 VERIFY(from->m_flags & M_PKTHDR);
5089
5090 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
5091 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
5092 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
5093 to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
5094 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
5095 to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
5096 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
5097 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
5098 }
5099
5100 /*
5101 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5102 * if wantall is not set, return whatever number were available. Set up the
5103 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
5104 * are chained on the m_nextpkt field. Any packets requested beyond this
5105 * are chained onto the last packet header's m_next field. The size of
5106 * the cluster is controlled by the parameter bufsize.
5107 */
5108 __private_extern__ struct mbuf *
5109 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
5110 int wait, int wantall, size_t bufsize)
5111 {
5112 struct mbuf *m = NULL;
5113 struct mbuf **np, *top;
5114 unsigned int pnum, needed = *num_needed;
5115 #if CONFIG_MBUF_MCACHE
5116 mcache_obj_t *mp_list = NULL;
5117 int mcflags = MSLEEPF(wait);
5118 mcache_t *cp;
5119 #else
5120 zstack_t mp_list = {};
5121 mbuf_class_t class = MC_MBUF_CL;
5122 #endif /* CONFIG_MBUF_MCACHE */
5123 u_int16_t flag;
5124 struct ext_ref *rfa;
5125 void *cl;
5126
5127 ASSERT(bufsize == m_maxsize(MC_CL) ||
5128 bufsize == m_maxsize(MC_BIGCL) ||
5129 bufsize == m_maxsize(MC_16KCL));
5130
5131 /*
5132 * Caller must first check for njcl because this
5133 * routine is internal and not exposed/used via KPI.
5134 */
5135 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
5136
5137 top = NULL;
5138 np = ⊤
5139 pnum = 0;
5140
5141 /*
5142 * The caller doesn't want all the requested buffers; only some.
5143 * Try hard to get what we can, but don't block. This effectively
5144 * overrides MCR_SLEEP, since this thread will not go to sleep
5145 * if we can't get all the buffers.
5146 */
5147 #if CONFIG_MBUF_MCACHE
5148 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5149 mcflags |= MCR_TRYHARD;
5150 }
5151
5152 /* Allocate the composite mbuf + cluster elements from the cache */
5153 if (bufsize == m_maxsize(MC_CL)) {
5154 cp = m_cache(MC_MBUF_CL);
5155 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5156 cp = m_cache(MC_MBUF_BIGCL);
5157 } else {
5158 cp = m_cache(MC_MBUF_16KCL);
5159 }
5160 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
5161 #else
5162 if (!wantall || (wait & Z_NOWAIT)) {
5163 wait &= ~Z_NOWAIT;
5164 wait |= Z_NOPAGEWAIT;
5165 }
5166
5167 /* Allocate the composite mbuf + cluster elements from the cache */
5168 if (bufsize == m_maxsize(MC_CL)) {
5169 class = MC_MBUF_CL;
5170 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5171 class = MC_MBUF_BIGCL;
5172 } else {
5173 class = MC_MBUF_16KCL;
5174 }
5175 mp_list = mz_composite_alloc_n(class, needed, wait);
5176 needed = zstack_count(mp_list);
5177 #endif /* CONFIG_MBUF_MCACHE */
5178
5179 for (pnum = 0; pnum < needed; pnum++) {
5180 #if CONFIG_MBUF_MCACHE
5181 m = (struct mbuf *)mp_list;
5182 mp_list = mp_list->obj_next;
5183 #else
5184 m = zstack_pop(&mp_list);
5185 #endif /* CONFIG_MBUF_MCACHE */
5186
5187 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5188 cl = m->m_ext.ext_buf;
5189 rfa = m_get_rfa(m);
5190
5191 ASSERT(cl != NULL && rfa != NULL);
5192 VERIFY(MBUF_IS_COMPOSITE(m));
5193
5194 flag = MEXT_FLAGS(m);
5195
5196 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
5197 if (bufsize == m_maxsize(MC_16KCL)) {
5198 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5199 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5200 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5201 } else {
5202 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5203 }
5204
5205 if (num_with_pkthdrs > 0) {
5206 --num_with_pkthdrs;
5207 }
5208
5209 *np = m;
5210 if (num_with_pkthdrs > 0) {
5211 np = &m->m_nextpkt;
5212 } else {
5213 np = &m->m_next;
5214 }
5215 }
5216 #if CONFIG_MBUF_MCACHE
5217 ASSERT(pnum != *num_needed || mp_list == NULL);
5218 if (mp_list != NULL) {
5219 mcache_free_ext(cp, mp_list);
5220 }
5221 #else
5222 ASSERT(pnum != *num_needed || zstack_empty(mp_list));
5223 if (!zstack_empty(mp_list)) {
5224 mz_composite_free_n(class, mp_list);
5225 }
5226 #endif /* CONFIG_MBUF_MCACHE */
5227 if (pnum > 0) {
5228 mtype_stat_add(MT_DATA, pnum);
5229 mtype_stat_sub(MT_FREE, pnum);
5230 }
5231
5232 if (wantall && (pnum != *num_needed)) {
5233 if (top != NULL) {
5234 m_freem_list(top);
5235 }
5236 return NULL;
5237 }
5238
5239 if (pnum > *num_needed) {
5240 printf("%s: File a radar related to <rdar://10146739>. \
5241 needed = %u, pnum = %u, num_needed = %u \n",
5242 __func__, needed, pnum, *num_needed);
5243 }
5244 *num_needed = pnum;
5245
5246 return top;
5247 }
5248
5249 /*
5250 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
5251 * wantall is not set, return whatever number were available. The size of
5252 * each mbuf in the list is controlled by the parameter packetlen. Each
5253 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
5254 * in the chain is called a segment. If maxsegments is not null and the
5255 * value pointed to is not null, this specify the maximum number of segments
5256 * for a chain of mbufs. If maxsegments is zero or the value pointed to
5257 * is zero the caller does not have any restriction on the number of segments.
5258 * The actual number of segments of a mbuf chain is return in the value
5259 * pointed to by maxsegments.
5260 */
5261 __private_extern__ struct mbuf *
5262 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
5263 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
5264 {
5265 struct mbuf **np, *top, *first = NULL;
5266 size_t bufsize, r_bufsize;
5267 unsigned int num = 0;
5268 unsigned int nsegs = 0;
5269 unsigned int needed = 0, resid;
5270 #if CONFIG_MBUF_MCACHE
5271 int mcflags = MSLEEPF(wait);
5272 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
5273 mcache_t *cp = NULL, *rcp = NULL;
5274 #else
5275 zstack_t mp_list = {}, rmp_list = {};
5276 mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
5277 #endif /* CONFIG_MBUF_MCACHE */
5278
5279 if (*numlist == 0) {
5280 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
5281 return NULL;
5282 }
5283
5284 top = NULL;
5285 np = ⊤
5286
5287 if (wantsize == 0) {
5288 if (packetlen <= MINCLSIZE) {
5289 bufsize = packetlen;
5290 } else if (packetlen > m_maxsize(MC_CL)) {
5291 /* Use 4KB if jumbo cluster pool isn't available */
5292 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
5293 bufsize = m_maxsize(MC_BIGCL);
5294 } else {
5295 bufsize = m_maxsize(MC_16KCL);
5296 }
5297 } else {
5298 bufsize = m_maxsize(MC_CL);
5299 }
5300 } else if (wantsize == m_maxsize(MC_CL) ||
5301 wantsize == m_maxsize(MC_BIGCL) ||
5302 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
5303 bufsize = wantsize;
5304 } else {
5305 *numlist = 0;
5306 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
5307 return NULL;
5308 }
5309
5310 if (bufsize <= MHLEN) {
5311 nsegs = 1;
5312 } else if (bufsize <= MINCLSIZE) {
5313 if (maxsegments != NULL && *maxsegments == 1) {
5314 bufsize = m_maxsize(MC_CL);
5315 nsegs = 1;
5316 } else {
5317 nsegs = 2;
5318 }
5319 } else if (bufsize == m_maxsize(MC_16KCL)) {
5320 VERIFY(njcl > 0);
5321 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
5322 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5323 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
5324 } else {
5325 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
5326 }
5327 if (maxsegments != NULL) {
5328 if (*maxsegments && nsegs > *maxsegments) {
5329 *maxsegments = nsegs;
5330 *numlist = 0;
5331 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
5332 return NULL;
5333 }
5334 *maxsegments = nsegs;
5335 }
5336
5337 /*
5338 * The caller doesn't want all the requested buffers; only some.
5339 * Try hard to get what we can, but don't block. This effectively
5340 * overrides MCR_SLEEP, since this thread will not go to sleep
5341 * if we can't get all the buffers.
5342 */
5343 #if CONFIG_MBUF_MCACHE
5344 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5345 mcflags |= MCR_TRYHARD;
5346 }
5347 #else
5348 if (!wantall || (wait & Z_NOWAIT)) {
5349 wait &= ~Z_NOWAIT;
5350 wait |= Z_NOPAGEWAIT;
5351 }
5352 #endif /* !CONFIG_MBUF_MCACHE */
5353
5354 /*
5355 * Simple case where all elements in the lists/chains are mbufs.
5356 * Unless bufsize is greater than MHLEN, each segment chain is made
5357 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
5358 * of 2 mbufs; the second one is used for the residual data, i.e.
5359 * the remaining data that cannot fit into the first mbuf.
5360 */
5361 if (bufsize <= MINCLSIZE) {
5362 /* Allocate the elements in one shot from the mbuf cache */
5363 ASSERT(bufsize <= MHLEN || nsegs == 2);
5364 #if CONFIG_MBUF_MCACHE
5365 cp = m_cache(MC_MBUF);
5366 needed = mcache_alloc_ext(cp, &mp_list,
5367 (*numlist) * nsegs, mcflags);
5368 #else
5369 class = MC_MBUF;
5370 mp_list = mz_alloc_n((*numlist) * nsegs, wait);
5371 needed = zstack_count(mp_list);
5372 #endif /* CONFIG_MBUF_MCACHE */
5373
5374 /*
5375 * The number of elements must be even if we are to use an
5376 * mbuf (instead of a cluster) to store the residual data.
5377 * If we couldn't allocate the requested number of mbufs,
5378 * trim the number down (if it's odd) in order to avoid
5379 * creating a partial segment chain.
5380 */
5381 if (bufsize > MHLEN && (needed & 0x1)) {
5382 needed--;
5383 }
5384
5385 while (num < needed) {
5386 struct mbuf *m = NULL;
5387
5388 #if CONFIG_MBUF_MCACHE
5389 m = (struct mbuf *)mp_list;
5390 mp_list = mp_list->obj_next;
5391 #else
5392 m = zstack_pop(&mp_list);
5393 #endif /* CONFIG_MBUF_MCACHE */
5394 ASSERT(m != NULL);
5395
5396 MBUF_INIT(m, 1, MT_DATA);
5397 num++;
5398 if (bufsize > MHLEN) {
5399 /* A second mbuf for this segment chain */
5400 #if CONFIG_MBUF_MCACHE
5401 m->m_next = (struct mbuf *)mp_list;
5402 mp_list = mp_list->obj_next;
5403 #else
5404 m->m_next = zstack_pop(&mp_list);
5405 #endif /* CONFIG_MBUF_MCACHE */
5406
5407 ASSERT(m->m_next != NULL);
5408
5409 MBUF_INIT(m->m_next, 0, MT_DATA);
5410 num++;
5411 }
5412 *np = m;
5413 np = &m->m_nextpkt;
5414 }
5415 #if CONFIG_MBUF_MCACHE
5416 ASSERT(num != *numlist || mp_list == NULL);
5417 #else
5418 ASSERT(num != *numlist || zstack_empty(mp_list));
5419 #endif /* CONFIG_MBUF_MCACHE */
5420
5421 if (num > 0) {
5422 mtype_stat_add(MT_DATA, num);
5423 mtype_stat_sub(MT_FREE, num);
5424 }
5425 num /= nsegs;
5426
5427 /* We've got them all; return to caller */
5428 if (num == *numlist) {
5429 return top;
5430 }
5431
5432 goto fail;
5433 }
5434
5435 /*
5436 * Complex cases where elements are made up of one or more composite
5437 * mbufs + cluster, depending on packetlen. Each N-segment chain can
5438 * be illustrated as follows:
5439 *
5440 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
5441 *
5442 * Every composite mbuf + cluster element comes from the intermediate
5443 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
5444 * the last composite element will come from the MC_MBUF_CL cache,
5445 * unless the residual data is larger than 2KB where we use the
5446 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
5447 * data is defined as extra data beyond the first element that cannot
5448 * fit into the previous element, i.e. there is no residual data if
5449 * the chain only has 1 segment.
5450 */
5451 r_bufsize = bufsize;
5452 resid = packetlen > bufsize ? packetlen % bufsize : 0;
5453 if (resid > 0) {
5454 /* There is residual data; figure out the cluster size */
5455 if (wantsize == 0 && packetlen > MINCLSIZE) {
5456 /*
5457 * Caller didn't request that all of the segments
5458 * in the chain use the same cluster size; use the
5459 * smaller of the cluster sizes.
5460 */
5461 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
5462 r_bufsize = m_maxsize(MC_16KCL);
5463 } else if (resid > m_maxsize(MC_CL)) {
5464 r_bufsize = m_maxsize(MC_BIGCL);
5465 } else {
5466 r_bufsize = m_maxsize(MC_CL);
5467 }
5468 } else {
5469 /* Use the same cluster size as the other segments */
5470 resid = 0;
5471 }
5472 }
5473
5474 needed = *numlist;
5475 if (resid > 0) {
5476 /*
5477 * Attempt to allocate composite mbuf + cluster elements for
5478 * the residual data in each chain; record the number of such
5479 * elements that can be allocated so that we know how many
5480 * segment chains we can afford to create.
5481 */
5482 #if CONFIG_MBUF_MCACHE
5483 if (r_bufsize <= m_maxsize(MC_CL)) {
5484 rcp = m_cache(MC_MBUF_CL);
5485 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5486 rcp = m_cache(MC_MBUF_BIGCL);
5487 } else {
5488 rcp = m_cache(MC_MBUF_16KCL);
5489 }
5490 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
5491 #else
5492 if (r_bufsize <= m_maxsize(MC_CL)) {
5493 rclass = MC_MBUF_CL;
5494 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5495 rclass = MC_MBUF_BIGCL;
5496 } else {
5497 rclass = MC_MBUF_16KCL;
5498 }
5499 rmp_list = mz_composite_alloc_n(rclass, *numlist, wait);
5500 needed = zstack_count(rmp_list);
5501 #endif /* CONFIG_MBUF_MCACHE */
5502 if (needed == 0) {
5503 goto fail;
5504 }
5505
5506 /* This is temporarily reduced for calculation */
5507 ASSERT(nsegs > 1);
5508 nsegs--;
5509 }
5510
5511 /*
5512 * Attempt to allocate the rest of the composite mbuf + cluster
5513 * elements for the number of segment chains that we need.
5514 */
5515 #if CONFIG_MBUF_MCACHE
5516 if (bufsize <= m_maxsize(MC_CL)) {
5517 cp = m_cache(MC_MBUF_CL);
5518 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5519 cp = m_cache(MC_MBUF_BIGCL);
5520 } else {
5521 cp = m_cache(MC_MBUF_16KCL);
5522 }
5523 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
5524 #else
5525 if (bufsize <= m_maxsize(MC_CL)) {
5526 class = MC_MBUF_CL;
5527 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5528 class = MC_MBUF_BIGCL;
5529 } else {
5530 class = MC_MBUF_16KCL;
5531 }
5532 mp_list = mz_composite_alloc_n(class, needed * nsegs, wait);
5533 needed = zstack_count(mp_list);
5534 #endif /* CONFIG_MBUF_MCACHE */
5535
5536 /* Round it down to avoid creating a partial segment chain */
5537 needed = (needed / nsegs) * nsegs;
5538 if (needed == 0) {
5539 goto fail;
5540 }
5541
5542 if (resid > 0) {
5543 /*
5544 * We're about to construct the chain(s); take into account
5545 * the number of segments we have created above to hold the
5546 * residual data for each chain, as well as restore the
5547 * original count of segments per chain.
5548 */
5549 ASSERT(nsegs > 0);
5550 needed += needed / nsegs;
5551 nsegs++;
5552 }
5553
5554 for (;;) {
5555 struct mbuf *m = NULL;
5556 u_int16_t flag;
5557 struct ext_ref *rfa;
5558 void *cl;
5559 int pkthdr;
5560 m_ext_free_func_t m_free_func;
5561
5562 ++num;
5563
5564 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
5565 #if CONFIG_MBUF_MCACHE
5566 m = (struct mbuf *)mp_list;
5567 mp_list = mp_list->obj_next;
5568 #else
5569 m = zstack_pop(&mp_list);
5570 #endif /* CONFIG_MBUF_MCACHE */
5571 } else {
5572 #if CONFIG_MBUF_MCACHE
5573 m = (struct mbuf *)rmp_list;
5574 rmp_list = rmp_list->obj_next;
5575 #else
5576 m = zstack_pop(&rmp_list);
5577 #endif /* CONFIG_MBUF_MCACHE */
5578 }
5579 m_free_func = m_get_ext_free(m);
5580 ASSERT(m != NULL);
5581 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5582 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
5583 m_free_func == m_16kfree);
5584
5585 cl = m->m_ext.ext_buf;
5586 rfa = m_get_rfa(m);
5587
5588 ASSERT(cl != NULL && rfa != NULL);
5589 VERIFY(MBUF_IS_COMPOSITE(m));
5590
5591 flag = MEXT_FLAGS(m);
5592
5593 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
5594 if (pkthdr) {
5595 first = m;
5596 }
5597 MBUF_INIT(m, pkthdr, MT_DATA);
5598 if (m_free_func == m_16kfree) {
5599 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5600 } else if (m_free_func == m_bigfree) {
5601 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5602 } else {
5603 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5604 }
5605
5606 *np = m;
5607 if ((num % nsegs) == 0) {
5608 np = &first->m_nextpkt;
5609 } else {
5610 np = &m->m_next;
5611 }
5612
5613 if (num == needed) {
5614 break;
5615 }
5616 }
5617
5618 if (num > 0) {
5619 mtype_stat_add(MT_DATA, num);
5620 mtype_stat_sub(MT_FREE, num);
5621 }
5622
5623 num /= nsegs;
5624
5625 /* We've got them all; return to caller */
5626 if (num == *numlist) {
5627 #if CONFIG_MBUF_MCACHE
5628 ASSERT(mp_list == NULL && rmp_list == NULL);
5629 #else
5630 ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
5631 #endif /* CONFIG_MBUF_MCACHE */
5632 return top;
5633 }
5634
5635 fail:
5636 /* Free up what's left of the above */
5637 #if CONFIG_MBUF_MCACHE
5638 if (mp_list != NULL) {
5639 mcache_free_ext(cp, mp_list);
5640 }
5641 if (rmp_list != NULL) {
5642 mcache_free_ext(rcp, rmp_list);
5643 }
5644 #else
5645 if (!zstack_empty(mp_list)) {
5646 if (class == MC_MBUF) {
5647 /* No need to elide, these mbufs came from the cache. */
5648 mz_free_n(mp_list);
5649 } else {
5650 mz_composite_free_n(class, mp_list);
5651 }
5652 }
5653 if (!zstack_empty(rmp_list)) {
5654 mz_composite_free_n(rclass, rmp_list);
5655 }
5656 #endif /* CONFIG_MBUF_MCACHE */
5657 if (wantall && top != NULL) {
5658 m_freem_list(top);
5659 *numlist = 0;
5660 return NULL;
5661 }
5662 *numlist = num;
5663 return top;
5664 }
5665
5666 /*
5667 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
5668 * packets on receive ring.
5669 */
5670 __private_extern__ struct mbuf *
5671 m_getpacket_how(int wait)
5672 {
5673 unsigned int num_needed = 1;
5674
5675 return m_getpackets_internal(&num_needed, 1, wait, 1,
5676 m_maxsize(MC_CL));
5677 }
5678
5679 /*
5680 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
5681 * packets on receive ring.
5682 */
5683 struct mbuf *
5684 m_getpacket(void)
5685 {
5686 unsigned int num_needed = 1;
5687
5688 return m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
5689 m_maxsize(MC_CL));
5690 }
5691
5692 /*
5693 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5694 * if this can't be met, return whatever number were available. Set up the
5695 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
5696 * are chained on the m_nextpkt field. Any packets requested beyond this are
5697 * chained onto the last packet header's m_next field.
5698 */
5699 struct mbuf *
5700 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
5701 {
5702 unsigned int n = num_needed;
5703
5704 return m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
5705 m_maxsize(MC_CL));
5706 }
5707
5708 /*
5709 * Return a list of mbuf hdrs set up as packet hdrs chained together
5710 * on the m_nextpkt field
5711 */
5712 struct mbuf *
5713 m_getpackethdrs(int num_needed, int how)
5714 {
5715 struct mbuf *m;
5716 struct mbuf **np, *top;
5717
5718 top = NULL;
5719 np = ⊤
5720
5721 while (num_needed--) {
5722 m = _M_RETRYHDR(how, MT_DATA);
5723 if (m == NULL) {
5724 break;
5725 }
5726
5727 *np = m;
5728 np = &m->m_nextpkt;
5729 }
5730
5731 return top;
5732 }
5733
5734 /*
5735 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
5736 * for mbufs packets freed. Used by the drivers.
5737 */
5738 int
5739 m_freem_list(struct mbuf *m)
5740 {
5741 struct mbuf *nextpkt;
5742 #if CONFIG_MBUF_MCACHE
5743 mcache_obj_t *mp_list = NULL;
5744 mcache_obj_t *mcl_list = NULL;
5745 mcache_obj_t *mbc_list = NULL;
5746 mcache_obj_t *m16k_list = NULL;
5747 mcache_obj_t *m_mcl_list = NULL;
5748 mcache_obj_t *m_mbc_list = NULL;
5749 mcache_obj_t *m_m16k_list = NULL;
5750 mcache_obj_t *ref_list = NULL;
5751 #else
5752 zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
5753 m16k_list = {}, m_mcl_list = {},
5754 m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
5755 #endif /* CONFIG_MBUF_MCACHE */
5756 int pktcount = 0;
5757 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
5758
5759 while (m != NULL) {
5760 pktcount++;
5761
5762 nextpkt = m->m_nextpkt;
5763 m->m_nextpkt = NULL;
5764
5765 while (m != NULL) {
5766 struct mbuf *next = m->m_next;
5767 #if CONFIG_MBUF_MCACHE
5768 mcache_obj_t *o, *rfa;
5769 #else
5770 void *cl = NULL;
5771 #endif /* CONFIG_MBUF_MCACHE */
5772 if (m->m_type == MT_FREE) {
5773 panic("m_free: freeing an already freed mbuf");
5774 }
5775
5776 if (m->m_flags & M_PKTHDR) {
5777 /* Check for scratch area overflow */
5778 m_redzone_verify(m);
5779 /* Free the aux data and tags if there is any */
5780 m_tag_delete_chain(m);
5781 m_do_tx_compl_callback(m, NULL);
5782 }
5783
5784 if (!(m->m_flags & M_EXT)) {
5785 mt_free++;
5786 goto simple_free;
5787 }
5788
5789 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
5790 m = next;
5791 continue;
5792 }
5793
5794 mt_free++;
5795
5796 #if CONFIG_MBUF_MCACHE
5797 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
5798 #else
5799 cl = m->m_ext.ext_buf;
5800 #endif /* CONFIG_MBUF_MCACHE */
5801 /*
5802 * Make sure that we don't touch any ext_ref
5803 * member after we decrement the reference count
5804 * since that may lead to use-after-free
5805 * when we do not hold the last reference.
5806 */
5807 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
5808 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
5809 const uint16_t minref = MEXT_MINREF(m);
5810 const uint16_t refcnt = m_decref(m);
5811 if (refcnt == minref && !composite) {
5812 #if CONFIG_MBUF_MCACHE
5813 if (m_free_func == NULL) {
5814 o->obj_next = mcl_list;
5815 mcl_list = o;
5816 } else if (m_free_func == m_bigfree) {
5817 o->obj_next = mbc_list;
5818 mbc_list = o;
5819 } else if (m_free_func == m_16kfree) {
5820 o->obj_next = m16k_list;
5821 m16k_list = o;
5822 } else {
5823 (*(m_free_func))((caddr_t)o,
5824 m->m_ext.ext_size,
5825 m_get_ext_arg(m));
5826 }
5827 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
5828 rfa->obj_next = ref_list;
5829 ref_list = rfa;
5830 #else
5831 if (m_free_func == NULL) {
5832 zstack_push(&mcl_list, cl);
5833 } else if (m_free_func == m_bigfree) {
5834 zstack_push(&mbc_list, cl);
5835 } else if (m_free_func == m_16kfree) {
5836 zstack_push(&m16k_list, cl);
5837 } else {
5838 (*(m_free_func))((caddr_t)cl,
5839 m->m_ext.ext_size,
5840 m_get_ext_arg(m));
5841 }
5842 zstack_push(&ref_list, m_get_rfa(m));
5843 #endif /* CONFIG_MBUF_MCACHE */
5844 m_set_ext(m, NULL, NULL, NULL);
5845 } else if (refcnt == minref && composite) {
5846 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
5847 /*
5848 * Amortize the costs of atomic operations
5849 * by doing them at the end, if possible.
5850 */
5851 if (m->m_type == MT_DATA) {
5852 mt_data++;
5853 } else if (m->m_type == MT_HEADER) {
5854 mt_header++;
5855 } else if (m->m_type == MT_SONAME) {
5856 mt_soname++;
5857 } else if (m->m_type == MT_TAG) {
5858 mt_tag++;
5859 } else {
5860 mtype_stat_dec(m->m_type);
5861 }
5862
5863 m->m_type = MT_FREE;
5864 m->m_flags = M_EXT;
5865 m->m_len = 0;
5866 m->m_next = m->m_nextpkt = NULL;
5867
5868 /*
5869 * MEXT_FLAGS is safe to access here
5870 * since we are now sure that we held
5871 * the last reference to ext_ref.
5872 */
5873 MEXT_FLAGS(m) &= ~EXTF_READONLY;
5874
5875 /* "Free" into the intermediate cache */
5876 #if CONFIG_MBUF_MCACHE
5877 o = (mcache_obj_t *)m;
5878 if (m_free_func == NULL) {
5879 o->obj_next = m_mcl_list;
5880 m_mcl_list = o;
5881 } else if (m_free_func == m_bigfree) {
5882 o->obj_next = m_mbc_list;
5883 m_mbc_list = o;
5884 } else {
5885 VERIFY(m_free_func == m_16kfree);
5886 o->obj_next = m_m16k_list;
5887 m_m16k_list = o;
5888 }
5889 #else
5890 if (m_free_func == NULL) {
5891 zstack_push(&m_mcl_list, m);
5892 } else if (m_free_func == m_bigfree) {
5893 zstack_push(&m_mbc_list, m);
5894 } else {
5895 VERIFY(m_free_func == m_16kfree);
5896 zstack_push(&m_m16k_list, m);
5897 }
5898 #endif /* CONFIG_MBUF_MCACHE */
5899 m = next;
5900 continue;
5901 }
5902 simple_free:
5903 /*
5904 * Amortize the costs of atomic operations
5905 * by doing them at the end, if possible.
5906 */
5907 if (m->m_type == MT_DATA) {
5908 mt_data++;
5909 } else if (m->m_type == MT_HEADER) {
5910 mt_header++;
5911 } else if (m->m_type == MT_SONAME) {
5912 mt_soname++;
5913 } else if (m->m_type == MT_TAG) {
5914 mt_tag++;
5915 } else if (m->m_type != MT_FREE) {
5916 mtype_stat_dec(m->m_type);
5917 }
5918
5919 m->m_type = MT_FREE;
5920 m->m_flags = m->m_len = 0;
5921 m->m_next = m->m_nextpkt = NULL;
5922
5923 #if CONFIG_MBUF_MCACHE
5924 ((mcache_obj_t *)m)->obj_next = mp_list;
5925 mp_list = (mcache_obj_t *)m;
5926 #else
5927 m_elide(m);
5928 zstack_push(&mp_list, m);
5929 #endif /* CONFIG_MBUF_MCACHE */
5930
5931 m = next;
5932 }
5933
5934 m = nextpkt;
5935 }
5936
5937 if (mt_free > 0) {
5938 mtype_stat_add(MT_FREE, mt_free);
5939 }
5940 if (mt_data > 0) {
5941 mtype_stat_sub(MT_DATA, mt_data);
5942 }
5943 if (mt_header > 0) {
5944 mtype_stat_sub(MT_HEADER, mt_header);
5945 }
5946 if (mt_soname > 0) {
5947 mtype_stat_sub(MT_SONAME, mt_soname);
5948 }
5949 if (mt_tag > 0) {
5950 mtype_stat_sub(MT_TAG, mt_tag);
5951 }
5952 #if CONFIG_MBUF_MCACHE
5953 if (mp_list != NULL) {
5954 mcache_free_ext(m_cache(MC_MBUF), mp_list);
5955 }
5956 if (mcl_list != NULL) {
5957 mcache_free_ext(m_cache(MC_CL), mcl_list);
5958 }
5959 if (mbc_list != NULL) {
5960 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
5961 }
5962 if (m16k_list != NULL) {
5963 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
5964 }
5965 if (m_mcl_list != NULL) {
5966 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
5967 }
5968 if (m_mbc_list != NULL) {
5969 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
5970 }
5971 if (m_m16k_list != NULL) {
5972 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
5973 }
5974 if (ref_list != NULL) {
5975 mcache_free_ext(ref_cache, ref_list);
5976 }
5977 #else
5978 if (!zstack_empty(mp_list)) {
5979 /* mbufs elided above. */
5980 mz_free_n(mp_list);
5981 }
5982 if (!zstack_empty(mcl_list)) {
5983 zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
5984 }
5985 if (!zstack_empty(mbc_list)) {
5986 zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
5987 }
5988 if (!zstack_empty(m16k_list)) {
5989 zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
5990 }
5991 if (!zstack_empty(m_mcl_list)) {
5992 mz_composite_free_n(MC_MBUF_CL, m_mcl_list);
5993 }
5994 if (!zstack_empty(m_mbc_list)) {
5995 mz_composite_free_n(MC_MBUF_BIGCL, m_mbc_list);
5996 }
5997 if (!zstack_empty(m_m16k_list)) {
5998 mz_composite_free_n(MC_MBUF_16KCL, m_m16k_list);
5999 }
6000 if (!zstack_empty(ref_list)) {
6001 zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
6002 }
6003 #endif /* CONFIG_MBUF_MCACHE */
6004
6005 return pktcount;
6006 }
6007
6008 /*
6009 * Wrapper around m_freem_list which captures the packet that's going to be
6010 * dropped. If funcname is NULL, that means we do not want to store both
6011 * function name and line number, and only the drop reason will be saved.
6012 * Make sure to pass the direction flag (DROPTAP_FLAG_DIR_OUT,
6013 * DROPTAP_FLAG_DIR_IN), or the packet will not be captured.
6014 */
6015 void
6016 m_drop_list(mbuf_t m, uint16_t flags, uint32_t reason, const char *funcname,
6017 uint16_t linenum)
6018 {
6019 struct mbuf *nextpkt;
6020 struct ifnet *ifp = NULL;
6021
6022 if (m == NULL) {
6023 return;
6024 }
6025
6026 if (__probable(droptap_total_tap_count == 0)) {
6027 m_freem_list(m);
6028 return;
6029 }
6030
6031 if (flags & DROPTAP_FLAG_DIR_OUT) {
6032 while (m != NULL) {
6033 uint16_t tmp_flags = flags;
6034
6035 nextpkt = m->m_nextpkt;
6036 if (m->m_pkthdr.pkt_hdr == NULL) {
6037 tmp_flags |= DROPTAP_FLAG_L2_MISSING;
6038 }
6039 droptap_output_mbuf(m, reason, funcname, linenum, tmp_flags,
6040 ifp);
6041 m = nextpkt;
6042 }
6043 } else if (flags & DROPTAP_FLAG_DIR_IN) {
6044 while (m != NULL) {
6045 char *frame_header;
6046 uint16_t tmp_flags = flags;
6047
6048 nextpkt = m->m_nextpkt;
6049 ifp = m->m_pkthdr.rcvif;
6050
6051 if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
6052 m->m_pkthdr.pkt_hdr != NULL) {
6053 frame_header = m->m_pkthdr.pkt_hdr;
6054 } else {
6055 frame_header = NULL;
6056 tmp_flags |= DROPTAP_FLAG_L2_MISSING;
6057 }
6058
6059 droptap_input_mbuf(m, reason, funcname, linenum, tmp_flags,
6060 ifp, frame_header);
6061 m = nextpkt;
6062 }
6063 }
6064 m_freem_list(m);
6065 }
6066
6067 void
6068 m_freem(struct mbuf *m)
6069 {
6070 while (m != NULL) {
6071 m = m_free(m);
6072 }
6073 }
6074
6075 /*
6076 * Wrapper around m_freem which captures the packet that's going to be dropped.
6077 * If funcname is NULL, that means we do not want to store both function name
6078 * and line number, and only the drop reason will be saved. Make sure to pass the
6079 * direction flag (DROPTAP_FLAG_DIR_OUT, DROPTAP_FLAG_DIR_IN), or the packet will
6080 * not be captured.
6081 */
6082 void
6083 m_drop(mbuf_t m, uint16_t flags, uint32_t reason, const char *funcname,
6084 uint16_t linenum)
6085 {
6086 struct ifnet *ifp = NULL;
6087
6088 if (m == NULL) {
6089 return;
6090 }
6091
6092 if (__probable(droptap_total_tap_count == 0)) {
6093 m_freem(m);
6094 return;
6095 }
6096
6097 if (flags & DROPTAP_FLAG_DIR_OUT) {
6098 droptap_output_mbuf(m, reason, funcname, linenum, flags, ifp);
6099 } else if (flags & DROPTAP_FLAG_DIR_IN) {
6100 char *frame_header;
6101
6102 ifp = m->m_pkthdr.rcvif;
6103
6104 if ((flags & DROPTAP_FLAG_L2_MISSING) == 0 &&
6105 m->m_pkthdr.pkt_hdr != NULL) {
6106 frame_header = m->m_pkthdr.pkt_hdr;
6107 } else {
6108 frame_header = NULL;
6109 flags |= DROPTAP_FLAG_L2_MISSING;
6110 }
6111
6112 droptap_input_mbuf(m, reason, funcname, linenum, flags, ifp,
6113 frame_header);
6114 }
6115 m_freem(m);
6116 }
6117
6118 /*
6119 * Mbuffer utility routines.
6120 */
6121 /*
6122 * Set the m_data pointer of a newly allocated mbuf to place an object of the
6123 * specified size at the end of the mbuf, longword aligned.
6124 *
6125 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
6126 * separate macros, each asserting that it was called at the proper moment.
6127 * This required callers to themselves test the storage type and call the
6128 * right one. Rather than require callers to be aware of those layout
6129 * decisions, we centralize here.
6130 */
6131 void
6132 m_align(struct mbuf *m, int len)
6133 {
6134 int adjust = 0;
6135
6136 /* At this point data must point to start */
6137 VERIFY(m->m_data == (uintptr_t)M_START(m));
6138 VERIFY(len >= 0);
6139 VERIFY(len <= M_SIZE(m));
6140 adjust = M_SIZE(m) - len;
6141 m->m_data += adjust & ~(sizeof(long) - 1);
6142 }
6143
6144 /*
6145 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
6146 * copy junk along. Does not adjust packet header length.
6147 */
6148 struct mbuf *
6149 m_prepend(struct mbuf *m, int len, int how)
6150 {
6151 struct mbuf *mn;
6152
6153 _MGET(mn, how, m->m_type);
6154 if (mn == NULL) {
6155 m_freem(m);
6156 return NULL;
6157 }
6158 if (m->m_flags & M_PKTHDR) {
6159 M_COPY_PKTHDR(mn, m);
6160 m->m_flags &= ~M_PKTHDR;
6161 }
6162 mn->m_next = m;
6163 m = mn;
6164 if (m->m_flags & M_PKTHDR) {
6165 VERIFY(len <= MHLEN);
6166 MH_ALIGN(m, len);
6167 } else {
6168 VERIFY(len <= MLEN);
6169 M_ALIGN(m, len);
6170 }
6171 m->m_len = len;
6172 return m;
6173 }
6174
6175 /*
6176 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
6177 * chain, copy junk along, and adjust length.
6178 */
6179 struct mbuf *
6180 m_prepend_2(struct mbuf *m, int len, int how, int align)
6181 {
6182 if (M_LEADINGSPACE(m) >= len &&
6183 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
6184 m->m_data -= len;
6185 m->m_len += len;
6186 } else {
6187 m = m_prepend(m, len, how);
6188 }
6189 if ((m) && (m->m_flags & M_PKTHDR)) {
6190 m->m_pkthdr.len += len;
6191 }
6192 return m;
6193 }
6194
6195 /*
6196 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
6197 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
6198 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
6199 *
6200 * The last mbuf and offset accessed are passed in and adjusted on return to
6201 * avoid having to iterate over the entire mbuf chain each time.
6202 */
6203 struct mbuf *
6204 m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
6205 struct mbuf **m_lastm, int *m_off, uint32_t mode)
6206 {
6207 struct mbuf *n, *mhdr = NULL, **np;
6208 int off = off0, len = len0;
6209 struct mbuf *top;
6210 int copyhdr = 0;
6211
6212 if (off < 0 || len < 0) {
6213 panic("m_copym: invalid offset %d or len %d", off, len);
6214 }
6215
6216 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
6217 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
6218
6219 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
6220 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
6221 mhdr = m;
6222 copyhdr = 1;
6223 }
6224
6225 if (m_lastm != NULL && *m_lastm != NULL) {
6226 if (off0 >= *m_off) {
6227 m = *m_lastm;
6228 off = off0 - *m_off;
6229 }
6230 }
6231
6232 while (off >= m->m_len) {
6233 off -= m->m_len;
6234 m = m->m_next;
6235 }
6236 np = ⊤
6237 top = NULL;
6238
6239 while (len > 0) {
6240 if (m == NULL) {
6241 if (len != M_COPYALL) {
6242 panic("m_copym: len != M_COPYALL");
6243 }
6244 break;
6245 }
6246
6247 if (copyhdr) {
6248 n = _M_RETRYHDR(wait, m->m_type);
6249 } else {
6250 n = _M_RETRY(wait, m->m_type);
6251 }
6252 *np = n;
6253
6254 if (n == NULL) {
6255 goto nospace;
6256 }
6257
6258 if (copyhdr != 0) {
6259 if ((mode == M_COPYM_MOVE_HDR) ||
6260 (mode == M_COPYM_MUST_MOVE_HDR)) {
6261 M_COPY_PKTHDR(n, mhdr);
6262 } else if ((mode == M_COPYM_COPY_HDR) ||
6263 (mode == M_COPYM_MUST_COPY_HDR)) {
6264 if (m_dup_pkthdr(n, mhdr, wait) == 0) {
6265 goto nospace;
6266 }
6267 }
6268 if (len == M_COPYALL) {
6269 n->m_pkthdr.len -= off0;
6270 } else {
6271 n->m_pkthdr.len = len;
6272 }
6273 copyhdr = 0;
6274 /*
6275 * There is data to copy from the packet header mbuf
6276 * if it is empty or it is before the starting offset
6277 */
6278 if (mhdr != m) {
6279 np = &n->m_next;
6280 continue;
6281 }
6282 }
6283 n->m_len = MIN(len, (m->m_len - off));
6284 if (m->m_flags & M_EXT) {
6285 n->m_ext = m->m_ext;
6286 m_incref(m);
6287 n->m_data = m->m_data + off;
6288 n->m_flags |= M_EXT;
6289 } else {
6290 /*
6291 * Limit to the capacity of the destination
6292 */
6293 if (n->m_flags & M_PKTHDR) {
6294 n->m_len = MIN(n->m_len, MHLEN);
6295 } else {
6296 n->m_len = MIN(n->m_len, MLEN);
6297 }
6298
6299 if (MTOD(n, char *) + n->m_len > ((char *)n) + _MSIZE) {
6300 panic("%s n %p copy overflow",
6301 __func__, n);
6302 }
6303
6304 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
6305 (unsigned)n->m_len);
6306 }
6307 if (len != M_COPYALL) {
6308 len -= n->m_len;
6309 }
6310
6311 if (len == 0) {
6312 if (m_lastm != NULL) {
6313 *m_lastm = m;
6314 *m_off = off0 + len0 - (off + n->m_len);
6315 }
6316 }
6317 off = 0;
6318 m = m->m_next;
6319 np = &n->m_next;
6320 }
6321
6322 return top;
6323 nospace:
6324 m_freem(top);
6325
6326 return NULL;
6327 }
6328
6329
/*
 * Convenience wrapper around m_copym_mode(): copy [off0, off0+len) from
 * the chain using M_COPYM_MOVE_HDR and no cached chain position.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	return m_copym_mode(m, off0, len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
}
6335
6336 /*
6337 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
6338 * within this routine also.
6339 *
6340 * The last mbuf and offset accessed are passed in and adjusted on return to
6341 * avoid having to iterate over the entire mbuf chain each time.
6342 */
struct mbuf *
m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
    struct mbuf **m_lastm, int *m_off, uint32_t mode)
{
	struct mbuf *m = m0, *n, **np = NULL;
	int off = off0, len = len0;
	struct mbuf *top = NULL;
#if CONFIG_MBUF_MCACHE
	int mcflags = MSLEEPF(wait);
	mcache_obj_t *list = NULL;
#else
	zstack_t list = {};
#endif /* CONFIG_MBUF_MCACHE */
	int copyhdr = 0;
	int type = 0;
	int needed = 0;

	if (off == 0 && (m->m_flags & M_PKTHDR)) {
		copyhdr = 1;
	}

	/*
	 * Resume from the cached position (*m_lastm / *m_off) when the
	 * requested offset lies at or beyond it, instead of rescanning
	 * the chain from the head.
	 */
	if (m_lastm != NULL && *m_lastm != NULL) {
		if (off0 >= *m_off) {
			m = *m_lastm;
			off = off0 - *m_off;
		}
	}

	/* Walk forward to the mbuf containing the starting offset. */
	while (off >= m->m_len) {
		off -= m->m_len;
		m = m->m_next;
	}

	/*
	 * First pass: count how many mbufs are needed to cover 'len'
	 * bytes, plus one extra (the header mbuf placed at the front of
	 * the new chain), so they can be allocated in a single batch.
	 */
	n = m;
	while (len > 0) {
		needed++;
		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
		n = n->m_next;
	}
	needed++;
	len = len0;

#if CONFIG_MBUF_MCACHE
	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP) {
		mcflags |= MCR_TRYHARD;
	}

	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
	    mcflags) != needed) {
		goto nospace;
	}
#else
	list = mz_alloc_n(needed, wait);
	if (zstack_count(list) != needed) {
		goto nospace;
	}
#endif /* CONFIG_MBUF_MCACHE */

	/* Second pass: initialize each new mbuf and copy/share the data. */
	needed = 0;
	while (len > 0) {
#if CONFIG_MBUF_MCACHE
		n = (struct mbuf *)list;
		list = list->obj_next;
#else
		n = zstack_pop(&list);
#endif /* CONFIG_MBUF_MCACHE */
		ASSERT(n != NULL && m != NULL);

		type = (top == NULL) ? MT_HEADER : m->m_type;
		MBUF_INIT(n, (top == NULL), type);

		if (top == NULL) {
			top = n;
			np = &top->m_next;
			continue;
		} else {
			needed++;
			*np = n;
		}

		if (copyhdr) {
			if ((mode == M_COPYM_MOVE_HDR) ||
			    (mode == M_COPYM_MUST_MOVE_HDR)) {
				M_COPY_PKTHDR(n, m);
			} else if ((mode == M_COPYM_COPY_HDR) ||
			    (mode == M_COPYM_MUST_COPY_HDR)) {
				if (m_dup_pkthdr(n, m, wait) == 0) {
#if !CONFIG_MBUF_MCACHE
					m_elide(n);
#endif
					goto nospace;
				}
			}
			n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = MIN(len, (m->m_len - off));

		if (m->m_flags & M_EXT) {
			/* Share the external cluster; bump its refcount. */
			n->m_ext = m->m_ext;
			m_incref(m);
			n->m_data = m->m_data + off;
			n->m_flags |= M_EXT;
		} else {
			if (m_mtod_end(n) > m_mtod_upper_bound(n)) {
				panic("%s n %p copy overflow",
				    __func__, n);
			}

			bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
			    (unsigned)n->m_len);
		}
		len -= n->m_len;

		if (len == 0) {
			/* Cache the final position for the next call. */
			if (m_lastm != NULL) {
				*m_lastm = m;
				*m_off = off0 + len0 - (off + n->m_len);
			}
			break;
		}
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}

	mtype_stat_inc(MT_HEADER);
	mtype_stat_add(type, needed);
	mtype_stat_sub(MT_FREE, needed + 1);

#if CONFIG_MBUF_MCACHE
	ASSERT(list == NULL);
#else
	ASSERT(zstack_empty(list));
#endif /* CONFIG_MBUF_MCACHE */

	return top;

nospace:
#if CONFIG_MBUF_MCACHE
	if (list != NULL) {
		mcache_free_ext(m_cache(MC_MBUF), list);
	}
#else
	if (!zstack_empty(list)) {
		/* No need to elide, these mbufs came from the cache. */
		mz_free_n(list);
	}
#endif /* CONFIG_MBUF_MCACHE */
	if (top != NULL) {
		m_freem(top);
	}
	return NULL;
}
6502
6503 /*
6504 * Copy data from an mbuf chain starting "off" bytes from the beginning,
6505 * continuing for "len" bytes, into the indicated buffer.
6506 */
6507 void
6508 m_copydata(struct mbuf *m, int off, int len, void *vp)
6509 {
6510 int off0 = off, len0 = len;
6511 struct mbuf *m0 = m;
6512 unsigned count;
6513 char *cp = vp;
6514
6515 if (__improbable(off < 0 || len < 0)) {
6516 panic("%s: invalid offset %d or len %d", __func__, off, len);
6517 /* NOTREACHED */
6518 }
6519
6520 while (off > 0) {
6521 if (__improbable(m == NULL)) {
6522 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6523 __func__, m0, off0, len0);
6524 /* NOTREACHED */
6525 }
6526 if (off < m->m_len) {
6527 break;
6528 }
6529 off -= m->m_len;
6530 m = m->m_next;
6531 }
6532 while (len > 0) {
6533 if (__improbable(m == NULL)) {
6534 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6535 __func__, m0, off0, len0);
6536 /* NOTREACHED */
6537 }
6538 count = MIN(m->m_len - off, len);
6539 bcopy(MTOD(m, caddr_t) + off, cp, count);
6540 len -= count;
6541 cp += count;
6542 off = 0;
6543 m = m->m_next;
6544 }
6545 }
6546
6547 /*
6548 * Concatenate mbuf chain n to m. Both chains must be of the same type
6549 * (e.g. MT_DATA). Any m_pkthdr is not updated.
6550 */
6551 void
6552 m_cat(struct mbuf *m, struct mbuf *n)
6553 {
6554 while (m->m_next) {
6555 m = m->m_next;
6556 }
6557 while (n) {
6558 if ((m->m_flags & M_EXT) ||
6559 m->m_data + m->m_len + n->m_len >= (uintptr_t)&m->m_dat[MLEN]) {
6560 /* just join the two chains */
6561 m->m_next = n;
6562 return;
6563 }
6564 /* splat the data from one into the other */
6565 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6566 (u_int)n->m_len);
6567 m->m_len += n->m_len;
6568 n = m_free(n);
6569 }
6570 }
6571
/*
 * Trim 'req_len' bytes from an mbuf chain: from the head when req_len is
 * positive, from the tail when negative.  m_pkthdr.len is adjusted when
 * the head mbuf carries a packet header.
 */
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL) {
		return;
	}
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				/* Entire mbuf consumed; leave it empty. */
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			/* (req_len - len) is the amount actually trimmed. */
			m->m_pkthdr.len -= (req_len - len);
		}
	} else {
		/*
		 * Trim from tail. Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return. Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0) {
				break;
			}
			m = m->m_next;
		}
		if (m->m_len >= len) {
			/* Fast path: last mbuf absorbs the whole trim. */
			m->m_len -= len;
			m = mp;
			if (m->m_flags & M_PKTHDR) {
				m->m_pkthdr.len -= len;
			}
			return;
		}
		count -= len;
		if (count < 0) {
			count = 0;
		}
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR) {
			m->m_pkthdr.len = count;
		}
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while ((m = m->m_next)) {
			m->m_len = 0;
		}
	}
}
6651
6652 /*
6653 * Rearange an mbuf chain so that len bytes are contiguous
6654 * and in the data area of an mbuf (so that mtod
6655 * will work for a structure of size len). Returns the resulting
6656 * mbuf chain on success, frees it and returns null on failure.
6657 * If there is room, it will add up to max_protohdr-len extra bytes to the
6658 * contiguous region in an attempt to avoid being called next time.
6659 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/* check invalid arguments */
	if (n == NULL) {
		panic("%s: n == NULL", __func__);
	}
	if (len < 0) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
		    __func__, len);
		goto bad;
	}
	if (len > MLEN) {
		os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
		    __func__, len);
		goto bad;
	}
	/* Reject a corrupted m_data pointer on a non-cluster mbuf. */
	if ((n->m_flags & M_EXT) == 0 &&
	    m_mtod_current(n) >= m_mtod_upper_bound(n)) {
		os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
		    __func__);
		goto bad;
	}

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    len < m_mtod_upper_bound(n) - m_mtod_current(n) && n->m_next != NULL) {
		if (n->m_len >= len) {
			/* Requested bytes are already contiguous. */
			return n;
		}
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN) {
			goto bad;
		}
		_MGET(m, M_DONTWAIT, n->m_type);
		if (m == 0) {
			goto bad;
		}
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR) {
			/* Move the packet header onto the new head mbuf. */
			M_COPY_PKTHDR(m, n);
			n->m_flags &= ~M_PKTHDR;
		}
	}
	/*
	 * Drain bytes from the rest of the chain into 'm', pulling up to
	 * max_protohdr extra bytes while space allows, freeing each source
	 * mbuf once emptied.
	 */
	space = m_mtod_upper_bound(m) - m_mtod_end(m);
	do {
		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len != 0) {
			n->m_data += count;
		} else {
			n = m_free(n);
		}
	} while (len > 0 && n != NULL);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);
	return 0;
}
6740
6741 /*
6742 * Like m_pullup(), except a new mbuf is always allocated, and we allow
6743 * the amount of empty space before the data in the new mbuf to be specified
6744 * (in the event that the caller expects to prepend later).
6745 */
__private_extern__ struct mbuf *
m_copyup(struct mbuf *n, int len, int dstoff)
{
	struct mbuf *m;
	int count, space;

	VERIFY(len >= 0 && dstoff >= 0);

	/* The requested bytes plus the leading gap must fit in one mbuf. */
	if (len > (MHLEN - dstoff)) {
		goto bad;
	}
	MGET(m, M_DONTWAIT, n->m_type);
	if (m == NULL) {
		goto bad;
	}
	m->m_len = 0;
	if (n->m_flags & M_PKTHDR) {
		/* Move the packet header onto the new head mbuf. */
		m_copy_pkthdr(m, n);
		n->m_flags &= ~M_PKTHDR;
	}
	/* Reserve 'dstoff' bytes of leading space for a later prepend. */
	m->m_data += dstoff;
	/*
	 * Drain bytes from the old chain into 'm' (same scheme as
	 * m_pullup), freeing each source mbuf once it is emptied.
	 */
	space = m_mtod_upper_bound(m) - m_mtod_end(m);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
		    (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len) {
			n->m_data += count;
		} else {
			n = m_free(n);
		}
	} while (len > 0 && n);
	if (len > 0) {
		(void) m_free(m);
		goto bad;
	}
	m->m_next = n;
	return m;
bad:
	m_freem(n);

	return NULL;
}
6793
6794 /*
6795 * Partition an mbuf chain in two pieces, returning the tail --
6796 * all but the first len0 bytes. In case of failure, it returns NULL and
6797 * attempts to restore the chain to its original state.
6798 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	/* Public entry point: always give the tail its own packet header. */
	return m_split0(m0, len0, wait, 1);
}
6804
static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	/*
	 * First iterate to the mbuf which contains the first byte of
	 * data at offset len0
	 */
	for (m = m0; m && len > m->m_len; m = m->m_next) {
		len -= m->m_len;
	}
	if (m == NULL) {
		/* Chain is shorter than len0; nothing to split off. */
		return NULL;
	}
	/*
	 * len effectively is now the offset in the current
	 * mbuf where we have to perform split.
	 *
	 * remain becomes the tail length.
	 * Note that len can also be == m->m_len
	 */
	remain = m->m_len - len;

	/*
	 * If current mbuf len contains the entire remaining offset len,
	 * just make the second mbuf chain pointing to next mbuf onwards
	 * and return after making necessary adjustments
	 */
	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_next = m->m_next;
		m->m_next = NULL;
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		return n;
	}
	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
		_MGETHDR(n, wait, m0->m_type);
		if (n == NULL) {
			return NULL;
		}
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;

		/*
		 * If current points to external storage
		 * then it can be shared by making last mbuf
		 * of head chain and first mbuf of current chain
		 * pointing to different data offsets
		 */
		if (m->m_flags & M_EXT) {
			goto extpacket;
		}
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void) m_free(n);
				return NULL;
			} else {
				return n;
			}
		} else {
			MH_ALIGN(n, remain);
		}
	} else if (remain == 0) {
		/* Split falls on an mbuf boundary: just unlink the tail. */
		n = m->m_next;
		m->m_next = NULL;
		return n;
	} else {
		_MGET(n, wait, m->m_type);
		if (n == NULL) {
			return NULL;
		}

		if ((m->m_flags & M_EXT) == 0) {
			VERIFY(remain <= MLEN);
			M_ALIGN(n, remain);
		}
	}
extpacket:
	if (m->m_flags & M_EXT) {
		/* Share the cluster: both halves point into the same data. */
		n->m_flags |= M_EXT;
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_data = m->m_data + len;
	} else {
		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return n;
}
6908
6909 /*
6910 * Routine to copy from device local memory into mbufs.
6911 */
6912 struct mbuf *
6913 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
6914 void (*copy)(const void *, void *, size_t))
6915 {
6916 struct mbuf *m;
6917 struct mbuf *top = NULL, **mp = ⊤
6918 int off = off0, len;
6919 char *cp;
6920 char *epkt;
6921
6922 cp = buf;
6923 epkt = cp + totlen;
6924 if (off) {
6925 /*
6926 * If 'off' is non-zero, packet is trailer-encapsulated,
6927 * so we have to skip the type and length fields.
6928 */
6929 cp += off + 2 * sizeof(u_int16_t);
6930 totlen -= 2 * sizeof(u_int16_t);
6931 }
6932 _MGETHDR(m, M_DONTWAIT, MT_DATA);
6933 if (m == NULL) {
6934 return NULL;
6935 }
6936 m->m_pkthdr.rcvif = ifp;
6937 m->m_pkthdr.len = totlen;
6938 m->m_len = MHLEN;
6939
6940 while (totlen > 0) {
6941 if (top != NULL) {
6942 _MGET(m, M_DONTWAIT, MT_DATA);
6943 if (m == NULL) {
6944 m_freem(top);
6945 return NULL;
6946 }
6947 m->m_len = MLEN;
6948 }
6949 len = MIN(totlen, epkt - cp);
6950 if (len >= MINCLSIZE) {
6951 MCLGET(m, M_DONTWAIT);
6952 if (m->m_flags & M_EXT) {
6953 m->m_len = len = MIN(len, m_maxsize(MC_CL));
6954 } else {
6955 /* give up when it's out of cluster mbufs */
6956 if (top != NULL) {
6957 m_freem(top);
6958 }
6959 m_freem(m);
6960 return NULL;
6961 }
6962 } else {
6963 /*
6964 * Place initial small packet/header at end of mbuf.
6965 */
6966 if (len < m->m_len) {
6967 if (top == NULL &&
6968 len + max_linkhdr <= m->m_len) {
6969 m->m_data += max_linkhdr;
6970 }
6971 m->m_len = len;
6972 } else {
6973 len = m->m_len;
6974 }
6975 }
6976 if (copy) {
6977 copy(cp, MTOD(m, caddr_t), (unsigned)len);
6978 } else {
6979 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
6980 }
6981 cp += len;
6982 *mp = m;
6983 mp = &m->m_next;
6984 totlen -= len;
6985 if (cp == epkt) {
6986 cp = buf;
6987 }
6988 }
6989 return top;
6990 }
6991
6992 #if CONFIG_MBUF_MCACHE
6993 #ifndef MBUF_GROWTH_NORMAL_THRESH
6994 #define MBUF_GROWTH_NORMAL_THRESH 25
6995 #endif
6996
6997 /*
6998 * Cluster freelist allocation check.
6999 */
static int
m_howmany(int num, size_t bufsize)
{
	int i = 0, j = 0;
	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
	u_int32_t sumclusters, freeclusters;
	u_int32_t percent_pool, percent_kmem;
	u_int32_t mb_growth, mb_growth_thresh;

	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
	    bufsize == m_maxsize(MC_16KCL));

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Numbers in 2K cluster units */
	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
	m_clusters = m_total(MC_CL);
	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclusters = m_total(MC_16KCL);
	sumclusters = m_mbclusters + m_clusters + m_bigclusters;

	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
	m_clfree = m_infree(MC_CL);
	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
	m_16kclfree = m_infree(MC_16KCL);
	freeclusters = m_mbfree + m_clfree + m_bigclfree;

	/* Bail if we've maxed out the mbuf memory map */
	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
		mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
		    sumclusters, nclusters,
		    (m_16kclusters << NCLPJCLSHIFT), njcl);
		return 0;
	}

	if (bufsize == m_maxsize(MC_BIGCL)) {
		/* Under minimum */
		if (m_bigclusters < m_minlimit(MC_BIGCL)) {
			return m_minlimit(MC_BIGCL) - m_bigclusters;
		}

		percent_pool =
		    ((sumclusters - freeclusters) * 100) / sumclusters;
		percent_kmem = (sumclusters * 100) / nclusters;

		/*
		 * If a light/normal user, grow conservatively (75%)
		 * If a heavy user, grow aggressively (50%)
		 */
		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
			mb_growth = MB_GROWTH_NORMAL;
		} else {
			mb_growth = MB_GROWTH_AGGRESSIVE;
		}

		if (percent_kmem < 5) {
			/* For initial allocations */
			i = num;
		} else {
			/* Return if >= MBIGCL_LOWAT clusters available */
			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
			    m_total(MC_BIGCL) >=
			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
				return 0;
			}

			/* Ensure at least num clusters are accessible */
			if (num >= m_infree(MC_BIGCL)) {
				i = num - m_infree(MC_BIGCL);
			}
			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
				j = num - (m_total(MC_BIGCL) -
				    m_minlimit(MC_BIGCL));
			}

			/* 'i' is the larger of the two shortfalls. */
			i = MAX(i, j);

			/*
			 * Grow pool if percent_pool > 75 (normal growth)
			 * or percent_pool > 50 (aggressive growth).
			 */
			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
			if (percent_pool > mb_growth_thresh) {
				j = ((sumclusters + num) >> mb_growth) -
				    freeclusters;
			}
			i = MAX(i, j);
		}

		/* Check to ensure we didn't go over limits */
		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
		}
		if ((i << 1) + sumclusters >= nclusters) {
			i = (nclusters - sumclusters) >> 1;
		}
		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
		VERIFY(sumclusters + (i << 1) <= nclusters);
	} else { /* 16K CL */
		VERIFY(njcl > 0);
		/* Ensure at least num clusters are available */
		if (num >= m_16kclfree) {
			i = num - m_16kclfree;
		}

		/* Always grow 16KCL pool aggressively */
		if (((m_16kclusters + num) >> 1) > m_16kclfree) {
			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
		}
		i = MAX(i, j);

		/* Check to ensure we don't go over limit */
		if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
			i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
		}
	}
	return i;
}
7121 #endif /* CONFIG_MBUF_MCACHE */
7122 /*
7123 * Return the number of bytes in the mbuf chain, m.
7124 */
7125 unsigned int
7126 m_length(struct mbuf *m)
7127 {
7128 struct mbuf *m0;
7129 unsigned int pktlen;
7130
7131 if (m->m_flags & M_PKTHDR) {
7132 return m->m_pkthdr.len;
7133 }
7134
7135 pktlen = 0;
7136 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
7137 pktlen += m0->m_len;
7138 }
7139 return pktlen;
7140 }
7141
7142 /*
7143 * Copy data from a buffer back into the indicated mbuf chain,
7144 * starting "off" bytes from the beginning, extending the mbuf
7145 * chain if necessary.
7146 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
#if DEBUG
	struct mbuf *origm = m0;
	int error;
#endif /* DEBUG */

	if (m0 == NULL) {
		return;
	}

	/*
	 * On DEBUG kernels, capture the return value so we can assert
	 * below that the copy succeeded and the chain head was not
	 * replaced; on release kernels the result is discarded.
	 */
#if DEBUG
	error =
#endif /* DEBUG */
	m_copyback0(&m0, off, len, cp,
	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

#if DEBUG
	if (error != 0 || (m0 != NULL && origm != m0)) {
		panic("m_copyback");
	}
#endif /* DEBUG */
}
7171
7172 struct mbuf *
7173 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
7174 {
7175 int error;
7176
7177 /* don't support chain expansion */
7178 VERIFY(off + len <= m_length(m0));
7179
7180 error = m_copyback0(&m0, off, len, cp,
7181 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
7182 if (error) {
7183 /*
7184 * no way to recover from partial success.
7185 * just free the chain.
7186 */
7187 m_freem(m0);
7188 return NULL;
7189 }
7190 return m0;
7191 }
7192
7193 /*
7194 * m_makewritable: ensure the specified range writable.
7195 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
	int error;
#if DEBUG
	struct mbuf *n;
	int origlen, reslen;

	origlen = m_length(*mp);
#endif /* DEBUG */

#if 0 /* M_COPYALL is large enough */
	if (len == M_COPYALL) {
		len = m_length(*mp) - off; /* XXX */
	}
#endif

	/*
	 * Copy-on-write without supplying new data: PRESERVE keeps the
	 * existing bytes while shared storage is replaced with private
	 * mbufs.
	 */
	error = m_copyback0(mp, off, len, NULL,
	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DEBUG
	/* The operation must not change the chain's total length. */
	reslen = 0;
	for (n = *mp; n; n = n->m_next) {
		reslen += n->m_len;
	}
	if (origlen != reslen) {
		panic("m_makewritable: length changed");
	}
	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
		panic("m_makewritable: inconsist");
	}
#endif /* DEBUG */

	return error;
}
7231
/*
 * Engine behind m_copyback/m_copyback_cow/m_makewritable.  Depending on
 * 'flags': COPYBACK writes bytes from 'vp' into the chain at 'off';
 * EXTEND grows the chain to reach off+len; COW replaces shared (cluster-
 * referenced) storage with private mbufs; PRESERVE keeps existing data
 * when rewriting storage (vp == NULL in that case).
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
	int mlen;
	struct mbuf *m, *n;
	struct mbuf **mp;
	int totlen = 0;
	const char *cp = vp;

	VERIFY(mp0 != NULL);
	VERIFY(*mp0 != NULL);
	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

	/*
	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
	 */

	VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);

	/* Phase 1: advance to the mbuf holding 'off', extending if allowed. */
	mp = mp0;
	m = *mp;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			int tspace;
extend:
			if (!(flags & M_COPYBACK0_EXTEND)) {
				goto out;
			}

			/*
			 * try to make some space at the end of "m".
			 */

			mlen = m->m_len;
			if (off + len >= MINCLSIZE &&
			    !(m->m_flags & M_EXT) && m->m_len == 0) {
				MCLGET(m, how);
			}
			tspace = M_TRAILINGSPACE(m);
			if (tspace > 0) {
				tspace = MIN(tspace, off + len);
				VERIFY(tspace > 0);
				/* Zero-fill the gap being skipped over. */
				bzero(mtod(m, char *) + m->m_len,
				    MIN(off, tspace));
				m->m_len += tspace;
				off += mlen;
				totlen -= mlen;
				continue;
			}

			/*
			 * need to allocate an mbuf.
			 */

			if (off + len >= MINCLSIZE) {
				n = m_getcl(how, m->m_type, 0);
			} else {
				n = _M_GET(how, m->m_type);
			}
			if (n == NULL) {
				goto out;
			}
			n->m_len = 0;
			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
			bzero(mtod(n, char *), MIN(n->m_len, off));
			m->m_next = n;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
	/* Phase 2: write/replace 'len' bytes starting at the located spot. */
	while (len > 0) {
		mlen = m->m_len - off;
		if (mlen != 0 && m_mclhasreference(m)) {
			char *datap;
			int eatlen;

			/*
			 * this mbuf is read-only.
			 * allocate a new writable mbuf and try again.
			 */

#if DIAGNOSTIC
			if (!(flags & M_COPYBACK0_COW)) {
				panic("m_copyback0: read-only");
			}
#endif /* DIAGNOSTIC */

			/*
			 * if we're going to write into the middle of
			 * a mbuf, split it first.
			 */
			if (off > 0 && len < mlen) {
				n = m_split0(m, off, how, 0);
				if (n == NULL) {
					goto enobufs;
				}
				m->m_next = n;
				mp = &m->m_next;
				m = n;
				off = 0;
				continue;
			}

			/*
			 * XXX TODO coalesce into the trailingspace of
			 * the previous mbuf when possible.
			 */

			/*
			 * allocate a new mbuf. copy packet header if needed.
			 */
			n = _M_GET(how, m->m_type);
			if (n == NULL) {
				goto enobufs;
			}
			if (off == 0 && (m->m_flags & M_PKTHDR)) {
				M_COPY_PKTHDR(n, m);
				n->m_len = MHLEN;
			} else {
				if (len >= MINCLSIZE) {
					MCLGET(n, M_DONTWAIT);
				}
				n->m_len =
				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
			}
			if (n->m_len > len) {
				n->m_len = len;
			}

			/*
			 * free the region which has been overwritten.
			 * copying data from old mbufs if requested.
			 */
			if (flags & M_COPYBACK0_PRESERVE) {
				datap = mtod(n, char *);
			} else {
				datap = NULL;
			}
			eatlen = n->m_len;
			VERIFY(off == 0 || eatlen >= mlen);
			if (off > 0) {
				VERIFY(len >= mlen);
				m->m_len = off;
				m->m_next = n;
				if (datap) {
					m_copydata(m, off, mlen, datap);
					datap += mlen;
				}
				eatlen -= mlen;
				mp = &m->m_next;
				m = m->m_next;
			}
			/* Consume read-only mbufs covered by the new one. */
			while (m != NULL && m_mclhasreference(m) &&
			    n->m_type == m->m_type && eatlen > 0) {
				mlen = MIN(eatlen, m->m_len);
				if (datap) {
					m_copydata(m, 0, mlen, datap);
					datap += mlen;
				}
				m->m_data += mlen;
				m->m_len -= mlen;
				eatlen -= mlen;
				if (m->m_len == 0) {
					*mp = m = m_free(m);
				}
			}
			if (eatlen > 0) {
				n->m_len -= eatlen;
			}
			n->m_next = m;
			*mp = m = n;
			continue;
		}
		mlen = MIN(mlen, len);
		if (flags & M_COPYBACK0_COPYBACK) {
			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
			cp += mlen;
		}
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0) {
			break;
		}
		if (m->m_next == NULL) {
			goto extend;
		}
		mp = &m->m_next;
		m = m->m_next;
	}
out:
	/* When extending, bring the packet header length up to date. */
	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
		VERIFY(flags & M_COPYBACK0_EXTEND);
		m->m_pkthdr.len = totlen;
	}

	return 0;

enobufs:
	return ENOBUFS;
}
7439
/*
 * Translate an mbuf/cluster virtual address to a 64-bit physical address.
 * Returns 0 when the address is not translatable (mcache variant only).
 */
uint64_t
mcl_to_paddr(char *addr)
{
#if CONFIG_MBUF_MCACHE
	vm_offset_t base_phys;

	/* Only addresses inside the mbuf map have an mcl_paddr[] entry. */
	if (!MBUF_IN_MAP(addr)) {
		return 0;
	}
	/* Index the physical-page table by the page offset from mbutl. */
	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];

	if (base_phys == 0) {
		return 0;
	}
	/* Combine the physical page with the offset within the page. */
	return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
#else
	extern addr64_t kvtophys(vm_offset_t va);

	/* Zone-backed mbufs: defer to the pmap layer for the translation. */
	return kvtophys((vm_offset_t)addr);
#endif /* CONFIG_MBUF_MCACHE */
}
7461
7462 /*
7463 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
7464 * And really copy the thing. That way, we don't "precompute" checksums
7465 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
7466 * small packets, don't dup into a cluster. That way received packets
7467 * don't take up too much room in the sockbuf (cf. sbspace()).
7468 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf *n, **np;
	struct mbuf *top;
	int copyhdr = 0;

	np = &top;
	top = NULL;
	/* Remember whether the first mbuf carries a packet header to copy. */
	if (m->m_flags & M_PKTHDR) {
		copyhdr = 1;
	}

	/*
	 * Quick check: if we have one mbuf and its data fits in an
	 * mbuf with packet header, just copy and go.
	 */
	if (m->m_next == NULL) {
		/* Then just move the data into an mbuf and be done... */
		if (copyhdr) {
			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
				if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
					return NULL;
				}
				n->m_len = m->m_len;
				/* Copy header metadata (tags, csum state, ...) */
				m_dup_pkthdr(n, m, how);
				bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), m->m_len);
				return n;
			}
		} else if (m->m_len <= MLEN) {
			if ((n = _M_GET(how, m->m_type)) == NULL) {
				return NULL;
			}
			bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), m->m_len);
			n->m_len = m->m_len;
			return n;
		}
	}
	/* Walk the chain, allocating a matching mbuf/cluster per link. */
	while (m != NULL) {
#if BLUE_DEBUG
		printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
		    m->m_data);
#endif
		if (copyhdr) {
			n = _M_GETHDR(how, m->m_type);
		} else {
			n = _M_GET(how, m->m_type);
		}
		if (n == NULL) {
			goto nospace;
		}
		if (m->m_flags & M_EXT) {
			/*
			 * Attach the smallest cluster that can hold this
			 * mbuf's data (2K, 4K, or 16K when jumbo clusters
			 * are configured, i.e. njcl > 0).
			 */
			if (m->m_len <= m_maxsize(MC_CL)) {
				MCLGET(n, how);
			} else if (m->m_len <= m_maxsize(MC_BIGCL)) {
				n = m_mbigget(n, how);
			} else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
				n = m_m16kget(n, how);
			}
			/* Cluster attach may have failed; bail if so. */
			if (!(n->m_flags & M_EXT)) {
				(void) m_free(n);
				goto nospace;
			}
		} else {
			VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
			    (copyhdr == 0 && m->m_len <= MLEN));
		}
		*np = n;
		if (copyhdr) {
			/* Don't use M_COPY_PKTHDR: preserve m_data */
			m_dup_pkthdr(n, m, how);
			/* Only the first mbuf of the chain has a pkthdr. */
			copyhdr = 0;
			if (!(n->m_flags & M_EXT)) {
				n->m_data = (uintptr_t)n->m_pktdat;
			}
		}
		n->m_len = m->m_len;
		/*
		 * Get the dup on the same bdry as the original
		 * Assume that the two mbufs have the same offset to data area
		 * (up to word boundaries)
		 */
		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
		m = m->m_next;
		np = &n->m_next;
#if BLUE_DEBUG
		printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
		    n->m_data);
#endif
	}

	return top;

nospace:
	m_freem(top);
	return NULL;
}
7566
/*
 * True when an external-buffer mbuf's data span crosses a page boundary:
 * either it starts page-aligned and is longer than one page, or it starts
 * mid-page and extends past the next page boundary.
 */
#define MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)				\
	&& (m)->m_len > PAGE_SIZE) ||					\
	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&			\
	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
7573
/*
 * Split a single mbuf whose external data spans multiple pages into a
 * chain of mbufs, each covering at most one page of the original buffer.
 * All resulting mbufs share the same external buffer (reference count is
 * bumped per extra segment).  Returns the head of the new chain and
 * stores its tail in *last; on allocation failure the partial chain is
 * freed and NULL is returned (with *last set to NULL).
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	uintptr_t data0, data;
	unsigned int len0, len;

	VERIFY(MBUF_MULTIPAGES(m));
	VERIFY(m->m_next == NULL);
	data0 = (uintptr_t)m->m_data;
	len0 = m->m_len;
	*last = top;

	for (;;) {
		struct mbuf *n;

		data = data0;
		/* Clamp this segment so it does not cross a page boundary. */
		if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
			len = PAGE_SIZE;
		} else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
		    P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
			len = P2ROUNDUP(data, PAGE_SIZE) - data;
		} else {
			/* Remainder fits within the current page. */
			len = len0;
		}

		VERIFY(len > 0);
		VERIFY(m->m_flags & M_EXT);
		m->m_data = data;
		m->m_len = len;

		/* Append this segment and remember it as the current tail. */
		*nm = *last = m;
		nm = &m->m_next;
		m->m_next = NULL;

		data0 += len;
		len0 -= len;
		if (len0 == 0) {
			break;
		}

		n = _M_RETRY(M_DONTWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(top);
			top = *last = NULL;
			break;
		}

		/* Share the original external buffer with the new mbuf. */
		n->m_ext = m->m_ext;
		m_incref(m);
		n->m_flags |= M_EXT;
		m = n;
	}
	return top;
}
7630
/*
 * Normalize an mbuf chain so that no mbuf's data crosses a page boundary,
 * expanding multi-page mbufs via m_expand().  Returns the (possibly new)
 * chain head, or NULL if an allocation failed (the chain is then freed).
 */
struct mbuf *
m_normalize(struct mbuf *m)
{
	struct mbuf *top = NULL;
	struct mbuf **nm = &top;
	boolean_t expanded = FALSE;

	while (m != NULL) {
		struct mbuf *n;

		/* Detach the current mbuf before (possibly) expanding it. */
		n = m->m_next;
		m->m_next = NULL;

		/* Does the data cross one or more page boundaries? */
		if (MBUF_MULTIPAGES(m)) {
			struct mbuf *last;
			if ((m = m_expand(m, &last)) == NULL) {
				/* Allocation failed: free everything. */
				m_freem(n);
				m_freem(top);
				top = NULL;
				break;
			}
			*nm = m;
			nm = &last->m_next;
			expanded = TRUE;
		} else {
			*nm = m;
			nm = &m->m_next;
		}
		m = n;
	}
	if (expanded) {
		os_atomic_inc(&mb_normalized, relaxed);
	}
	return top;
}
7667
7668 /*
7669 * Append the specified data to the indicated mbuf chain,
7670 * Extend the mbuf chain if the new data does not fit in
7671 * existing space.
7672 *
7673 * Return 1 if able to complete the job; otherwise 0.
7674 */
7675 int
7676 m_append(struct mbuf *m0, int len, caddr_t cp)
7677 {
7678 struct mbuf *m, *n;
7679 int remainder, space;
7680
7681 for (m = m0; m->m_next != NULL; m = m->m_next) {
7682 ;
7683 }
7684 remainder = len;
7685 space = M_TRAILINGSPACE(m);
7686 if (space > 0) {
7687 /*
7688 * Copy into available space.
7689 */
7690 if (space > remainder) {
7691 space = remainder;
7692 }
7693 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
7694 m->m_len += space;
7695 cp += space;
7696 remainder -= space;
7697 }
7698 while (remainder > 0) {
7699 /*
7700 * Allocate a new mbuf; could check space
7701 * and allocate a cluster instead.
7702 */
7703 n = m_get(M_WAITOK, m->m_type);
7704 if (n == NULL) {
7705 break;
7706 }
7707 n->m_len = min(MLEN, remainder);
7708 bcopy(cp, mtod(n, caddr_t), n->m_len);
7709 cp += n->m_len;
7710 remainder -= n->m_len;
7711 m->m_next = n;
7712 m = n;
7713 }
7714 if (m0->m_flags & M_PKTHDR) {
7715 m0->m_pkthdr.len += len - remainder;
7716 }
7717 return remainder == 0;
7718 }
7719
7720 struct mbuf *
7721 m_last(struct mbuf *m)
7722 {
7723 while (m->m_next != NULL) {
7724 m = m->m_next;
7725 }
7726 return m;
7727 }
7728
7729 unsigned int
7730 m_fixhdr(struct mbuf *m0)
7731 {
7732 u_int len;
7733
7734 VERIFY(m0->m_flags & M_PKTHDR);
7735
7736 len = m_length2(m0, NULL);
7737 m0->m_pkthdr.len = len;
7738 return len;
7739 }
7740
7741 unsigned int
7742 m_length2(struct mbuf *m0, struct mbuf **last)
7743 {
7744 struct mbuf *m;
7745 u_int len;
7746
7747 len = 0;
7748 for (m = m0; m != NULL; m = m->m_next) {
7749 len += m->m_len;
7750 if (m->m_next == NULL) {
7751 break;
7752 }
7753 }
7754 if (last != NULL) {
7755 *last = m;
7756 }
7757 return len;
7758 }
7759
7760 /*
7761 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
7762 * and clusters. If allocation fails and this cannot be completed, NULL will
7763 * be returned, but the passed in chain will be unchanged. Upon success,
7764 * the original chain will be freed, and the new chain will be returned.
7765 *
7766 * If a non-packet header is passed in, the original mbuf (chain?) will
7767 * be returned unharmed.
7768 *
7769 * If offset is specfied, the first mbuf in the chain will have a leading
7770 * space of the amount stated by the "off" parameter.
7771 *
7772 * This routine requires that the m_pkthdr.header field of the original
7773 * mbuf chain is cleared by the caller.
7774 */
struct mbuf *
m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length, pktlen;

	/* Non-packet-header chains are returned untouched. */
	if (!(m0->m_flags & M_PKTHDR)) {
		return m0;
	}

	VERIFY(off < MHLEN);
	m_fixhdr(m0); /* Needed sanity check */

	/* Total bytes the first mbuf must cover, including the lead gap. */
	pktlen = m0->m_pkthdr.len + off;
	if (pktlen > MHLEN) {
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	} else {
		m_final = m_gethdr(how, MT_DATA);
	}

	if (m_final == NULL) {
		goto nospace;
	}

	if (off > 0) {
		/* Reserve the requested leading space in the first mbuf. */
		pktlen -= off;
		m_final->m_data += off;
	}

	/*
	 * Caller must have handled the contents pointed to by this
	 * pointer before coming here, as otherwise it will point to
	 * the original mbuf which will get freed upon success.
	 */
	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);

	if (m_dup_pkthdr(m_final, m0, how) == 0) {
		goto nospace;
	}

	m_new = m_final;

	/* Copy the payload in at-most-MCLBYTES chunks into the new chain. */
	while (progress < pktlen) {
		length = pktlen - progress;
		if (length > MCLBYTES) {
			length = MCLBYTES;
		}
		/* The first mbuf's capacity is reduced by the offset. */
		length -= ((m_new == m_final) ? off : 0);
		if (length < 0) {
			goto nospace;
		}

		if (m_new == NULL) {
			/* Cluster for large chunks, plain mbuf otherwise. */
			if (length > MLEN) {
				m_new = m_getcl(how, MT_DATA, 0);
			} else {
				m_new = m_get(how, MT_DATA);
			}
			if (m_new == NULL) {
				goto nospace;
			}
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final) {
			m_cat(m_final, m_new);
		}
		m_new = NULL;
	}
	/* Success: release the original chain, hand back the new one. */
	m_freem(m0);
	m0 = m_final;
	return m0;
nospace:
	if (m_final) {
		m_freem(m_final);
	}
	return NULL;
}
7855
/*
 * Defragment an mbuf chain with no leading offset; see m_defrag_offset().
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	return m_defrag_offset(m0, 0, how);
}
7861
7862 void
7863 m_mchtype(struct mbuf *m, int t)
7864 {
7865 mtype_stat_inc(t);
7866 mtype_stat_dec(m->m_type);
7867 (m)->m_type = t;
7868 }
7869
/*
 * Return a pointer to the mbuf's current data area (unbounded variant).
 */
void *__unsafe_indexable
m_mtod(struct mbuf *m)
{
	return m_mtod_current(m);
}
7875
/*
 * Run the _MCHECK() sanity check on an mbuf.
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
}
7881
7882 /*
7883 * Return a pointer to mbuf/offset of location in mbuf chain.
7884 */
7885 struct mbuf *
7886 m_getptr(struct mbuf *m, int loc, int *off)
7887 {
7888 while (loc >= 0) {
7889 /* Normal end of search. */
7890 if (m->m_len > loc) {
7891 *off = loc;
7892 return m;
7893 } else {
7894 loc -= m->m_len;
7895 if (m->m_next == NULL) {
7896 if (loc == 0) {
7897 /* Point at the end of valid data. */
7898 *off = m->m_len;
7899 return m;
7900 }
7901 return NULL;
7902 }
7903 m = m->m_next;
7904 }
7905 }
7906 return NULL;
7907 }
7908
7909 #if CONFIG_MBUF_MCACHE
7910 /*
7911 * Inform the corresponding mcache(s) that there's a waiter below.
7912 */
7913 static void
7914 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
7915 {
7916 mcache_waiter_inc(m_cache(class));
7917 if (comp) {
7918 if (class == MC_CL) {
7919 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7920 } else if (class == MC_BIGCL) {
7921 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7922 } else if (class == MC_16KCL) {
7923 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
7924 } else {
7925 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7926 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7927 }
7928 }
7929 }
7930
7931 /*
7932 * Inform the corresponding mcache(s) that there's no more waiter below.
7933 */
7934 static void
7935 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
7936 {
7937 mcache_waiter_dec(m_cache(class));
7938 if (comp) {
7939 if (class == MC_CL) {
7940 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7941 } else if (class == MC_BIGCL) {
7942 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7943 } else if (class == MC_16KCL) {
7944 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
7945 } else {
7946 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7947 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7948 }
7949 }
7950 }
7951
7952 static bool mbuf_watchdog_defunct_active = false;
7953
7954 #endif /* CONFIG_MBUF_MCACHE */
7955
7956 static uint32_t
7957 mbuf_watchdog_socket_space(struct socket *so)
7958 {
7959 uint32_t space = 0;
7960
7961 if (so == NULL) {
7962 return 0;
7963 }
7964
7965 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
7966
7967 #if INET
7968 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
7969 SOCK_PROTO(so) == IPPROTO_TCP) {
7970 space += tcp_reass_qlen_space(so);
7971 }
7972 #endif /* INET */
7973
7974 return space;
7975 }
7976
/*
 * Accumulator shared across the proc iteration that looks for the
 * process holding the most mbuf memory in its sockets.
 */
struct mbuf_watchdog_defunct_args {
	struct proc *top_app;		/* worst offender so far (claimed ref) */
	uint32_t top_app_space_used;	/* its socket mbuf space, in bytes */
	bool non_blocking;		/* skip procs whose fd lock is busy */
};
7982
/*
 * Attempt to take a process' file-descriptor table lock without
 * blocking; returns true if the lock was acquired.
 */
static bool
proc_fd_trylock(proc_t p)
{
	return lck_mtx_try_lock(&p->p_fd.fd_lock);
}
7988
/*
 * proc_iterate() callback: sum the mbuf space consumed by all sockets
 * open in process "p" and track the largest consumer in the shared args.
 * Returns PROC_CLAIMED to retain a reference on a new top offender
 * (releasing the previous one), PROC_RETURNED otherwise.
 */
static int
mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
{
	struct fileproc *fp = NULL;
	struct mbuf_watchdog_defunct_args *args =
	    (struct mbuf_watchdog_defunct_args *)arg;
	uint32_t space_used = 0;

	/*
	 * Non-blocking is only used when dumping the mbuf usage from the watchdog
	 */
	if (args->non_blocking) {
		if (!proc_fd_trylock(p)) {
			return PROC_RETURNED;
		}
	} else {
		proc_fdlock(p);
	}
	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;
		struct socket *so = NULL;

		/* Only sockets contribute to the accounting. */
		if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
			continue;
		}
		so = fg_get_data(fg);
		/*
		 * We calculate the space without the socket
		 * lock because we don't want to be blocked
		 * by another process that called send() and
		 * is stuck waiting for mbufs.
		 *
		 * These variables are 32-bit so we don't have
		 * to worry about incomplete reads.
		 */
		space_used += mbuf_watchdog_socket_space(so);
	}
	proc_fdunlock(p);
	if (space_used > args->top_app_space_used) {
		/* New worst offender: swap the claimed reference. */
		if (args->top_app != NULL) {
			proc_rele(args->top_app);
		}
		args->top_app = p;
		args->top_app_space_used = space_used;

		return PROC_CLAIMED;
	} else {
		return PROC_RETURNED;
	}
}
8039
8040 extern char *proc_name_address(void *p);
8041
/*
 * Thread-call handler invoked when mbufs are running out: find the
 * process whose sockets consume the most mbuf space and defunct all of
 * its sockets, then (zone variant) drain the mbuf caches and zones.
 */
static void
mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg0, arg1)
	struct mbuf_watchdog_defunct_args args = {};
	struct fileproc *fp = NULL;

	args.non_blocking = false;
	/* Find the process with the highest socket mbuf usage. */
	proc_iterate(PROC_ALLPROCLIST,
	    mbuf_watchdog_defunct_iterate, &args, NULL, NULL);

	/*
	 * Defunct all sockets from this app.
	 */
	if (args.top_app != NULL) {
#if CONFIG_MBUF_MCACHE
		/* Restart the watchdog count. */
		lck_mtx_lock(mbuf_mlock);
		microuptime(&mb_wdtstart);
		lck_mtx_unlock(mbuf_mlock);
#endif
		os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
		    __func__,
		    proc_name_address(args.top_app),
		    proc_pid(args.top_app));
		proc_fdlock(args.top_app);
		fdt_foreach(fp, args.top_app) {
			struct fileglob *fg = fp->fp_glob;
			struct socket *so = NULL;

			if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
				continue;
			}
			so = (struct socket *)fp_get_data(fp);
			/* Skip sockets we cannot lock without blocking. */
			if (!socket_try_lock(so)) {
				continue;
			}
			if (sosetdefunct(args.top_app, so,
			    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
			    TRUE) == 0) {
				sodefunct(args.top_app, so,
				    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
			}
			socket_unlock(so, 0);
		}
		proc_fdunlock(args.top_app);
		/* Drop the reference claimed by the iterator. */
		proc_rele(args.top_app);
		mbstat.m_forcedefunct++;
#if !CONFIG_MBUF_MCACHE
		/* Return freed composite caches and zones to the VM. */
		zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
		zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
		zone_drain(zone_by_id(ZONE_ID_MBUF));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
		zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
		zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
#endif
	}
#if CONFIG_MBUF_MCACHE
	mbuf_watchdog_defunct_active = false;
#endif
}
8105
8106 #if !CONFIG_MBUF_MCACHE
8107 static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
8108 static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
8109 static uint32_t mbuf_exhausted_mask;
8110
8111 #define MBUF_EXHAUSTED_DRAIN_MASK (\
8112 (1u << MC_MBUF) | \
8113 (1u << MC_CL) | \
8114 (1u << MC_BIGCL) | \
8115 (1u << MC_16KCL))
8116
8117 #define MBUF_EXHAUSTED_DEFUNCT_MASK (\
8118 (1u << MC_MBUF) | \
8119 (1u << MC_MBUF_CL) | \
8120 (1u << MC_MBUF_BIGCL) | \
8121 (1u << MC_MBUF_16KCL))
8122
8123 static void
8124 mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
8125 {
8126 #pragma unused(arg0, arg1)
8127 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
8128 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
8129 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
8130 }
8131
/*
 * Record that the mbuf class in "bit" has exhausted its zone.  When this
 * is the first exhausted class of a remediation group, arm the matching
 * delayed thread call: the composite-cache drain at 1/10th of the
 * watchdog interval, socket defuncting at half of it.
 * Called with mbuf_exhausted_lock held.
 */
static void
mbuf_zone_exhausted_start(uint32_t bit)
{
	uint64_t deadline;
	uint32_t mask;

	mask = mbuf_exhausted_mask;
	mbuf_exhausted_mask = mask | bit;

	if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_drain_tcall, deadline);
	}

	if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
	    (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
		clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
		    NSEC_PER_MSEC, &deadline);
		thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
	}
}
8155
8156 static void
8157 mbuf_zone_exhausted_end(uint32_t bit)
8158 {
8159 uint32_t mask;
8160
8161 mask = (mbuf_exhausted_mask &= ~bit);
8162
8163 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8164 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8165 thread_call_cancel(mbuf_drain_tcall);
8166 }
8167
8168 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8169 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8170 thread_call_cancel(mbuf_defunct_tcall);
8171 }
8172 }
8173
/*
 * ZONE_EXHAUSTED event handler: translate the zone id into an mbuf class
 * bit and update the exhaustion bookkeeping under mbuf_exhausted_lock.
 */
static void
mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
{
	uint32_t bit;

	/* Ignore zones that do not back an mbuf class. */
	if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
	    zid > m_class_to_zid(MBUF_CLASS_MAX)) {
		return;
	}

	bit = 1u << m_class_from_zid(zid);

	lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);

	if (exhausted) {
		mbuf_zone_exhausted_start(bit);
	} else {
		mbuf_zone_exhausted_end(bit);
	}

	lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
}
8196 EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
8197 #endif /* !CONFIG_MBUF_MCACHE */
8198
8199 #if CONFIG_MBUF_MCACHE
8200 /*
8201 * Called during slab (blocking and non-blocking) allocation. If there
8202 * is at least one waiter, and the time since the first waiter is blocked
8203 * is greater than the watchdog timeout, panic the system.
8204 */
static void
mbuf_watchdog(void)
{
	struct timeval now;
	unsigned int since;
	static thread_call_t defunct_tcall = NULL;

	/* Nothing to do unless someone is blocked and the watchdog is on. */
	if (mb_waiters == 0 || !mb_watchdog) {
		return;
	}

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Seconds elapsed since the first waiter armed the watchdog. */
	microuptime(&now);
	since = now.tv_sec - mb_wdtstart.tv_sec;

	if (mbuf_watchdog_defunct_active) {
		/*
		 * Don't panic the system while we are trying
		 * to find sockets to defunct.
		 */
		return;
	}
	if (since >= MB_WDT_MAXTIME) {
		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
		    mb_waiters, since, mbuf_dump());
		/* NOTREACHED */
	}
	/*
	 * Check if we are about to panic the system due
	 * to lack of mbufs and start defuncting sockets
	 * from processes that use too many sockets.
	 *
	 * We're always called with the mbuf_mlock held,
	 * so that also protects mbuf_watchdog_defunct_active.
	 */
	if (since >= MB_WDT_MAXTIME / 2) {
		/*
		 * Start a thread to defunct sockets
		 * from apps that are over-using their socket
		 * buffers.
		 */
		if (defunct_tcall == NULL) {
			defunct_tcall =
			    thread_call_allocate_with_options(mbuf_watchdog_defunct,
			    NULL,
			    THREAD_CALL_PRIORITY_KERNEL,
			    THREAD_CALL_OPTIONS_ONCE);
		}
		if (defunct_tcall != NULL) {
			mbuf_watchdog_defunct_active = true;
			thread_call_enter(defunct_tcall);
		}
	}
}
8260
8261 /*
8262 * Called during blocking allocation. Returns TRUE if one or more objects
8263 * are available at the per-CPU caches layer and that allocation should be
8264 * retried at that level.
8265 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
	boolean_t mcache_retry = FALSE;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Check if there's anything at the cache layer */
	if (mbuf_cached_above(class, wait)) {
		mcache_retry = TRUE;
		goto done;
	}

	/* Nothing? Then try hard to get it from somewhere */
	m_reclaim(class, num, (wait & MCR_COMP));

	/* We tried hard and got something? */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
		goto done;
	} else if (wait & MCR_TRYHARD) {
		/* Caller wants to retry the cache layer regardless. */
		mcache_retry = TRUE;
		goto done;
	}

	/*
	 * There's really nothing for us right now; inform the
	 * cache(s) that there is a waiter below and go to sleep.
	 */
	mbuf_waiter_inc(class, (wait & MCR_COMP));

	/* Sleeping is not an option for non-blocking callers. */
	VERIFY(!(wait & MCR_NOSLEEP));

	/*
	 * If this is the first waiter, arm the watchdog timer. Otherwise
	 * check if we need to panic the system due to watchdog timeout.
	 */
	if (mb_waiters == 0) {
		microuptime(&mb_wdtstart);
	} else {
		mbuf_watchdog();
	}

	mb_waiters++;
	/* Ask the worker thread to grow this class. */
	m_region_expand(class) += m_total(class) + num;
	/* wake up the worker thread */
	if (mbuf_worker_ready &&
	    mbuf_worker_needs_wakeup) {
		wakeup((caddr_t)&mbuf_worker_needs_wakeup);
		mbuf_worker_needs_wakeup = FALSE;
	}
	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

	/* We are now up; stop getting notified until next round */
	mbuf_waiter_dec(class, (wait & MCR_COMP));

	/* We waited and got something */
	if (m_infree(class) > 0) {
		mbstat.m_wait++;
		goto done;
	} else if (mbuf_cached_above(class, wait)) {
		mbstat.m_wait++;
		mcache_retry = TRUE;
	}
done:
	return mcache_retry;
}
8339
/*
 * Dedicated kernel thread that grows (and, under pressure, reclaims) the
 * mbuf freelists.  It sleeps on mbuf_worker_needs_wakeup and is kicked by
 * mbuf_sleep() whenever a class has a pending expansion request recorded
 * in m_region_expand().
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
	int mbuf_expand;

	while (1) {
		lck_mtx_lock(mbuf_mlock);
		mbwdog_logger("worker thread running");
		mbuf_worker_run_cnt++;
		mbuf_expand = 0;
		/*
		 * Allocations are based on page size, so if we have depleted
		 * the reserved spaces, try to free mbufs from the major classes.
		 */
#if PAGE_SIZE == 4096
		uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
		uint32_t m_clusters = m_total(MC_CL);
		uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
		uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
		if (sumclusters >= nclusters) {
			mbwdog_logger("reclaiming bigcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_BIGCL, 4, FALSE);
		}
#else
		uint32_t m_16kclusters = m_total(MC_16KCL);
		if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
			mbwdog_logger("reclaiming 16kcl");
			mbuf_drain_locked(TRUE);
			m_reclaim(MC_16KCL, 4, FALSE);
		}
#endif
		/* Handle a pending expansion request for 2K clusters. */
		if (m_region_expand(MC_CL) > 0) {
			int n;
			mb_expand_cl_cnt++;
			/* Adjust to current number of cluster in use */
			n = m_region_expand(MC_CL) -
			    (m_total(MC_CL) - m_infree(MC_CL));
			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
				n = m_maxlimit(MC_CL) - m_total(MC_CL);
			}
			if (n > 0) {
				mb_expand_cl_total += n;
			}
			m_region_expand(MC_CL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_CL by %d", n);
				freelist_populate(MC_CL, n, M_WAIT);
			}
		}
		/* Handle a pending expansion request for 4K clusters. */
		if (m_region_expand(MC_BIGCL) > 0) {
			int n;
			mb_expand_bigcl_cnt++;
			/* Adjust to current number of 4 KB cluster in use */
			n = m_region_expand(MC_BIGCL) -
			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
			}
			if (n > 0) {
				mb_expand_bigcl_total += n;
			}
			m_region_expand(MC_BIGCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_BIGCL by %d", n);
				freelist_populate(MC_BIGCL, n, M_WAIT);
			}
		}
		/* Handle a pending expansion request for 16K clusters. */
		if (m_region_expand(MC_16KCL) > 0) {
			int n;
			mb_expand_16kcl_cnt++;
			/* Adjust to current number of 16 KB cluster in use */
			n = m_region_expand(MC_16KCL) -
			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
			}
			if (n > 0) {
				mb_expand_16kcl_total += n;
			}
			m_region_expand(MC_16KCL) = 0;

			if (n > 0) {
				mbwdog_logger("expanding MC_16KCL by %d", n);
				(void) freelist_populate(MC_16KCL, n, M_WAIT);
			}
		}

		/*
		 * Because we can run out of memory before filling the mbuf
		 * map, we should not allocate more clusters than they are
		 * mbufs -- otherwise we could have a large number of useless
		 * clusters allocated.
		 */
		mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
		    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
		    m_total(MC_16KCL));
		uint32_t total_mbufs = m_total(MC_MBUF);
		uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
		    m_total(MC_16KCL);
		if (total_mbufs < total_clusters) {
			mbwdog_logger("expanding MC_MBUF by %d",
			    total_clusters - total_mbufs);
		}
		/* Keep growing mbufs until they catch up with the clusters. */
		while (total_mbufs < total_clusters) {
			mb_expand_cnt++;
			if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
				break;
			}
			total_mbufs = m_total(MC_MBUF);
			total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
			    m_total(MC_16KCL);
		}

		mbuf_worker_needs_wakeup = TRUE;
		/*
		 * If there's a deadlock and we're not sending / receiving
		 * packets, net_uptime() won't be updated. Update it here
		 * so we are sure it's correct.
		 */
		net_update_uptime();
		mbuf_worker_last_runtime = net_uptime();
		assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
		    THREAD_UNINT);
		mbwdog_logger("worker thread sleeping");
		lck_mtx_unlock(mbuf_mlock);
		(void) thread_block((thread_continue_t)mbuf_worker_thread);
	}
}
8472
/*
 * Entry point for the mbuf worker thread: mark the worker as ready,
 * then run its main loop forever.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread_init(void)
{
	mbuf_worker_ready++;
	mbuf_worker_thread();
}
8480
/*
 * Return the slab metadata entry for buffer "buf", allocating and
 * chaining a new slab group on first reference to the buffer's region.
 * Called with mbuf_mlock held; may drop and retake the lock while
 * expanding the slabs table (serialized through mb_clalloc_busy).
 */
static mcl_slab_t *
slab_get(void *buf)
{
	mcl_slabg_t *slg;
	unsigned int ix, k;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(MBUF_IN_MAP(buf));
	/* Index of the slab group covering this buffer's region. */
	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
	VERIFY(ix < maxslabgrp);

	if ((slg = slabstbl[ix]) == NULL) {
		/*
		 * In the current implementation, we never shrink the slabs
		 * table; if we attempt to reallocate a cluster group when
		 * it's already allocated, panic since this is a sign of a
		 * memory corruption (slabstbl[ix] got nullified).
		 */
		++slabgrp;
		VERIFY(ix < slabgrp);
		/*
		 * Slabs expansion can only be done single threaded; when
		 * we get here, it must be as a result of m_clalloc() which
		 * is serialized and therefore mb_clalloc_busy must be set.
		 */
		VERIFY(mb_clalloc_busy);
		lck_mtx_unlock(mbuf_mlock);

		/* This is a new buffer; create the slabs group for it */
		slg = zalloc_permanent_type(mcl_slabg_t);
		slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
		    ZALIGN(mcl_slab_t));

		lck_mtx_lock(mbuf_mlock);
		/*
		 * No other thread could have gone into m_clalloc() after
		 * we dropped the lock above, so verify that it's true.
		 */
		VERIFY(mb_clalloc_busy);

		slabstbl[ix] = slg;

		/* Chain each slab in the group to its forward neighbor */
		for (k = 1; k < NSLABSPMB; k++) {
			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
		}
		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

		/* And chain the last slab in the previous group to this */
		if (ix > 0) {
			VERIFY(slabstbl[ix - 1]->
			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
			    &slg->slg_slab[0];
		}
	}

	/* Index of the buffer's page within its slab group. */
	ix = MTOPG(buf) % NSLABSPMB;
	VERIFY(ix < NSLABSPMB);

	return &slg->slg_slab[ix];
}
8544
8545 static void
8546 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
8547 void *base, void *head, unsigned int len, int refcnt, int chunks)
8548 {
8549 sp->sl_class = class;
8550 sp->sl_flags = flags;
8551 sp->sl_base = base;
8552 sp->sl_head = head;
8553 sp->sl_len = len;
8554 sp->sl_refcnt = refcnt;
8555 sp->sl_chunks = chunks;
8556 slab_detach(sp);
8557 }
8558
/*
 * Attach a detached slab to the tail of its class's slab list, clearing
 * SLF_DETACHED on it (and, for 16K buffers, on all follow-on slabs that
 * cover the same multi-page buffer).
 */
static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
	VERIFY(slab_is_detached(sp));
	m_slab_cnt(class)++;
	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
	sp->sl_flags &= ~SLF_DETACHED;

	/*
	 * If a buffer spans multiple contiguous pages then mark them as
	 * detached too
	 */
	if (class == MC_16KCL) {
		int k;
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL && slab_is_detached(sp));
			sp->sl_flags &= ~SLF_DETACHED;
		}
	}
}
8581
/*
 * Remove a slab from its class's slab list and mark it detached (along
 * with the follow-on slabs of a multi-page 16K buffer).
 */
static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
	int k;
	VERIFY(!slab_is_detached(sp));
	VERIFY(m_slab_cnt(class) > 0);
	m_slab_cnt(class)--;
	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
	slab_detach(sp);
	if (class == MC_16KCL) {
		for (k = 1; k < NSLABSP16KB; k++) {
			sp = sp->sl_next;
			/* Next slab must already be present */
			VERIFY(sp != NULL);
			VERIFY(!slab_is_detached(sp));
			slab_detach(sp);
		}
	}
}
8601
8602 static boolean_t
8603 slab_inrange(mcl_slab_t *sp, void *buf)
8604 {
8605 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
8606 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
8607 }
8608
8609 #undef panic
8610
/*
 * Error-path diagnostic: scan every chunk in the slab for the chunk
 * whose free-list next pointer equals `addr', and panic with details
 * if that pointer is out of range.  With mclverify enabled, defer to
 * the audit-record based check instead.
 */
static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
	int i;
	/* Chunks are equal-sized; derive the stride from the slab */
	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
	uintptr_t buf = (uintptr_t)sp->sl_base;

	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
		void *next = ((mcache_obj_t *)buf)->obj_next;
		if (next != addr) {
			continue;
		}
		if (!mclverify) {
			/* Only a range check is possible without auditing */
			if (next != NULL && !MBUF_IN_MAP(next)) {
				mcache_t *cp = m_cache(sp->sl_class);
				panic("%s: %s buffer %p in slab %p modified "
				    "after free at offset 0: %p out of range "
				    "[%p-%p)\n", __func__, cp->mc_name,
				    (void *)buf, sp, next, mbutl, embutl);
				/* NOTREACHED */
			}
		} else {
			/* Auditing on: validate against the audit record */
			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
			    (mcache_obj_t *)buf);
			mcl_audit_verify_nextptr(next, mca);
		}
	}
}
8639
8640 static void
8641 slab_detach(mcl_slab_t *sp)
8642 {
8643 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
8644 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
8645 sp->sl_flags |= SLF_DETACHED;
8646 }
8647
8648 static boolean_t
8649 slab_is_detached(mcl_slab_t *sp)
8650 {
8651 return (intptr_t)sp->sl_link.tqe_next == -1 &&
8652 (intptr_t)sp->sl_link.tqe_prev == -1 &&
8653 (sp->sl_flags & SLF_DETACHED);
8654 }
8655
/*
 * Set up audit state for every buffer in the page containing `buf'.
 * Consumes `num' audit records from the front of *mca_list (and, when
 * con_list is non-NULL, `num' contents buffers of `con_size' bytes
 * each from *con_list), wires them into mclaudit[] for the page, and
 * advances the callers' list heads past the consumed elements.
 */
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
	mcache_audit_t *mca, *mca_tail;
	mcache_obj_t *con = NULL;
	boolean_t save_contents = (con_list != NULL);
	unsigned int i, ix;

	ASSERT(num <= NMBPG);
	ASSERT(con_list == NULL || con_size != 0);

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* Make sure we haven't been here before */
	for (i = 0; i < num; i++) {
		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
	}

	mca = mca_tail = *mca_list;
	if (save_contents) {
		con = *con_list;
	}

	for (i = 0; i < num; i++) {
		mcache_audit_t *next;

		/* Reset the record but preserve its place in the chain */
		next = mca->mca_next;
		bzero(mca, sizeof(*mca));
		mca->mca_next = next;
		mclaudit[ix].cl_audit[i] = mca;

		/* Attach the contents buffer if requested */
		if (save_contents) {
			mcl_saved_contents_t *msc =
			    (mcl_saved_contents_t *)(void *)con;

			VERIFY(msc != NULL);
			VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
			VERIFY(con_size == sizeof(*msc));
			mca->mca_contents_size = con_size;
			mca->mca_contents = msc;
			con = con->obj_next;
			bzero(mca->mca_contents, mca->mca_contents_size);
		}

		mca_tail = mca;
		mca = mca->mca_next;
	}

	if (save_contents) {
		*con_list = con;
	}

	/* Detach the consumed records from the caller's list */
	*mca_list = mca_tail->mca_next;
	mca_tail->mca_next = NULL;
}
8714
/*
 * Tear down audit state for the page containing `buf': detach the
 * `num' audit records from mclaudit[], free any attached contents
 * buffers, and return the (still chained) records to the audit cache.
 */
static void
mcl_audit_free(void *buf, unsigned int num)
{
	unsigned int i, ix;
	mcache_audit_t *mca, *mca_list;

	ix = MTOPG(buf);
	VERIFY(ix < maxclaudit);

	/* A NULL first slot means this page was never audited */
	if (mclaudit[ix].cl_audit[0] != NULL) {
		mca_list = mclaudit[ix].cl_audit[0];
		for (i = 0; i < num; i++) {
			mca = mclaudit[ix].cl_audit[i];
			mclaudit[ix].cl_audit[i] = NULL;
			if (mca->mca_contents) {
				mcache_free(mcl_audit_con_cache,
				    mca->mca_contents);
			}
		}
		/* Records remain chained via mca_next; free in one shot */
		mcache_free_ext(mcache_audit_cache,
		    (mcache_obj_t *)mca_list);
	}
}
8738
/*
 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
 * the corresponding audit structure for that buffer.
 *
 * The page index selects the mclaudit[] entry; the buffer's offset
 * within the page (scaled by the class's object size) selects the
 * per-buffer audit slot.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
{
	mcache_audit_t *mca = NULL;
	int ix = MTOPG(mobj), m_idx = 0;
	unsigned char *page_addr;

	VERIFY(ix < maxclaudit);
	/* Buffer must be aligned to its class size (capped at a page) */
	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));

	page_addr = PGTOM(ix);

	switch (class) {
	case MC_MBUF:
		/*
		 * For the mbuf case, find the index of the page
		 * used by the mbuf and use that index to locate the
		 * base address of the page.  Then find out the
		 * mbuf index relative to the page base and use
		 * it to locate the audit structure.
		 */
		m_idx = MBPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NMBPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_CL:
		/*
		 * Same thing as above, but for 2KB clusters in a page.
		 */
		m_idx = CLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;

	case MC_BIGCL:
		/* Same as above, but for 4KB clusters in a page */
		m_idx = BCLPAGEIDX(page_addr, mobj);
		VERIFY(m_idx < (int)NBCLPG);
		mca = mclaudit[ix].cl_audit[m_idx];
		break;
	case MC_16KCL:
		/*
		 * Same as above, but only return the first element.
		 */
		mca = mclaudit[ix].cl_audit[0];
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	return mca;
}
8797
/*
 * Audit hook for mbuf alloc/free.  On free (alloc == FALSE) the
 * constructed mbuf fields are saved into the audit record and, with
 * mclverify, the mbuf body is filled with the free pattern.  On
 * alloc the pattern is verified and the saved fields are restored
 * (per `composite').
 */
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
	struct mbuf *m = addr;
	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

	VERIFY(mca->mca_contents != NULL &&
	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

	if (mclverify) {
		mcl_audit_verify_nextptr(next, mca);
	}

	if (!alloc) {
		/* Save constructed mbuf fields */
		mcl_audit_save_mbuf(m, mca);
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
			    m_maxsize(MC_MBUF));
		}
		/* Re-plant obj_next, which the pattern fill overwrote */
		((mcache_obj_t *)m)->obj_next = next;
		return;
	}

	/* Check if the buffer has been corrupted while in freelist */
	if (mclverify) {
		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
	}
	/* Restore constructed mbuf fields */
	mcl_audit_restore_mbuf(m, mca, composite);
}
8830
/*
 * Restore an mbuf's constructed fields from the copy saved in its
 * audit record at free time (see mcl_audit_save_mbuf()).
 */
static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);

	if (composite) {
		struct mbuf *next = m->m_next;
		/* Saved copy must describe a composite mbuf with a cluster */
		VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
		    MBUF_IS_COMPOSITE(ms));
		VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
		/*
		 * We could have hand-picked the mbuf fields and restore
		 * them individually, but that will be a maintenance
		 * headache.  Instead, restore everything that was saved;
		 * the mbuf layer will recheck and reinitialize anyway.
		 */
		bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
		m->m_next = next;
	} else {
		/*
		 * For a regular mbuf (no cluster attached) there's nothing
		 * to restore other than the type field, which is expected
		 * to be MT_FREE.
		 */
		m->m_type = ms->m_type;
	}
	_MCHECK(m);
}
8859
/*
 * Snapshot an mbuf's constructed fields into its audit record so
 * they can be restored on the next allocation.
 */
static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
	_MCHECK(m);
	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
}
8867
/*
 * Audit hook for cluster alloc/free, analogous to mcl_audit_mbuf()
 * but with no saved mbuf fields.  On free the cluster is filled with
 * the free pattern (mclverify); `save_next' additionally validates
 * and re-plants the free-list next pointer that the pattern fill
 * just overwrote.  On alloc the pattern and next pointer are checked.
 */
static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

	if (!alloc) {
		if (mclverify) {
			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
		}
		if (save_next) {
			mcl_audit_verify_nextptr(next, mca);
			/* Restore obj_next clobbered by the pattern fill */
			((mcache_obj_t *)addr)->obj_next = next;
		}
	} else if (mclverify) {
		/* Check if the buffer has been corrupted while in freelist */
		mcl_audit_verify_nextptr(next, mca);
		mcache_audit_free_verify_set(mca, addr, 0, size);
	}
}
8888
/*
 * Record the current thread, backtrace and timestamp into the mbuf's
 * scratch audit area, first demoting the existing record into the
 * "previous" (msa_p*) slots so the last two users remain visible.
 */
static void
mcl_audit_scratch(mcache_audit_t *mca)
{
	void *stack[MCACHE_STACK_DEPTH + 1];
	mcl_scratch_audit_t *msa;
	struct timeval now;

	VERIFY(mca->mca_contents != NULL);
	msa = MCA_SAVED_SCRATCH_PTR(mca);

	/* Shift the current record into the previous-record slots */
	msa->msa_pthread = msa->msa_thread;
	msa->msa_thread = current_thread();
	bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
	msa->msa_pdepth = msa->msa_depth;
	bzero(stack, sizeof(stack));
	/* Skip frame 0 (this function) when saving the backtrace */
	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));

	msa->msa_ptstamp = msa->msa_tstamp;
	microuptime(&now);
	/* tstamp is in ms relative to base_ts */
	/*
	 * NOTE(review): when now.tv_usec < mb_start.tv_usec the usec
	 * delta is negative, skewing the ms value slightly — confirm
	 * that msa_tstamp consumers tolerate this.
	 */
	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
	if ((now.tv_sec - mb_start.tv_sec) > 0) {
		msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
	}
}
8915
/*
 * Panic with full audit diagnostics when a freed mbuf's type is not
 * MT_FREE (i.e. the mbuf was modified after free or double-freed).
 */
__abortlike
static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
	char buf[DUMP_MCA_BUF_SIZE];
	mcache_audit_t *mca;

	MRANGE(m);
	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
8930
/*
 * Panic with audit diagnostics when a freed buffer's next pointer is
 * found outside the mbuf map (see mcl_audit_verify_nextptr()).
 */
__abortlike
static void
mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
{
	char buf[DUMP_MCA_BUF_SIZE];
	panic("mcl_audit: buffer %p modified after free at offset 0: "
	    "%p out of range [%p-%p)\n%s\n",
	    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
}
8941
8942 static void
8943 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
8944 {
8945 if (next != NULL && !MBUF_IN_MAP(next) &&
8946 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
8947 mcl_audit_verify_nextptr_panic(next, mca);
8948 }
8949 }
8950
/*
 * Scramble an integer with a fixed sequence of shift/add/xor rounds
 * so nearby inputs land in distant hash buckets.  Separate round
 * constants are used for 32-bit and 64-bit pointer widths.
 */
static uintptr_t
hash_mix(uintptr_t x)
{
#ifndef __LP64__
	x += ~(x << 15);
	x ^= (x >> 10);
	x += (x << 3);
	x ^= (x >> 6);
	x += ~(x << 11);
	x ^= (x >> 16);
#else
	x += ~(x << 32);
	x ^= (x >> 22);
	x += ~(x << 13);
	x ^= (x >> 8);
	x += (x << 3);
	x ^= (x >> 15);
	x += ~(x << 27);
	x ^= (x >> 31);
#endif
	return x;
}
8973
8974 static uint32_t
8975 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
8976 {
8977 uintptr_t hash = 0;
8978 uintptr_t mask = max_size - 1;
8979
8980 while (depth) {
8981 hash += bt[--depth];
8982 }
8983
8984 hash = hash_mix(hash) & mask;
8985
8986 assert(hash < max_size);
8987
8988 return (uint32_t) hash;
8989 }
8990
8991 static uint32_t
8992 hashaddr(uintptr_t pt, uint32_t max_size)
8993 {
8994 uintptr_t hash = 0;
8995 uintptr_t mask = max_size - 1;
8996
8997 hash = hash_mix(pt) & mask;
8998
8999 assert(hash < max_size);
9000
9001 return (uint32_t) hash;
9002 }
9003
/* This function turns on mbuf leak detection */
static void
mleak_activate(void)
{
	/* One out of every mleak_sample_factor allocations is recorded */
	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
	PE_parse_boot_argn("mleak_sample_factor",
	    &mleak_table.mleak_sample_factor,
	    sizeof(mleak_table.mleak_sample_factor));

	/* A zero factor (via boot-arg) disables leak detection entirely */
	if (mleak_table.mleak_sample_factor == 0) {
		mclfindleak = 0;
	}

	if (mclfindleak == 0) {
		return;
	}

	vm_size_t alloc_size =
	    mleak_alloc_buckets * sizeof(struct mallocation);
	vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);

	/* Permanent allocations: the tables live for the rest of boot */
	mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
	mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
	mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
	    ZALIGN(mleak_stat_t));

	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
#ifdef __LP64__
	mleak_stat->ml_isaddr64 = 1;
#endif /* __LP64__ */
}
9035
/*
 * Alloc/free hook for mbuf leak detection.  Frees are always
 * processed; allocations are sampled (one in mleak_sample_factor)
 * to keep overhead low.
 */
static void
mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
{
	int temp;

	if (mclfindleak == 0) {
		return;
	}

	if (!alloc) {
		return mleak_free(addr);
	}

	temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);

	/* Only record every mleak_sample_factor-th allocation */
	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
		uintptr_t bt[MLEAK_STACK_DEPTH];
		unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
		mleak_log(bt, addr, logged, num);
	}
}
9057
/*
 * This function records the allocation in the mleak_allocations table
 * and the backtrace in the mleak_traces table; if allocation slot is in use,
 * replace old allocation with new one if the trace slot is in use, return
 * (or increment refcount if same trace).
 *
 * Returns TRUE when the tables were examined, FALSE when the lock
 * could not be taken (recording is best-effort).
 */
static boolean_t
mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
{
	struct mallocation *allocation;
	struct mtrace *trace;
	uint32_t trace_index;

	/* Quit if someone else modifying the tables */
	if (!lck_mtx_try_lock_spin(mleak_lock)) {
		mleak_table.total_conflicts++;
		return FALSE;
	}

	/* Hash the address and the backtrace into their buckets */
	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)];
	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
	trace = &mleak_traces[trace_index];

	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);

	allocation->hitcount++;
	trace->hitcount++;

	/*
	 * If the allocation bucket we want is occupied
	 * and the occupier has the same trace, just bail.
	 */
	if (allocation->element != NULL &&
	    trace_index == allocation->trace_index) {
		mleak_table.alloc_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	}

	/*
	 * Store the backtrace in the traces array;
	 * Size of zero = trace bucket is free.
	 */
	if (trace->allocs > 0 &&
	    bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
		/* Different, unique trace, but the same hash! Bail out. */
		trace->collisions++;
		mleak_table.trace_collisions++;
		lck_mtx_unlock(mleak_lock);
		return TRUE;
	} else if (trace->allocs > 0) {
		/* Same trace, already added, so increment refcount */
		trace->allocs++;
	} else {
		/* Found an unused trace bucket, so record the trace here */
		if (trace->depth != 0) {
			/* this slot previously used but not currently in use */
			mleak_table.trace_overwrites++;
		}
		mleak_table.trace_recorded++;
		trace->allocs = 1;
		memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
		trace->depth = depth;
		trace->collisions = 0;
	}

	/* Step 2: Store the allocation record in the allocations array */
	if (allocation->element != NULL) {
		/*
		 * Replace an existing allocation.  No need to preserve
		 * because only a subset of the allocations are being
		 * recorded anyway.
		 */
		mleak_table.alloc_collisions++;
	} else if (allocation->trace_index != 0) {
		mleak_table.alloc_overwrites++;
	}
	allocation->element = addr;
	allocation->trace_index = trace_index;
	allocation->count = num;
	mleak_table.alloc_recorded++;
	mleak_table.outstanding_allocs++;

	lck_mtx_unlock(mleak_lock);
	return TRUE;
}
9146
/*
 * Remove each object in the chain from the allocation table and drop
 * the refcount of its trace bucket.  The bucket is tested unlocked
 * first as a cheap filter, then re-tested under mleak_lock before
 * any modification.
 */
static void
mleak_free(mcache_obj_t *addr)
{
	while (addr != NULL) {
		struct mallocation *allocation = &mleak_allocations
		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];

		if (allocation->element == addr &&
		    allocation->trace_index < mleak_trace_buckets) {
			lck_mtx_lock_spin(mleak_lock);
			/* Re-check now that we hold the lock */
			if (allocation->element == addr &&
			    allocation->trace_index < mleak_trace_buckets) {
				struct mtrace *trace;
				trace = &mleak_traces[allocation->trace_index];
				/* allocs = 0 means trace bucket is unused */
				if (trace->allocs > 0) {
					trace->allocs--;
				}
				if (trace->allocs == 0) {
					trace->depth = 0;
				}
				/* NULL element means alloc bucket is unused */
				allocation->element = NULL;
				mleak_table.outstanding_allocs--;
			}
			lck_mtx_unlock(mleak_lock);
		}
		addr = addr->obj_next;
	}
}
9177
/*
 * Select the MLEAK_NUM_TRACES trace buckets with the most outstanding
 * allocations into mleak_top_trace[], sorted descending by allocs.
 * Phase 1 fills the array via insertion sort; phase 2 scans the
 * remaining buckets, replacing the current minimum (slot j) whenever
 * a larger bucket is found and bubbling it into position.
 */
static void
mleak_sort_traces()
{
	int i, j, k;
	struct mtrace *swap;

	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		mleak_top_trace[i] = NULL;
	}

	/* Phase 1: fill the top array with the first active buckets */
	for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
		if (mleak_traces[i].allocs <= 0) {
			continue;
		}

		mleak_top_trace[j] = &mleak_traces[i];
		/* Insertion sort: bubble the new entry up into order */
		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k - 1]->allocs) {
				break;
			}

			swap = mleak_top_trace[k - 1];
			mleak_top_trace[k - 1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
		j++;
	}

	/* j now indexes the smallest (last) entry in the top array */
	j--;
	/* Phase 2: consider the remaining buckets against the minimum */
	for (; i < mleak_trace_buckets; i++) {
		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
			continue;
		}

		mleak_top_trace[j] = &mleak_traces[i];

		for (k = j; k > 0; k--) {
			if (mleak_top_trace[k]->allocs <=
			    mleak_top_trace[k - 1]->allocs) {
				break;
			}

			swap = mleak_top_trace[k - 1];
			mleak_top_trace[k - 1] = mleak_top_trace[k];
			mleak_top_trace[k] = swap;
		}
	}
}
9227
/*
 * Refresh the exported mleak_stat snapshot: sort the trace buckets
 * and copy the top MLEAK_NUM_TRACES entries (counts plus unslid-able
 * raw backtrace addresses) into mleak_stat->ml_trace[].
 */
static void
mleak_update_stats()
{
	mleak_trace_stat_t *mltr;
	int i;

	VERIFY(mleak_stat != NULL);
#ifdef __LP64__
	VERIFY(mleak_stat->ml_isaddr64);
#else
	VERIFY(!mleak_stat->ml_isaddr64);
#endif /* !__LP64__ */
	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);

	mleak_sort_traces();

	mltr = &mleak_stat->ml_trace[0];
	bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
		int j;

		/* Skip empty slots; mltr is left zeroed for them */
		if (mleak_top_trace[i] == NULL ||
		    mleak_top_trace[i]->allocs == 0) {
			continue;
		}

		mltr->mltr_collisions = mleak_top_trace[i]->collisions;
		mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
		mltr->mltr_allocs = mleak_top_trace[i]->allocs;
		mltr->mltr_depth = mleak_top_trace[i]->depth;

		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
		for (j = 0; j < mltr->mltr_depth; j++) {
			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
		}

		mltr++;
	}
}
9267
/*
 * Human-readable labels for the MT_* mbuf types reported by
 * mbuf_dump(); the table is terminated by a NULL mt_name entry.
 */
static struct mbtypes {
	int mt_type;
	const char *mt_name;
} mbtypes[] = {
	{ MT_DATA, "data" },
	{ MT_OOBDATA, "oob data" },
	{ MT_CONTROL, "ancillary data" },
	{ MT_HEADER, "packet headers" },
	{ MT_SOCKET, "socket structures" },
	{ MT_PCB, "protocol control blocks" },
	{ MT_RTABLE, "routing table entries" },
	{ MT_HTABLE, "IMP host table entries" },
	{ MT_ATABLE, "address resolution tables" },
	{ MT_FTABLE, "fragment reassembly queue headers" },
	{ MT_SONAME, "socket names and addresses" },
	{ MT_SOOPTS, "socket options" },
	{ MT_RIGHTS, "access rights" },
	{ MT_IFADDR, "interface addresses" },
	{ MT_TAG, "packet tags" },
	{ 0, NULL }
};
9289
/*
 * Advance the mbuf_dump() cursor past the `k' bytes just formatted
 * and jump to the `done' label once the buffer is exhausted.  Relies
 * on the caller's local variables c (cursor), clen (space left) and
 * k (bytes produced by the preceding scnprintf).
 */
#define MBUF_DUMP_BUF_CHK() {   \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
9296
9297 static char *
9298 mbuf_dump(void)
9299 {
9300 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
9301 totreturned = 0;
9302 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
9303 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
9304 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
9305 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
9306 uint8_t seen[256];
9307 struct mbtypes *mp;
9308 mb_class_stat_t *sp;
9309 mleak_trace_stat_t *mltr;
9310 char *c = mbuf_dump_buf;
9311 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
9312 struct mbuf_watchdog_defunct_args args = {};
9313
9314 mbuf_dump_buf[0] = '\0';
9315
9316 /* synchronize all statistics in the mbuf table */
9317 mbuf_stat_sync();
9318 mbuf_mtypes_sync(TRUE);
9319
9320 sp = &mb_stat->mbs_class[0];
9321 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
9322 u_int32_t mem;
9323
9324 if (m_class(i) == MC_MBUF) {
9325 m_mbufs = sp->mbcl_active;
9326 } else if (m_class(i) == MC_CL) {
9327 m_clfree = sp->mbcl_total - sp->mbcl_active;
9328 } else if (m_class(i) == MC_BIGCL) {
9329 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
9330 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
9331 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
9332 m_16kclusters = sp->mbcl_total;
9333 } else if (m_class(i) == MC_MBUF_CL) {
9334 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
9335 } else if (m_class(i) == MC_MBUF_BIGCL) {
9336 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
9337 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
9338 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
9339 }
9340
9341 mem = sp->mbcl_ctotal * sp->mbcl_size;
9342 totmem += mem;
9343 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
9344 sp->mbcl_size;
9345 totreturned += sp->mbcl_release_cnt;
9346 }
9347
9348 /* adjust free counts to include composite caches */
9349 m_clfree += m_mbufclfree;
9350 m_bigclfree += m_mbufbigclfree;
9351 m_16kclfree += m_mbuf16kclfree;
9352
9353 totmbufs = 0;
9354 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9355 totmbufs += mbstat.m_mtypes[mp->mt_type];
9356 }
9357 if (totmbufs > m_mbufs) {
9358 totmbufs = m_mbufs;
9359 }
9360 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
9361 MBUF_DUMP_BUF_CHK();
9362
9363 bzero(&seen, sizeof(seen));
9364 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9365 if (mbstat.m_mtypes[mp->mt_type] != 0) {
9366 seen[mp->mt_type] = 1;
9367 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
9368 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
9369 MBUF_DUMP_BUF_CHK();
9370 }
9371 }
9372 seen[MT_FREE] = 1;
9373 for (i = 0; i < nmbtypes; i++) {
9374 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
9375 k = scnprintf(c, clen, "\t%u mbufs allocated to "
9376 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
9377 MBUF_DUMP_BUF_CHK();
9378 }
9379 }
9380 if ((m_mbufs - totmbufs) > 0) {
9381 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
9382 m_mbufs - totmbufs);
9383 MBUF_DUMP_BUF_CHK();
9384 }
9385 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
9386 "%u/%u mbuf 4KB clusters in use\n",
9387 (unsigned int)(mbstat.m_clusters - m_clfree),
9388 (unsigned int)mbstat.m_clusters,
9389 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
9390 (unsigned int)mbstat.m_bigclusters);
9391 MBUF_DUMP_BUF_CHK();
9392
9393 if (njcl > 0) {
9394 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
9395 m_16kclusters - m_16kclfree, m_16kclusters,
9396 njclbytes / 1024);
9397 MBUF_DUMP_BUF_CHK();
9398 }
9399 totused = totmem - totfree;
9400 if (totmem == 0) {
9401 totpct = 0;
9402 } else if (totused < (ULONG_MAX / 100)) {
9403 totpct = (totused * 100) / totmem;
9404 } else {
9405 u_long totmem1 = totmem / 100;
9406 u_long totused1 = totused / 100;
9407 totpct = (totused1 * 100) / totmem1;
9408 }
9409 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
9410 "in use)\n", totmem / 1024, totpct);
9411 MBUF_DUMP_BUF_CHK();
9412 k = scnprintf(c, clen, "%lu KB returned to the system\n",
9413 totreturned / 1024);
9414 MBUF_DUMP_BUF_CHK();
9415
9416 net_update_uptime();
9417
9418 k = scnprintf(c, clen,
9419 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
9420 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
9421 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
9422 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
9423 mb_expand_16kcl_total);
9424 MBUF_DUMP_BUF_CHK();
9425 if (mbuf_worker_last_runtime != 0) {
9426 k = scnprintf(c, clen, "worker thread last run time: "
9427 "%llu (%llu seconds ago)\n",
9428 mbuf_worker_last_runtime,
9429 net_uptime() - mbuf_worker_last_runtime);
9430 MBUF_DUMP_BUF_CHK();
9431 }
9432 if (mbuf_drain_last_runtime != 0) {
9433 k = scnprintf(c, clen, "drain routine last run time: "
9434 "%llu (%llu seconds ago)\n",
9435 mbuf_drain_last_runtime,
9436 net_uptime() - mbuf_drain_last_runtime);
9437 MBUF_DUMP_BUF_CHK();
9438 }
9439
9440 /*
9441 * Log where the most mbufs have accumulated:
9442 * - Process socket buffers
9443 * - TCP reassembly queue
9444 * - Interface AQM queue (output) and DLIL input queue
9445 */
9446 args.non_blocking = true;
9447 proc_iterate(PROC_ALLPROCLIST,
9448 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
9449 if (args.top_app != NULL) {
9450 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
9451 args.top_app_space_used,
9452 proc_name_address(args.top_app),
9453 proc_pid(args.top_app));
9454 proc_rele(args.top_app);
9455 }
9456 MBUF_DUMP_BUF_CHK();
9457
9458 #if INET
9459 k = dump_tcp_reass_qlen(c, clen);
9460 MBUF_DUMP_BUF_CHK();
9461 #endif /* INET */
9462
9463 #if MPTCP
9464 k = dump_mptcp_reass_qlen(c, clen);
9465 MBUF_DUMP_BUF_CHK();
9466 #endif /* MPTCP */
9467
9468 #if NETWORKING
9469 k = dlil_dump_top_if_qlen(c, clen);
9470 MBUF_DUMP_BUF_CHK();
9471 #endif /* NETWORKING */
9472
9473 /* mbuf leak detection statistics */
9474 mleak_update_stats();
9475
9476 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
9477 MBUF_DUMP_BUF_CHK();
9478 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
9479 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
9480 mleak_table.mleak_sample_factor);
9481 MBUF_DUMP_BUF_CHK();
9482 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
9483 mleak_table.outstanding_allocs);
9484 MBUF_DUMP_BUF_CHK();
9485 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
9486 mleak_table.alloc_recorded, mleak_table.trace_recorded);
9487 MBUF_DUMP_BUF_CHK();
9488 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
9489 mleak_table.alloc_collisions, mleak_table.trace_collisions);
9490 MBUF_DUMP_BUF_CHK();
9491 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
9492 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
9493 MBUF_DUMP_BUF_CHK();
9494 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
9495 mleak_table.total_conflicts);
9496 MBUF_DUMP_BUF_CHK();
9497
9498 k = scnprintf(c, clen, "top %d outstanding traces:\n",
9499 mleak_stat->ml_cnt);
9500 MBUF_DUMP_BUF_CHK();
9501 for (i = 0; i < mleak_stat->ml_cnt; i++) {
9502 mltr = &mleak_stat->ml_trace[i];
9503 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
9504 "%llu hit(s), %llu collision(s)\n", (i + 1),
9505 mltr->mltr_allocs, mltr->mltr_hitcount,
9506 mltr->mltr_collisions);
9507 MBUF_DUMP_BUF_CHK();
9508 }
9509
9510 if (mleak_stat->ml_isaddr64) {
9511 k = scnprintf(c, clen, MB_LEAK_HDR_64);
9512 } else {
9513 k = scnprintf(c, clen, MB_LEAK_HDR_32);
9514 }
9515 MBUF_DUMP_BUF_CHK();
9516
9517 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
9518 k = scnprintf(c, clen, "%2d: ", (i + 1));
9519 MBUF_DUMP_BUF_CHK();
9520 for (j = 0; j < mleak_stat->ml_cnt; j++) {
9521 mltr = &mleak_stat->ml_trace[j];
9522 if (i < mltr->mltr_depth) {
9523 if (mleak_stat->ml_isaddr64) {
9524 k = scnprintf(c, clen, "0x%0llx ",
9525 (uint64_t)VM_KERNEL_UNSLIDE(
9526 mltr->mltr_addr[i]));
9527 } else {
9528 k = scnprintf(c, clen,
9529 "0x%08x ",
9530 (uint32_t)VM_KERNEL_UNSLIDE(
9531 mltr->mltr_addr[i]));
9532 }
9533 } else {
9534 if (mleak_stat->ml_isaddr64) {
9535 k = scnprintf(c, clen,
9536 MB_LEAK_SPACING_64);
9537 } else {
9538 k = scnprintf(c, clen,
9539 MB_LEAK_SPACING_32);
9540 }
9541 }
9542 MBUF_DUMP_BUF_CHK();
9543 }
9544 k = scnprintf(c, clen, "\n");
9545 MBUF_DUMP_BUF_CHK();
9546 }
9547
9548 done:
9549 return mbuf_dump_buf;
9550 }
9551
9552 #undef MBUF_DUMP_BUF_CHK
9553 #endif /* CONFIG_MBUF_MCACHE */
9554
/*
 * Convert between a regular and a packet header mbuf.  Caller is responsible
 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
 *
 * Returns 0 on success, or EBUSY when the mbuf cannot safely gain a
 * packet header because it already carries user data in m_dat.
 */
int
m_reinit(struct mbuf *m, int hdr)
{
	int ret = 0;

	if (hdr) {
		VERIFY(!(m->m_flags & M_PKTHDR));
		if (!(m->m_flags & M_EXT) &&
		    (m->m_data != (uintptr_t)m->m_dat || m->m_len > 0)) {
			/*
			 * If there's no external cluster attached and the
			 * mbuf appears to contain user data, we cannot
			 * safely convert this to a packet header mbuf,
			 * as the packet header structure might overlap
			 * with the data.
			 */
			printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
			    "m_data %llx (expected %llx), "
			    "m_len %d (expected 0)\n",
			    __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
			    (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
			ret = EBUSY;
		} else {
			VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat);
			m->m_flags |= M_PKTHDR;
			MBUF_INIT_PKTHDR(m);
		}
	} else {
		/* Check for scratch area overflow */
		m_redzone_verify(m);
		/* Free the aux data and tags if there is any */
		m_tag_delete_chain(m);
		m_do_tx_compl_callback(m, NULL);
		m->m_flags &= ~M_PKTHDR;
	}

	return ret;
}
9599
/*
 * Atomically swap the cluster's private property word from `o' to
 * `n'; returns non-zero when the compare-and-swap succeeded.
 */
int
m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
{
	ASSERT(m->m_flags & M_EXT);
	return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
}
9606
/* Read the cluster's private property word (M_EXT required). */
uint32_t
m_ext_get_prop(struct mbuf *m)
{
	ASSERT(m->m_flags & M_EXT);
	return MEXT_PRIV(m);
}
9613
/*
 * Return non-zero when a paired mbuf+cluster is active, i.e. its
 * pairing refcount has been bumped above the minimum.  Non-paired
 * mbufs always count as active.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m)) {
		return 1;
	}
	return MEXT_PREF(m) > MEXT_MINREF(m);
}
9619
/*
 * Reactivate a quiesced paired mbuf+cluster: reinitialize the mbuf
 * while preserving the cluster buffer, its free routine and its ref
 * structure, then re-pair them with the pairing refcount bumped.
 */
void
m_ext_paired_activate(struct mbuf *m)
{
	struct ext_ref *rfa;
	int hdr, type;
	caddr_t extbuf;
	m_ext_free_func_t extfree;
	u_int extsize;

	VERIFY(MBUF_IS_PAIRED(m));
	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));

	/* Capture everything that must survive the re-init */
	hdr = (m->m_flags & M_PKTHDR);
	type = m->m_type;
	extbuf = m->m_ext.ext_buf;
	extfree = m_get_ext_free(m);
	extsize = m->m_ext.ext_size;
	rfa = m_get_rfa(m);

	VERIFY(extbuf != NULL && rfa != NULL);

	/*
	 * Safe to reinitialize packet header tags, since it's
	 * already taken care of at m_free() time.  Similar to
	 * what's done in m_clattach() for the cluster.  Bump
	 * up MEXT_PREF to indicate activation.
	 */
	MBUF_INIT(m, hdr, type);
	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
	    1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
}
9652
/*
 * Zero the module-private scratch area in the packet header.  Panics
 * if the area is currently guarded (owned) by an in-kernel module.
 */
void
m_scratch_init(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to modify guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

	bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
}
9669
/*
 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
 * xnu that intend on utilizing the module-private area should directly
 * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
 * to handing it off to another module, respectively.
 *
 * Points *p at the scratch area and returns its size in bytes.
 * Panics if the area is currently guarded by another module.
 */
u_int32_t
m_scratch_get(struct mbuf *m, u_int8_t **p)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* See comments in <rdar://problem/14040693> */
	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
		panic_plain("Invalid attempt to access guarded module-private "
		    "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
		/* NOTREACHED */
	}

#if CONFIG_MBUF_MCACHE
	/* With mcltrace, log this access in the mbuf's scratch audit */
	if (mcltrace) {
		mcache_audit_t *mca;

		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
		if (mca->mca_uflags & MB_SCVALID) {
			mcl_audit_scratch(mca);
		}
		lck_mtx_unlock(mbuf_mlock);
	}
#endif /* CONFIG_MBUF_MCACHE */

	*p = (u_int8_t *)&pkt->pkt_mpriv;
	return sizeof(pkt->pkt_mpriv);
}
9707
9708 void
9709 m_add_crumb(struct mbuf *m, uint16_t crumb)
9710 {
9711 VERIFY(m->m_flags & M_PKTHDR);
9712
9713 m->m_pkthdr.pkt_crumbs |= crumb;
9714 }
9715
9716 static void
9717 m_redzone_init(struct mbuf *m)
9718 {
9719 VERIFY(m->m_flags & M_PKTHDR);
9720 /*
9721 * Each mbuf has a unique red zone pattern, which is a XOR
9722 * of the red zone cookie and the address of the mbuf.
9723 */
9724 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9725 }
9726
9727 static void
9728 m_redzone_verify(struct mbuf *m)
9729 {
9730 u_int32_t mb_redzone;
9731
9732 VERIFY(m->m_flags & M_PKTHDR);
9733
9734 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9735 if (m->m_pkthdr.redzone != mb_redzone) {
9736 panic("mbuf %p redzone violation with value 0x%x "
9737 "(instead of 0x%x, using cookie 0x%x)\n",
9738 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
9739 /* NOTREACHED */
9740 }
9741 }
9742
/*
 * Attach external-storage state to an mbuf (M_EXT must already be set).
 *
 * The ext_free callback and ext_arg are stored obscured (XOR'd with a
 * cookie) to make them harder to forge via a memory-corruption
 * primitive.  When an ext_ref is supplied, a per-buffer token derived
 * from the ext_ref's own address serves as the cookie; otherwise the
 * global mb_obscure_extfree cookie is used.
 */
__private_extern__ inline void
m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
    caddr_t ext_arg)
{
	VERIFY(m->m_flags & M_EXT);
	if (rfa != NULL) {
		m_set_rfa(m, rfa);
		if (ext_free != NULL) {
			/* Per-buffer token: ext_token's own address ^ global cookie. */
			rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
			    mb_obscure_extfree;
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			/* No callback: clear the token and store NULLs as-is. */
			rfa->ext_token = 0;
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
	} else {
		/*
		 * If we are going to lose the cookie in ext_token by
		 * resetting the rfa, we should use the global cookie
		 * to obscure the ext_free and ext_arg pointers.
		 */
		if (ext_free != NULL) {
			uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
			m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
			if (ext_arg != NULL) {
				m->m_ext.ext_arg =
				    (caddr_t)((uintptr_t)ext_arg ^
				    mb_obscure_extfree);
			} else {
				m->m_ext.ext_arg = NULL;
			}
		} else {
			m->m_ext.ext_free = NULL;
			m->m_ext.ext_arg = NULL;
		}
		m->m_ext.ext_refflags = NULL;
	}
}
9789
9790 __private_extern__ inline struct ext_ref *
9791 m_get_rfa(struct mbuf *m)
9792 {
9793 if (m->m_ext.ext_refflags == NULL) {
9794 return NULL;
9795 } else {
9796 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
9797 }
9798 }
9799
9800 static inline void
9801 m_set_rfa(struct mbuf *m, struct ext_ref *rfa)
9802 {
9803 if (rfa != NULL) {
9804 m->m_ext.ext_refflags =
9805 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
9806 } else {
9807 m->m_ext.ext_refflags = NULL;
9808 }
9809 }
9810
9811 __private_extern__ inline m_ext_free_func_t
9812 m_get_ext_free(struct mbuf *m)
9813 {
9814 struct ext_ref *rfa;
9815 if (m->m_ext.ext_free == NULL) {
9816 return NULL;
9817 }
9818
9819 rfa = m_get_rfa(m);
9820 if (rfa == NULL) {
9821 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
9822 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9823 } else {
9824 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
9825 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9826 }
9827 }
9828
9829 __private_extern__ inline caddr_t
9830 m_get_ext_arg(struct mbuf *m)
9831 {
9832 struct ext_ref *rfa;
9833 if (m->m_ext.ext_arg == NULL) {
9834 return NULL;
9835 }
9836
9837 rfa = m_get_rfa(m);
9838 if (rfa == NULL) {
9839 return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
9840 } else {
9841 return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
9842 rfa->ext_token);
9843 }
9844 }
9845
9846 #if CONFIG_MBUF_MCACHE
9847 /*
9848 * Simple routine to avoid taking the lock when we can't run the
9849 * mbuf drain.
9850 */
9851 static int
9852 mbuf_drain_checks(boolean_t ignore_waiters)
9853 {
9854 if (mb_drain_maxint == 0) {
9855 return 0;
9856 }
9857 if (!ignore_waiters && mb_waiters != 0) {
9858 return 0;
9859 }
9860
9861 return 1;
9862 }
9863
9864 /*
9865 * Called by the VM when there's memory pressure or when we exhausted
9866 * the 4k/16k reserved space.
9867 */
9868 static void
9869 mbuf_drain_locked(boolean_t ignore_waiters)
9870 {
9871 mbuf_class_t mc;
9872 mcl_slab_t *sp, *sp_tmp, *nsp;
9873 unsigned int num, k, interval, released = 0;
9874 unsigned long total_mem = 0, use_mem = 0;
9875 boolean_t ret, purge_caches = FALSE;
9876 ppnum_t offset;
9877 mcache_obj_t *obj;
9878 unsigned long per;
9879 static unsigned char scratch[32];
9880 static ppnum_t scratch_pa = 0;
9881
9882 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
9883 if (!mbuf_drain_checks(ignore_waiters)) {
9884 return;
9885 }
9886 if (scratch_pa == 0) {
9887 bzero(scratch, sizeof(scratch));
9888 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
9889 VERIFY(scratch_pa);
9890 } else if (mclverify) {
9891 /*
9892 * Panic if a driver wrote to our scratch memory.
9893 */
9894 for (k = 0; k < sizeof(scratch); k++) {
9895 if (scratch[k]) {
9896 panic("suspect DMA to freed address");
9897 }
9898 }
9899 }
9900 /*
9901 * Don't free memory too often as that could cause excessive
9902 * waiting times for mbufs. Purge caches if we were asked to drain
9903 * in the last 5 minutes.
9904 */
9905 if (mbuf_drain_last_runtime != 0) {
9906 interval = net_uptime() - mbuf_drain_last_runtime;
9907 if (interval <= mb_drain_maxint) {
9908 return;
9909 }
9910 if (interval <= mb_drain_maxint * 5) {
9911 purge_caches = TRUE;
9912 }
9913 }
9914 mbuf_drain_last_runtime = net_uptime();
9915 /*
9916 * Don't free any memory if we're using 60% or more.
9917 */
9918 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9919 total_mem += m_total(mc) * m_maxsize(mc);
9920 use_mem += m_active(mc) * m_maxsize(mc);
9921 }
9922 per = (use_mem * 100) / total_mem;
9923 if (per >= 60) {
9924 return;
9925 }
9926 /*
9927 * Purge all the caches. This effectively disables
9928 * caching for a few seconds, but the mbuf worker thread will
9929 * re-enable them again.
9930 */
9931 if (purge_caches == TRUE) {
9932 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9933 if (m_total(mc) < m_avgtotal(mc)) {
9934 continue;
9935 }
9936 lck_mtx_unlock(mbuf_mlock);
9937 ret = mcache_purge_cache(m_cache(mc), FALSE);
9938 lck_mtx_lock(mbuf_mlock);
9939 if (ret == TRUE) {
9940 m_purge_cnt(mc)++;
9941 }
9942 }
9943 }
9944 /*
9945 * Move the objects from the composite class freelist to
9946 * the rudimentary slabs list, but keep at least 10% of the average
9947 * total in the freelist.
9948 */
9949 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9950 while (m_cobjlist(mc) &&
9951 m_total(mc) < m_avgtotal(mc) &&
9952 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9953 obj = m_cobjlist(mc);
9954 m_cobjlist(mc) = obj->obj_next;
9955 obj->obj_next = NULL;
9956 num = cslab_free(mc, obj, 1);
9957 VERIFY(num == 1);
9958 m_free_cnt(mc)++;
9959 m_infree(mc)--;
9960 /* cslab_free() handles m_total */
9961 }
9962 }
9963 /*
9964 * Free the buffers present in the slab list up to 10% of the total
9965 * average per class.
9966 *
9967 * We walk the list backwards in an attempt to reduce fragmentation.
9968 */
9969 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
9970 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
9971 /*
9972 * Process only unused slabs occupying memory.
9973 */
9974 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
9975 sp->sl_base == NULL) {
9976 continue;
9977 }
9978 if (m_total(mc) < m_avgtotal(mc) ||
9979 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9980 break;
9981 }
9982 slab_remove(sp, mc);
9983 switch (mc) {
9984 case MC_MBUF:
9985 m_infree(mc) -= NMBPG;
9986 m_total(mc) -= NMBPG;
9987 if (mclaudit != NULL) {
9988 mcl_audit_free(sp->sl_base, NMBPG);
9989 }
9990 break;
9991 case MC_CL:
9992 m_infree(mc) -= NCLPG;
9993 m_total(mc) -= NCLPG;
9994 if (mclaudit != NULL) {
9995 mcl_audit_free(sp->sl_base, NMBPG);
9996 }
9997 break;
9998 case MC_BIGCL:
9999 {
10000 m_infree(mc) -= NBCLPG;
10001 m_total(mc) -= NBCLPG;
10002 if (mclaudit != NULL) {
10003 mcl_audit_free(sp->sl_base, NMBPG);
10004 }
10005 break;
10006 }
10007 case MC_16KCL:
10008 m_infree(mc)--;
10009 m_total(mc)--;
10010 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
10011 nsp = nsp->sl_next;
10012 VERIFY(nsp->sl_refcnt == 0 &&
10013 nsp->sl_base != NULL &&
10014 nsp->sl_len == 0);
10015 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
10016 0);
10017 nsp->sl_flags = 0;
10018 }
10019 if (mclaudit != NULL) {
10020 if (sp->sl_len == PAGE_SIZE) {
10021 mcl_audit_free(sp->sl_base,
10022 NMBPG);
10023 } else {
10024 mcl_audit_free(sp->sl_base, 1);
10025 }
10026 }
10027 break;
10028 default:
10029 /*
10030 * The composite classes have their own
10031 * freelist (m_cobjlist), so we only
10032 * process rudimentary classes here.
10033 */
10034 VERIFY(0);
10035 }
10036 m_release_cnt(mc) += m_size(mc);
10037 released += m_size(mc);
10038 VERIFY(sp->sl_base != NULL &&
10039 sp->sl_len >= PAGE_SIZE);
10040 offset = MTOPG(sp->sl_base);
10041 /*
10042 * Make sure the IOMapper points to a valid, but
10043 * bogus, address. This should prevent further DMA
10044 * accesses to freed memory.
10045 */
10046 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
10047 mcl_paddr[offset] = 0;
10048 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
10049 sp->sl_len);
10050 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
10051 sp->sl_flags = 0;
10052 }
10053 }
10054 mbstat.m_drain++;
10055 mbstat.m_bigclusters = m_total(MC_BIGCL);
10056 mbstat.m_clusters = m_total(MC_CL);
10057 mbstat.m_mbufs = m_total(MC_MBUF);
10058 mbuf_stat_sync();
10059 mbuf_mtypes_sync(TRUE);
10060 }
10061
10062 __private_extern__ void
10063 mbuf_drain(boolean_t ignore_waiters)
10064 {
10065 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
10066 if (!mbuf_drain_checks(ignore_waiters)) {
10067 return;
10068 }
10069 lck_mtx_lock(mbuf_mlock);
10070 mbuf_drain_locked(ignore_waiters);
10071 lck_mtx_unlock(mbuf_mlock);
10072 }
10073
10074
10075 static int
10076 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
10077 {
10078 #pragma unused(arg1, arg2)
10079 int val = 0, err;
10080
10081 err = sysctl_handle_int(oidp, &val, 0, req);
10082 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10083 return err;
10084 }
10085 if (val) {
10086 mbuf_drain(TRUE);
10087 }
10088
10089 return err;
10090 }
10091
10092 #if DEBUG || DEVELOPMENT
/*
 * Append a timestamped, printf-style entry to the persistent mbuf
 * watchdog log buffer (mbwdog_logging).  Must be called with
 * mbuf_mlock held; the buffer is lazily allocated on first use.
 */
__printflike(3, 4)
static void
_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
{
	va_list ap;
	struct timeval now;
	char str[384], p[256];
	int len;

	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	if (mbwdog_logging == NULL) {
		/*
		 * This might block under a mutex, which isn't really great,
		 * but this happens once, so we'll live.
		 */
		mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
		    ZALIGN_NONE);
	}
	/* Render the caller's message, then prefix time/pid/thread/source. */
	va_start(ap, fmt);
	vsnprintf(p, sizeof(p), fmt, ap);
	va_end(ap);
	microuptime(&now);
	len = scnprintf(str, sizeof(str),
	    "\n%ld.%d (%d/%llx) %s:%d %s",
	    now.tv_sec, now.tv_usec,
	    proc_getpid(current_proc()),
	    (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
	    func, line, p);
	if (len < 0) {
		return;
	}
	/*
	 * When the buffer would overflow, discard the older half of the
	 * log and shift the newer half to the front before appending.
	 */
	if (mbwdog_logging_used + len > mbwdog_logging_size) {
		mbwdog_logging_used = mbwdog_logging_used / 2;
		memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
		    mbwdog_logging_size - mbwdog_logging_used);
		mbwdog_logging[mbwdog_logging_used] = 0;
	}
	strlcat(mbwdog_logging, str, mbwdog_logging_size);
	mbwdog_logging_used += len;
}
10133
10134 #endif // DEBUG || DEVELOPMENT
10135
10136 static void
10137 mtracelarge_register(size_t size)
10138 {
10139 int i;
10140 struct mtracelarge *trace;
10141 uintptr_t bt[MLEAK_STACK_DEPTH];
10142 unsigned int depth;
10143
10144 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
10145 /* Check if this entry is already on the list. */
10146 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10147 trace = &mtracelarge_table[i];
10148 if (trace->size == size && trace->depth == depth &&
10149 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
10150 return;
10151 }
10152 }
10153 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10154 trace = &mtracelarge_table[i];
10155 if (size > trace->size) {
10156 trace->depth = depth;
10157 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
10158 trace->size = size;
10159 break;
10160 }
10161 }
10162 }
10163
10164 #if DEBUG || DEVELOPMENT
10165
10166 static int
10167 mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
10168 {
10169 char *str;
10170
10171 ifnet_head_lock_shared();
10172 lck_mtx_lock(mbuf_mlock);
10173
10174 str = mbuf_dump();
10175
10176 lck_mtx_unlock(mbuf_mlock);
10177 ifnet_head_done();
10178
10179 return sysctl_io_string(req, str, 0, 0, NULL);
10180 }
10181
10182 #endif /* DEBUG || DEVELOPMENT */
10183 #endif /* CONFIG_MBUF_MCACHE */
10184
/* sysctl knobs exported under kern.ipc for mbuf statistics and tuning. */
SYSCTL_DECL(_kern_ipc);
#if DEBUG || DEVELOPMENT
#if SKYWALK && CONFIG_MBUF_MCACHE
SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
    MC_THRESHOLD_SCALE_DOWN_FACTOR,
    "scale down factor for mbuf cache thresholds");
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
#if CONFIG_MBUF_MCACHE
/* Debug-only: dump the mbuf allocator state as a string. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
#endif /* CONFIG_MBUF_MCACHE */
#endif /* DEBUG || DEVELOPMENT */
/* Classic BSD mbuf statistics (struct mbstat). */
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
/* Extended per-class mbuf statistics. */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
#if CONFIG_MBUF_MCACHE
/* Leak-detection, watchdog, and drain controls (mcache allocator only). */
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mleak_table_sysctl, "S,mleak_table", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    m_drain_force_sysctl, "I",
    "Forces the mbuf garbage collection to run");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
    "Minimum time interval between garbage collection");
#endif /* CONFIG_MBUF_MCACHE */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
    "Percentage of when we trigger memory-pressure for an mbuf-class");
/* Read-only: reports which allocator backend this kernel was built with. */
#if CONFIG_MBUF_MCACHE
static int mb_uses_mcache = 1;
#else
static int mb_uses_mcache = 0;
#endif /* CONFIG_MBUF_MCACHE */
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_uses_mcache,
    CTLFLAG_LOCKED, &mb_uses_mcache, 0,
    "Whether mbufs use mcache");
10237