xref: /xnu-12377.41.6/bsd/skywalk/packet/pbufpool.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33 #include <kern/uipc_domain.h>
34 
35 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
36 static void pp_free(struct kern_pbufpool *);
37 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
38     uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
39     const void *, uint32_t);
40 static void pp_free_packet_array(struct kern_pbufpool *,
41     uint64_t *__counted_by(num)array, uint32_t num);
42 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
43     struct skmem_obj_info *, void *, uint32_t);
44 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
45     struct skmem_obj_info *, void *, uint32_t);
46 static void pp_metadata_dtor(void *, void *);
47 static int pp_metadata_construct(struct __kern_quantum *,
48     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
49     uint16_t, bool, struct skmem_obj **);
50 static void pp_metadata_destruct(struct __kern_quantum *,
51     struct kern_pbufpool *, bool);
52 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
53     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
54 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
55     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
56     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
57 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
58 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
60 static void pp_destroy_upp_locked(struct kern_pbufpool *);
61 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
62 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
63 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
64 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
65     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
66 static inline uint32_t
67 pp_alloc_buflet_common(struct kern_pbufpool *pp,
68     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
69     bool large);
70 
71 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
72 
73 #define KERN_BUF_MIN_STRIDING_SIZE      32 * 1024
74 static uint32_t kern_buf_min_striding_size = KERN_BUF_MIN_STRIDING_SIZE;
75 
76 /*
77  * Since the inputs are small (indices to the metadata region), we can use
78  * Knuth's multiplicative hash method which is fast and good enough.  Here
79  * we multiply the input by the golden ratio of 2^32.  See "The Art of
80  * Computer Programming", section 6.4.
81  */
82 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
83 	(((_i) * 2654435761U) & (_m))
84 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
85 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
86 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
87 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
88 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
89 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
90 
91 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
92 
93 #define SKMEM_TAG_PBUFPOOL_HASH  "com.apple.skywalk.pbufpool.hash"
94 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
95 
96 #define SKMEM_TAG_PBUFPOOL_BFT_HASH  "com.apple.skywalk.pbufpool.bft.hash"
97 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
98 
99 #if HAS_MTE
100 extern bool is_mte_enabled;
101 #endif /* HAS_MTE */
102 
103 struct kern_pbufpool_u_htbl {
104 	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
105 };
106 
107 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
108 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
109 
110 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
111 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
112 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
113 
114 static int __pp_inited = 0;
115 
116 int
pp_init(void)117 pp_init(void)
118 {
119 	static_assert(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
120 	static_assert(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
121 	static_assert(KPKT_SC_BK == MBUF_SC_BK);
122 	static_assert(KPKT_SC_BE == MBUF_SC_BE);
123 	static_assert(KPKT_SC_RD == MBUF_SC_RD);
124 	static_assert(KPKT_SC_OAM == MBUF_SC_OAM);
125 	static_assert(KPKT_SC_AV == MBUF_SC_AV);
126 	static_assert(KPKT_SC_RV == MBUF_SC_RV);
127 	static_assert(KPKT_SC_VI == MBUF_SC_VI);
128 	static_assert(KPKT_SC_SIG == MBUF_SC_SIG);
129 	static_assert(KPKT_SC_VO == MBUF_SC_VO);
130 	static_assert(KPKT_SC_CTL == MBUF_SC_CTL);
131 
132 	static_assert(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
133 	static_assert(KPKT_SC_BK == PKT_SC_BK);
134 	static_assert(KPKT_SC_BE == PKT_SC_BE);
135 	static_assert(KPKT_SC_RD == PKT_SC_RD);
136 	static_assert(KPKT_SC_OAM == PKT_SC_OAM);
137 	static_assert(KPKT_SC_AV == PKT_SC_AV);
138 	static_assert(KPKT_SC_RV == PKT_SC_RV);
139 	static_assert(KPKT_SC_VI == PKT_SC_VI);
140 	static_assert(KPKT_SC_SIG == PKT_SC_SIG);
141 	static_assert(KPKT_SC_VO == PKT_SC_VO);
142 	static_assert(KPKT_SC_CTL == PKT_SC_CTL);
143 	static_assert(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
144 
145 	static_assert(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
146 	static_assert(KPKT_TC_BE == MBUF_TC_BE);
147 	static_assert(KPKT_TC_BK == MBUF_TC_BK);
148 	static_assert(KPKT_TC_VI == MBUF_TC_VI);
149 	static_assert(KPKT_TC_VO == MBUF_TC_VO);
150 	static_assert(KPKT_TC_MAX == MBUF_TC_MAX);
151 
152 	static_assert(KPKT_TC_BE == PKT_TC_BE);
153 	static_assert(KPKT_TC_BK == PKT_TC_BK);
154 	static_assert(KPKT_TC_VI == PKT_TC_VI);
155 	static_assert(KPKT_TC_VO == PKT_TC_VO);
156 
157 	static_assert(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
158 	static_assert(PKT_SCVAL_BK == SCVAL_BK);
159 	static_assert(PKT_SCVAL_BE == SCVAL_BE);
160 	static_assert(PKT_SCVAL_RD == SCVAL_RD);
161 	static_assert(PKT_SCVAL_OAM == SCVAL_OAM);
162 	static_assert(PKT_SCVAL_AV == SCVAL_AV);
163 	static_assert(PKT_SCVAL_RV == SCVAL_RV);
164 	static_assert(PKT_SCVAL_VI == SCVAL_VI);
165 	static_assert(PKT_SCVAL_VO == SCVAL_VO);
166 	static_assert(PKT_SCVAL_CTL == SCVAL_CTL);
167 
168 	/*
169 	 * Assert that the value of common packet flags between mbuf and
170 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
171 	 */
172 	static_assert(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
173 	static_assert(PKT_F_REALTIME == PKTF_SO_REALTIME);
174 	static_assert(PKT_F_REXMT == PKTF_TCP_REXMT);
175 	static_assert(PKT_F_LAST_PKT == PKTF_LAST_PKT);
176 	static_assert(PKT_F_FLOW_ID == PKTF_FLOW_ID);
177 	static_assert(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
178 	static_assert(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
179 	static_assert(PKT_F_TS_VALID == PKTF_TS_VALID);
180 	static_assert(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
181 	static_assert(PKT_F_START_SEQ == PKTF_START_SEQ);
182 	static_assert(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
183 	static_assert(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
184 	static_assert(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV | PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW | PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
185 	/*
186 	 * Assert packet flags shared with userland.
187 	 */
188 	static_assert(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC | PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S | PKT_F_ULPN));
189 
190 	static_assert(offsetof(struct __kern_quantum, qum_len) == offsetof(struct __kern_packet, pkt_length));
191 
192 	/*
193 	 * Due to the use of tagged pointer, we need the size of
194 	 * the metadata preamble structure to be multiples of 16.
195 	 * See SK_PTR_TAG() definition for details.
196 	 */
197 	static_assert(sizeof(struct __metadata_preamble) != 0 && (sizeof(struct __metadata_preamble) % 16) == 0);
198 
199 	static_assert(NX_PBUF_FRAGS_MIN == 1 && NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
200 
201 	/*
202 	 * Batch alloc/free requires linking the objects together;
203 	 * make sure that the fields are at the same offset since
204 	 * we cast the object to struct skmem_obj.
205 	 */
206 	static_assert(offsetof(struct __metadata_preamble, _mdp_next) == offsetof(struct skmem_obj, mo_next));
207 	static_assert(offsetof(struct __buflet, __buflet_next) == offsetof(struct skmem_obj, mo_next));
208 
209 	SK_LOCK_ASSERT_HELD();
210 	ASSERT(!__pp_inited);
211 
212 	pp_opt_cache = skmem_cache_create("pkt.opt",
213 	    sizeof(struct __packet_opt), sizeof(uint64_t),
214 	    NULL, NULL, NULL, NULL, NULL, 0);
215 	pp_flow_cache = skmem_cache_create("pkt.flow",
216 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
217 	    NULL, NULL, NULL, NULL, NULL, 0);
218 	pp_compl_cache = skmem_cache_create("pkt.compl",
219 	    sizeof(struct __packet_compl), sizeof(uint64_t),
220 	    NULL, NULL, NULL, NULL, NULL, 0);
221 
222 	PE_parse_boot_argn("sk_pp_min_striding_size", &kern_buf_min_striding_size,
223 	    sizeof(kern_buf_min_striding_size));
224 
225 	return 0;
226 }
227 
228 void
pp_fini(void)229 pp_fini(void)
230 {
231 	SK_LOCK_ASSERT_HELD();
232 
233 	if (__pp_inited) {
234 		if (pp_compl_cache != NULL) {
235 			skmem_cache_destroy(pp_compl_cache);
236 			pp_compl_cache = NULL;
237 		}
238 		if (pp_flow_cache != NULL) {
239 			skmem_cache_destroy(pp_flow_cache);
240 			pp_flow_cache = NULL;
241 		}
242 		if (pp_opt_cache != NULL) {
243 			skmem_cache_destroy(pp_opt_cache);
244 			pp_opt_cache = NULL;
245 		}
246 
247 		__pp_inited = 0;
248 	}
249 }
250 
251 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)252 pp_alloc(zalloc_flags_t how)
253 {
254 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
255 
256 	if (pp) {
257 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
258 	}
259 	return pp;
260 }
261 
/*
 * Destroy and free a pool whose last reference has been dropped.
 *
 * Entered with pp_lock held; pp_destroy() runs under the lock, then the
 * lock is released and destroyed before the object goes back to the zone.
 * The caller must not touch pp after this returns.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp %p FREE", SK_KVA(pp));
	/* no remaining references; safe to destroy the lock itself */
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
274 
275 void
pp_retain_locked(struct kern_pbufpool * pp)276 pp_retain_locked(struct kern_pbufpool *pp)
277 {
278 	PP_LOCK_ASSERT_HELD(pp);
279 
280 	pp->pp_refcnt++;
281 	ASSERT(pp->pp_refcnt != 0);
282 }
283 
/*
 * Lock-acquiring wrapper around pp_retain_locked().
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
291 
292 boolean_t
pp_release_locked(struct kern_pbufpool * pp)293 pp_release_locked(struct kern_pbufpool *pp)
294 {
295 	uint32_t oldref = pp->pp_refcnt;
296 
297 	PP_LOCK_ASSERT_HELD(pp);
298 
299 	ASSERT(pp->pp_refcnt != 0);
300 	if (--pp->pp_refcnt == 0) {
301 		pp_free(pp);
302 	}
303 
304 	return oldref == 1;
305 }
306 
307 boolean_t
pp_release(struct kern_pbufpool * pp)308 pp_release(struct kern_pbufpool *pp)
309 {
310 	boolean_t lastref;
311 
312 	PP_LOCK(pp);
313 	if (!(lastref = pp_release_locked(pp))) {
314 		PP_UNLOCK(pp);
315 	}
316 
317 	return lastref;
318 }
319 
320 void
pp_close(struct kern_pbufpool * pp)321 pp_close(struct kern_pbufpool *pp)
322 {
323 	PP_LOCK(pp);
324 	ASSERT(pp->pp_refcnt > 0);
325 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
326 	pp->pp_flags |= PPF_CLOSED;
327 	if (!pp_release_locked(pp)) {
328 		PP_UNLOCK(pp);
329 	}
330 }
331 
332 /*
333  * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
334  * size for the srp_array. This is same as marking it __counted_by(SKMEM_REGIONS)
335  */
336 void
pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)337 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
338     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
339     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
340     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
341 {
342 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
343 	    *lbuf_srp;
344 	uint32_t md_size = 0;
345 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
346 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
347 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
348 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
349 	bool md_magazine_enable = ((flags &
350 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
351 
352 	ASSERT(max_frags != 0);
353 
354 	md_size = NX_METADATA_PACKET_SZ(max_frags);
355 
356 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
357 	case PP_REGION_CONFIG_BUF_IODIR_IN:
358 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
359 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
360 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
361 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
362 		break;
363 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
364 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
365 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
366 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
367 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
368 		break;
369 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
370 	default:
371 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
372 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
373 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
374 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
375 		break;
376 	}
377 
378 	/* add preamble size to metadata obj size */
379 	md_size += METADATA_PREAMBLE_SZ;
380 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
381 
382 	/* configure kernel metadata region */
383 	kmd_srp->srp_md_type = md_type;
384 	kmd_srp->srp_md_subtype = md_subtype;
385 	kmd_srp->srp_r_obj_cnt = md_cnt;
386 	kmd_srp->srp_r_obj_size = md_size;
387 	kmd_srp->srp_max_frags = max_frags;
388 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
389 	if (md_persistent) {
390 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
391 	}
392 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
393 	if (md_magazine_enable) {
394 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
395 	}
396 	skmem_region_params_config(kmd_srp);
397 
398 	/* Sanity check for memtag */
399 	ASSERT(kmd_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
400 
401 	/* configure user metadata region */
402 	srp = &srp_array[SKMEM_REGION_UMD];
403 	if (!kernel_only) {
404 		srp->srp_md_type = kmd_srp->srp_md_type;
405 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
406 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
407 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
408 		srp->srp_max_frags = kmd_srp->srp_max_frags;
409 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
410 		if (md_persistent) {
411 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
412 		}
413 		/*
414 		 * UMD is a mirrored region and object allocation operations
415 		 * are performed on the KMD objects.
416 		 */
417 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
418 		skmem_region_params_config(srp);
419 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
420 	} else {
421 		ASSERT(srp->srp_r_obj_cnt == 0);
422 		ASSERT(srp->srp_r_obj_size == 0);
423 	}
424 
425 	/* configure buffer region */
426 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
427 	buf_srp->srp_r_obj_size = buf_size;
428 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
429 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
430 	if (buf_persistent) {
431 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
432 	}
433 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
434 	if (buf_srp->srp_r_obj_size >= kern_buf_min_striding_size) {
435 		/*
436 		 * A buffer size larger than 32K indicates striding is in use, which
437 		 * means a buffer could be detached from a buflet. In this case, magzine
438 		 * layer should be enabled.
439 		 */
440 		buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
441 	}
442 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
443 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
444 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
445 	}
446 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
447 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
448 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
449 	}
450 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
451 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
452 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
453 	}
454 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
455 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
456 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
457 	}
458 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
459 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
460 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
461 	}
462 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
463 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
464 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
465 	}
466 	if (buf_seg_size != 0) {
467 		buf_srp->srp_r_seg_size = buf_seg_size;
468 	}
469 	skmem_region_params_config(buf_srp);
470 
471 	/* configure large buffer region */
472 	if (large_buf_size != 0) {
473 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
474 		lbuf_srp->srp_r_obj_size = large_buf_size;
475 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
476 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
477 		skmem_region_params_config(lbuf_srp);
478 	}
479 
480 	/* configure kernel buflet region */
481 	if (config_buflet) {
482 		/*
483 		 * Ideally we want the number of buflets to be
484 		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
485 		 * so that we have enough buflets when multi-buflet and
486 		 * shared buffer object is used.
487 		 * Currently multi-buflet is being used only by user pool
488 		 * which doesn't support shared buffer object, hence to reduce
489 		 * the number of objects we are restricting the number of
490 		 * buflets to the number of buffers.
491 		 */
492 		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
493 		    lbuf_srp->srp_c_obj_cnt;
494 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
495 		    sizeof(struct __user_buflet));
496 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
497 		skmem_region_params_config(kbft_srp);
498 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
499 		    lbuf_srp->srp_c_obj_cnt);
500 		/* Sanity check for memtag */
501 		ASSERT(kbft_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
502 	} else {
503 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
504 		ASSERT(kbft_srp->srp_r_obj_size == 0);
505 	}
506 
507 	/* configure user buflet region */
508 	srp = &srp_array[SKMEM_REGION_UBFT];
509 	if (config_buflet && !kernel_only) {
510 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
511 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
512 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
513 		skmem_region_params_config(srp);
514 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
515 	} else {
516 		ASSERT(srp->srp_r_obj_cnt == 0);
517 		ASSERT(srp->srp_r_obj_size == 0);
518 	}
519 
520 	/* make sure each metadata can be paired with a buffer */
521 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
522 }
523 
/*
 * Construct the kernel (and mirrored user) metadata for one quantum/packet
 * object, attaching `bufcnt' buffers to it.
 *
 * When `raw' is set the object comes straight from the slab, so the
 * auxiliary __packet_opt/__flow/__packet_compl structures are freshly
 * allocated; otherwise the object is being re-constructed and the
 * previously attached structures (and pkt_pflags) are reused.
 *
 * For pools with buffer-on-demand, buffers arrive as pre-constructed
 * buflets consumed from *blist (the caller's batch-allocated list);
 * otherwise a single buffer is allocated here and attached to the
 * quantum's native buflet.
 *
 * Returns 0 on success, ENOMEM if a buffer/buflet could not be obtained
 * (partially constructed state is torn down via pp_metadata_destruct()).
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets only make sense with buffer-on-demand pools */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
	struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
	struct __packet_opt *__single opt;
	struct __flow *__single flow;
	struct __packet_compl *__single compl;
	uint64_t pflags;

	if (raw) {
		/* fresh slab object: allocate the auxiliary structures */
		opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
		flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
		compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
		pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
		    PKT_F_TX_COMPL_ALLOC);
	} else {
		/* re-construction: the structures must still be attached */
		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		opt = kpkt->pkt_com_opt;
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		flow = kpkt->pkt_flow;
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);
		compl = kpkt->pkt_tx_compl;
		pflags = kpkt->pkt_pflags;
	}
	/* will be adjusted below as part of allocating buffer(s) */
	static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
	static_assert(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
	pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
	pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

	/* kernel (and user) packet */
	KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
	    upkt, pp, 0, pp->pp_max_frags, 0);

	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp %p", SK_KVA(pp));
				goto fail;
			}

#if HAS_MTE && CONFIG_KERNEL_TAGGING
			if (__probable(is_mte_enabled)) {
				/* Checking to ensure the object address is tagged */
				ASSERT((vm_offset_t)kbuf !=
				    vm_memtag_canonicalize_kernel((vm_offset_t)kbuf));
			}
#endif /* HAS_MTE && CONFIG_KERNEL_TAGGING */

			/* unlink the head buflet from the caller's list */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			/* chain it onto the previous buflet */
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp %p pkt %p bufcnt %d buf %p",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* tear down whatever was attached so far */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
641 
/*
 * Common slab constructor for pool metadata objects.
 *
 * oi0 describes the kernel metadata object; oim0, when non-NULL,
 * describes the mirrored user metadata object.  `no_buflet' selects
 * whether buffers are attached at construction time (0 buflets vs.
 * pp_max_frags).  Returns 0 on success or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *__single blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		/* oi0 lives inside the object being constructed: copy out */
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets pp_metadata_construct() did not consume */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
725 
/*
 * Slab constructor variant: construct metadata with no buflets attached.
 */
static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
}
732 
/*
 * Slab constructor variant: construct metadata with pp_max_frags buflets.
 */
static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
}
739 
740 __attribute__((always_inline))
741 static void
pp_metadata_destruct_common(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocache_large)742 pp_metadata_destruct_common(struct __kern_quantum *kqum,
743     struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
744     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
745     struct skmem_obj **blist_nocache_large)
746 {
747 	struct __kern_buflet *kbuf, *nbuf;
748 	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
749 	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
750 	struct skmem_obj **pp_blist_def = &p_blist_def;
751 	struct skmem_obj **pp_blist_large = &p_blist_large;
752 	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
753 	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
754 	uint16_t bufcnt, i = 0;
755 	bool first_buflet_empty;
756 
757 	ASSERT(blist_def != NULL);
758 	ASSERT(blist_large != NULL);
759 
760 	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
761 
762 	ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
763 	ASSERT(kpkt->pkt_qum.qum_pp == pp);
764 	ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
765 	ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
766 	ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
767 	ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
768 	ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
769 	ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
770 	static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
771 	bufcnt = kpkt->pkt_bufs_cnt;
772 	kbuf = &kqum->qum_buf[0];
773 	/*
774 	 * special handling for empty first buflet.
775 	 */
776 	first_buflet_empty = (kbuf->buf_addr == 0);
777 	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
778 
779 	/*
780 	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
781 	 * unsafe, so we forge it here.
782 	 */
783 	nbuf = __unsafe_forge_single(struct __kern_buflet *,
784 	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
785 	BUF_NBFT_ADDR(kbuf, 0);
786 	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
787 	if (!first_buflet_empty) {
788 		pp_free_buflet_common(pp, kbuf);
789 		++i;
790 	}
791 
792 	while (nbuf != NULL) {
793 		ASSERT(nbuf->buf_ctl != NULL);
794 		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
			/*
			 * A bc_usecnt larger than 1 means the buffer has been
			 * cloned and is still being used by other buflets. In
			 * this case, when we free this buflet we must
			 * explicitly ask for it not to be cached again in the
			 * magazine layer, to prevent immediate reuse of the
			 * buffer and data corruption.
			 */
802 			if (nbuf->buf_ctl->bc_usecnt > 1) {
803 				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
804 				pp_blist_nocache_large =
805 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
806 			} else {
807 				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
808 				pp_blist_large =
809 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
810 			}
811 		} else {
812 			if (nbuf->buf_ctl->bc_usecnt > 1) {
813 				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
814 				pp_blist_nocache_def =
815 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
816 			} else {
817 				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
818 				pp_blist_def =
819 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
820 			}
821 		}
822 		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
823 		nbuf = __unsafe_forge_single(struct __kern_buflet *,
824 		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
825 		++i;
826 	}
827 
828 	ASSERT(i == bufcnt);
829 
830 	if (p_blist_def != NULL) {
831 		*pp_blist_def = *blist_def;
832 		*blist_def = p_blist_def;
833 	}
834 	if (p_blist_large != NULL) {
835 		*pp_blist_large = *blist_large;
836 		*blist_large = p_blist_large;
837 	}
838 	if (p_blist_nocache_def != NULL) {
839 		*pp_blist_nocache_def = *blist_nocache_def;
840 		*blist_nocache_def = p_blist_nocache_def;
841 	}
842 	if (p_blist_nocache_large != NULL) {
843 		*pp_blist_nocache_large = *blist_nocache_large;
844 		*blist_nocache_large = p_blist_nocache_large;
845 	}
846 
847 	/* if we're about to return this object to the slab, clean it up */
848 	if (raw) {
849 		ASSERT(kpkt->pkt_com_opt != NULL ||
850 		    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
851 		if (kpkt->pkt_com_opt != NULL) {
852 			ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
853 			skmem_cache_free(pp_opt_cache,
854 			    kpkt->pkt_com_opt);
855 			kpkt->pkt_com_opt = NULL;
856 		}
857 		ASSERT(kpkt->pkt_flow != NULL ||
858 		    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
859 		if (kpkt->pkt_flow != NULL) {
860 			ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
861 			skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
862 			kpkt->pkt_flow = NULL;
863 		}
864 		ASSERT(kpkt->pkt_tx_compl != NULL ||
865 		    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
866 		if (kpkt->pkt_tx_compl != NULL) {
867 			ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
868 			skmem_cache_free(pp_compl_cache,
869 			    kpkt->pkt_tx_compl);
870 			kpkt->pkt_tx_compl = NULL;
871 		}
872 		kpkt->pkt_pflags = 0;
873 	}
874 }
875 
876 __attribute__((always_inline))
877 static void
pp_free_kbft_list(struct kern_pbufpool * pp,struct skmem_obj * blist_def,struct skmem_obj * blist_nocache_def,struct skmem_obj * blist_large,struct skmem_obj * blist_nocache_large)878 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
879     struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
880 {
881 	if (blist_def != NULL) {
882 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
883 	}
884 	if (blist_large != NULL) {
885 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
886 	}
887 	if (blist_nocache_def != NULL) {
888 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
889 	}
890 	if (blist_nocache_large != NULL) {
891 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
892 	}
893 }
894 
895 __attribute__((always_inline))
896 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)897 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
898     bool raw)
899 {
900 	struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
901 	struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
902 
903 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
904 	    &blist_large, &blist_nocache_large);
905 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
906 }
907 
908 static void
pp_metadata_dtor(void * addr,void * arg)909 pp_metadata_dtor(void *addr, void *arg)
910 {
911 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
912 	    METADATA_PREAMBLE_SZ), arg, TRUE);
913 }
914 
915 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)916 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
917 {
918 	struct kern_pbufpool *__single pp = arg;
919 
920 	if (pp->pp_pbuf_seg_ctor != NULL) {
921 		pp->pp_pbuf_seg_ctor(pp, sg, md);
922 	}
923 }
924 
925 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)926 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
927 {
928 	struct kern_pbufpool *__single pp = arg;
929 
930 	if (pp->pp_pbuf_seg_dtor != NULL) {
931 		pp->pp_pbuf_seg_dtor(pp, sg, md);
932 	}
933 }
934 
935 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large)936 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
937     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
938 {
939 #pragma unused (skmflag)
940 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
941 	struct __kern_buflet *kbft;
942 	struct __user_buflet *ubft;
943 	struct skmem_obj_info oib;
944 	mach_vm_address_t baddr;
945 	obj_idx_t oi_idx_reg;
946 
947 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
948 	if (__improbable(baddr == 0)) {
949 		return ENOMEM;
950 	}
951 	/*
952 	 * Note that oi0 and oim0 may be stored inside the object itself;
953 	 * so copy what is required to local variables before constructing.
954 	 */
955 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
956 	kbft = SKMEM_OBJ_ADDR(oi0);
957 
958 	if (__probable(!PP_KERNEL_ONLY(pp))) {
959 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
960 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
961 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
962 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
963 		ubft = SKMEM_OBJ_ADDR(oim0);
964 	} else {
965 		ASSERT(oim0 == NULL);
966 		ubft = NULL;
967 	}
968 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
969 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
970 	return 0;
971 }
972 
973 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)974 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
975     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
976 {
977 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
978 }
979 
980 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)981 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
982     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
983 {
984 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
985 }
986 
/*
 * Cache-layer destructor for on-demand (external) buflets.  Detaches the
 * backing buffer from the buflet and frees the buffer itself only when
 * no other cloned buflet still references it.
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *__single kbft = addr;
	void *objaddr = kbft->buf_objaddr;
	struct kern_pbufpool *__single pp = arg;
	uint32_t usecnt = 0;
	bool large = BUFLET_HAS_LARGE_BUF(kbft);

	/* only external buflets are managed by this destructor */
	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);
	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	/* detach the buffer; usecnt receives the remaining reference count */
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	/*
	 * usecnt > 0 means a clone still holds the buffer; the last
	 * holder's dtor will perform the actual free.
	 */
	if (__probable(usecnt == 0)) {
		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
		    PP_BUF_CACHE_DEF(pp), objaddr);
	}
}
1020 
1021 /*
1022  * -fbounds-safety: all callers of pp_create use srp_array with a known size:
1023  * SKMEM_REGIONS. This is same as marking it __counted_by(SKMEM_REGIONS)
1024  */
1025 struct kern_pbufpool *
pp_create(const char * name,struct skmem_region_params srp_array[SKMEM_REGIONS],pbuf_seg_ctor_fn_t buf_seg_ctor,pbuf_seg_dtor_fn_t buf_seg_dtor,const void * ctx,pbuf_ctx_retain_fn_t ctx_retain,pbuf_ctx_release_fn_t ctx_release,uint32_t ppcreatef)1026 pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
1027     pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
1028     const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
1029     pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
1030 {
1031 	struct kern_pbufpool *pp = NULL;
1032 	uint32_t md_size, def_buf_obj_size;
1033 	uint32_t def_buf_size, large_buf_size;
1034 	nexus_meta_type_t md_type;
1035 	nexus_meta_subtype_t md_subtype;
1036 	uint32_t md_cflags;
1037 	uint16_t max_frags;
1038 	uint32_t buf_def_cflags;
1039 	char cname[64];
1040 	const char *__null_terminated cache_name = NULL;
1041 	struct skmem_region_params *kmd_srp;
1042 	struct skmem_region_params *buf_srp;
1043 	struct skmem_region_params *kbft_srp;
1044 	struct skmem_region_params *umd_srp = NULL;
1045 	struct skmem_region_params *ubft_srp = NULL;
1046 	struct skmem_region_params *lbuf_srp = NULL;
1047 
1048 	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
1049 	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
1050 	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));
1051 
1052 	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
1053 	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
1054 	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));
1055 
1056 	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
1057 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
1058 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
1059 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
1060 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
1061 	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
1062 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
1063 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
1064 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
1065 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
1066 	} else {
1067 		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
1068 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
1069 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
1070 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
1071 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
1072 	}
1073 
1074 	VERIFY(kmd_srp->srp_c_obj_size != 0);
1075 	VERIFY(buf_srp->srp_c_obj_cnt != 0);
1076 	VERIFY(buf_srp->srp_c_obj_size != 0);
1077 
1078 	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
1079 		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
1080 		VERIFY(kbft_srp->srp_c_obj_size != 0);
1081 	} else {
1082 		kbft_srp = NULL;
1083 	}
1084 
1085 	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
1086 		umd_srp = &srp_array[SKMEM_REGION_UMD];
1087 		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
1088 		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
1089 		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
1090 		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
1091 		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
1092 		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
1093 		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
1094 		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
1095 		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
1096 		if (kbft_srp != NULL) {
1097 			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
1098 			ASSERT(ubft_srp->srp_c_obj_size ==
1099 			    kbft_srp->srp_c_obj_size);
1100 			ASSERT(ubft_srp->srp_c_obj_cnt ==
1101 			    kbft_srp->srp_c_obj_cnt);
1102 			ASSERT(ubft_srp->srp_c_seg_size ==
1103 			    kbft_srp->srp_c_seg_size);
1104 			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
1105 		}
1106 	}
1107 
1108 	md_size = kmd_srp->srp_r_obj_size;
1109 	md_type = kmd_srp->srp_md_type;
1110 	md_subtype = kmd_srp->srp_md_subtype;
1111 	max_frags = kmd_srp->srp_max_frags;
1112 	def_buf_obj_size = buf_srp->srp_c_obj_size;
1113 	def_buf_size = def_buf_obj_size;
1114 	large_buf_size = lbuf_srp->srp_c_obj_size;
1115 
1116 #if (DEBUG || DEVELOPMENT)
1117 	ASSERT(def_buf_obj_size != 0);
1118 	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
1119 	    md_type <= NEXUS_META_TYPE_MAX);
1120 	ASSERT(max_frags >= 1);
1121 	ASSERT(md_type == NEXUS_META_TYPE_PACKET);
1122 	ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
1123 	    NX_METADATA_PACKET_SZ(max_frags)));
1124 	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
1125 	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
1126 #endif /* DEBUG || DEVELOPMENT */
1127 
1128 	pp = pp_alloc(Z_WAITOK);
1129 
1130 	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
1131 	    "skywalk.pp.%s", name);
1132 
1133 	pp->pp_ctx = __DECONST(void *, ctx);
1134 	pp->pp_ctx_retain = ctx_retain;
1135 	pp->pp_ctx_release = ctx_release;
1136 	if (pp->pp_ctx != NULL) {
1137 		pp->pp_ctx_retain(pp->pp_ctx);
1138 	}
1139 
1140 	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
1141 	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
1142 	PP_BUF_SIZE_DEF(pp) = def_buf_size;
1143 	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
1144 	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
1145 	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
1146 	pp->pp_md_type = md_type;
1147 	pp->pp_md_subtype = md_subtype;
1148 	pp->pp_max_frags = max_frags;
1149 	if (ppcreatef & PPCREATEF_EXTERNAL) {
1150 		pp->pp_flags |= PPF_EXTERNAL;
1151 	}
1152 	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
1153 		pp->pp_flags |= PPF_TRUNCATED_BUF;
1154 	}
1155 	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
1156 		pp->pp_flags |= PPF_KERNEL;
1157 	}
1158 	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
1159 		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
1160 	}
1161 	if (ppcreatef & PPCREATEF_DYNAMIC) {
1162 		pp->pp_flags |= PPF_DYNAMIC;
1163 	}
1164 	if (lbuf_srp->srp_c_obj_cnt > 0) {
1165 		ASSERT(lbuf_srp->srp_c_obj_size != 0);
1166 		pp->pp_flags |= PPF_LARGE_BUF;
1167 	}
1168 
1169 	pp_retain(pp);
1170 
1171 	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
1172 	    SKMEM_CR_NOMAGAZINES : 0);
1173 	md_cflags |= SKMEM_CR_BATCH;
1174 	pp->pp_flags |= PPF_BATCH;
1175 
1176 	if (pp->pp_flags & PPF_DYNAMIC) {
1177 		md_cflags |= SKMEM_CR_DYNAMIC;
1178 	}
1179 
1180 	if (umd_srp != NULL && (pp->pp_umd_region =
1181 	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
1182 		SK_ERR("\"%s\" (%p) failed to create %s region",
1183 		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
1184 		goto failed;
1185 	}
1186 
1187 	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
1188 	    NULL)) == NULL) {
1189 		SK_ERR("\"%s\" (%p) failed to create %s region",
1190 		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
1191 		goto failed;
1192 	}
1193 
1194 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1195 		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
1196 		if (!PP_KERNEL_ONLY(pp)) {
1197 			VERIFY((ubft_srp != NULL) &&
1198 			    (ubft_srp->srp_c_obj_cnt > 0));
1199 		}
1200 	}
1201 	/*
1202 	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
1203 	 * attribute must match.
1204 	 */
1205 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1206 		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
1207 		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
1208 		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
1209 		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
1210 	}
1211 
1212 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
1213 		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
1214 		    NULL, NULL, NULL)) == NULL) {
1215 			SK_ERR("\"%s\" (%p) failed to create %s region",
1216 			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
1217 			goto failed;
1218 		}
1219 	}
1220 
1221 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1222 		if ((pp->pp_kbft_region = skmem_region_create(name,
1223 		    kbft_srp, NULL, NULL, NULL)) == NULL) {
1224 			SK_ERR("\"%s\" (%p) failed to create %s region",
1225 			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
1226 			goto failed;
1227 		}
1228 	}
1229 
1230 	if (!PP_KERNEL_ONLY(pp)) {
1231 		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
1232 	}
1233 	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
1234 		ASSERT(pp->pp_kbft_region != NULL);
1235 		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
1236 	}
1237 
1238 	/*
1239 	 * Create the metadata cache; magazines layer is determined by caller.
1240 	 */
1241 	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
1242 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1243 		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
1244 		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
1245 		    pp->pp_kmd_region, md_cflags);
1246 	} else {
1247 		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
1248 		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
1249 		    pp->pp_kmd_region, md_cflags);
1250 	}
1251 
1252 	if (pp->pp_kmd_cache == NULL) {
1253 		SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1254 		    pp->pp_name, SK_KVA(pp), cname);
1255 		goto failed;
1256 	}
1257 
1258 	/*
1259 	 * Create the buflet metadata cache
1260 	 */
1261 	if (pp->pp_kbft_region != NULL) {
1262 		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
1263 		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
1264 		    kbft_srp->srp_c_obj_size, 0,
1265 		    pp_buflet_default_buffer_metadata_ctor,
1266 		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
1267 		    md_cflags);
1268 
1269 		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
1270 			SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1271 			    pp->pp_name, SK_KVA(pp), cname);
1272 			goto failed;
1273 		}
1274 
1275 		if (PP_HAS_LARGE_BUF(pp)) {
1276 			/* Aggressive memory reclaim flag set to kbft_large for now */
1277 			md_cflags |= SKMEM_CR_RECLAIM;
1278 			cache_name = tsnprintf(cname, sizeof(cname),
1279 			    "kbft_large.%s", name);
1280 			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
1281 			    kbft_srp->srp_c_obj_size, 0,
1282 			    pp_buflet_large_buffer_metadata_ctor,
1283 			    pp_buflet_metadata_dtor,
1284 			    NULL, pp, pp->pp_kbft_region, md_cflags);
1285 
1286 			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
1287 				SK_ERR("\"%s\" (%p) failed to "
1288 				    "create \"%s\" cache", pp->pp_name,
1289 				    SK_KVA(pp), cname);
1290 				goto failed;
1291 			}
1292 		}
1293 	}
1294 
1295 	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
1296 	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
1297 		SK_ERR("\"%s\" (%p) failed to create %s region",
1298 		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
1299 		goto failed;
1300 	}
1301 
1302 	if (PP_HAS_LARGE_BUF(pp)) {
1303 		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
1304 		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
1305 		if (PP_BUF_REGION_LARGE(pp) == NULL) {
1306 			SK_ERR("\"%s\" (%p) failed to create %s region",
1307 			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
1308 			goto failed;
1309 		}
1310 	}
1311 
1312 	/*
1313 	 * Create the buffer object cache without the magazines layer.
1314 	 * We rely on caching the constructed metadata object instead.
1315 	 */
1316 	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
1317 	buf_def_cflags = buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES ? SKMEM_CR_NOMAGAZINES : 0;
1318 	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
1319 	    def_buf_obj_size,
1320 	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
1321 	    buf_def_cflags)) == NULL) {
1322 		SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1323 		    pp->pp_name, SK_KVA(pp), cname);
1324 		goto failed;
1325 	}
1326 
1327 	if (PP_BUF_REGION_LARGE(pp) != NULL) {
1328 		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
1329 		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
1330 		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
1331 		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
1332 			SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1333 			    pp->pp_name, SK_KVA(pp), cname);
1334 			goto failed;
1335 		}
1336 	}
1337 
1338 	return pp;
1339 
1340 failed:
1341 	if (pp != NULL) {
1342 		if (pp->pp_ctx != NULL) {
1343 			pp->pp_ctx_release(pp->pp_ctx);
1344 			pp->pp_ctx = NULL;
1345 		}
1346 		pp_close(pp);
1347 	}
1348 
1349 	return NULL;
1350 }
1351 
/*
 * Tear down a packet buffer pool: purge the user packet/buflet tracking
 * tables, then destroy the metadata/buflet caches and release their
 * regions.  The buffer caches/regions are destroyed LAST (see comment
 * below).  Caller must hold the pool lock.
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	/* metadata cache before its backing regions */
	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	/* buflet caches before their backing regions */
	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* drop the context reference taken at creation */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1427 
1428 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1429 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1430 {
1431 	int i, err = 0;
1432 
1433 	if (pp->pp_u_hash_table != NULL) {
1434 		goto done;
1435 	}
1436 
1437 	/* allocated-address hash table */
1438 	/*
1439 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1440 	 * if we see any performance hit, we can check if this caused it.
1441 	 */
1442 	if (can_block) {
1443 		pp->pp_u_hash_table = sk_alloc_type_array(
1444 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1445 			Z_WAITOK, skmem_tag_pbufpool_hash);
1446 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1447 	} else {
1448 		pp->pp_u_hash_table = sk_alloc_type_array(
1449 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1450 			Z_NOWAIT, skmem_tag_pbufpool_hash);
1451 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1452 	}
1453 	if (pp->pp_u_hash_table == NULL) {
1454 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1455 		err = ENOMEM;
1456 		goto done;
1457 	}
1458 
1459 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1460 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1461 	}
1462 done:
1463 	return err;
1464 }
1465 
1466 static void
pp_destroy_upp_locked(struct kern_pbufpool * pp)1467 pp_destroy_upp_locked(struct kern_pbufpool *pp)
1468 {
1469 	PP_LOCK_ASSERT_HELD(pp);
1470 	if (pp->pp_u_hash_table != NULL) {
1471 		/* purge anything that's left */
1472 		pp_purge_upp_locked(pp, -1);
1473 
1474 #if (DEBUG || DEVELOPMENT)
1475 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1476 			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
1477 		}
1478 #endif /* DEBUG || DEVELOPMENT */
1479 
1480 		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
1481 		    pp->pp_u_hash_table_size,
1482 		    pp->pp_u_hash_table);
1483 	}
1484 	ASSERT(pp->pp_u_bufinuse == 0);
1485 }
1486 
1487 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1488 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1489 {
1490 	int err = 0;
1491 
1492 	PP_LOCK(pp);
1493 	err = pp_init_upp_locked(pp, can_block);
1494 	if (err) {
1495 		SK_ERR("packet UPP init failed (%d)", err);
1496 		goto done;
1497 	}
1498 	err = pp_init_upp_bft_locked(pp, can_block);
1499 	if (err) {
1500 		SK_ERR("buflet UPP init failed (%d)", err);
1501 		pp_destroy_upp_locked(pp);
1502 		goto done;
1503 	}
1504 	pp_retain_locked(pp);
1505 done:
1506 	PP_UNLOCK(pp);
1507 	return err;
1508 }
1509 
1510 __attribute__((always_inline))
1511 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1512 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1513     struct __kern_buflet *kbft, pid_t pid)
1514 {
1515 	struct kern_pbufpool_u_bft_bkt *bkt;
1516 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1517 
1518 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1519 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1520 	kbe->kbe_buf_pid = pid;
1521 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1522 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1523 	pp->pp_u_bftinuse++;
1524 }
1525 
1526 __attribute__((always_inline))
1527 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1528 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1529     struct __kern_buflet *kbft, pid_t pid)
1530 {
1531 	while (kbft != NULL) {
1532 		pp_insert_upp_bft_locked(pp, kbft, pid);
1533 		kbft = __unsafe_forge_single(struct __kern_buflet *,
1534 		    __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1535 	}
1536 }
1537 
1538 /* Also inserts the attached chain of buflets */
1539 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1540 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1541     pid_t pid)
1542 {
1543 	struct kern_pbufpool_u_bkt *bkt;
1544 	struct __kern_buflet *kbft;
1545 
1546 	ASSERT(kqum->qum_pid == (pid_t)-1);
1547 	kqum->qum_pid = pid;
1548 
1549 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1550 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1551 	pp->pp_u_bufinuse++;
1552 
1553 	kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
1554 	if (kbft != NULL) {
1555 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1556 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1557 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1558 	}
1559 }
1560 
/*
 * Track a quantum as owned by "pid".
 * NOTE(review): the "_locked" suffix implies the caller holds the pool
 * lock (cf. pp_insert_upp below, which takes it) — callers are outside
 * this view, so confirm before adding a lock assertion here.
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1567 
/*
 * Serialized variant of pp_insert_upp_locked(): takes the pool lock
 * around the insertion.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1575 
1576 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * __counted_by (num)array,uint32_t num)1577 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1578     uint64_t *__counted_by(num)array, uint32_t num)
1579 {
1580 	uint32_t i = 0;
1581 
1582 	ASSERT(array != NULL && num > 0);
1583 	PP_LOCK(pp);
1584 	while (i < num) {
1585 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1586 
1587 		ASSERT(kqum != NULL);
1588 		pp_insert_upp_common(pp, kqum, pid);
1589 		++i;
1590 	}
1591 	PP_UNLOCK(pp);
1592 }
1593 
/*
 * Look up the buflet with region index "bft_idx" in the upp buflet hash
 * table and, if found, unlink it from its bucket and clear its ownership.
 * Returns the buflet, or NULL if no buflet with that index is tracked —
 * this relies on SLIST_FOREACH_SAFE leaving the iteration cursor NULL
 * when the loop terminates without hitting the break.
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* no longer owned by any process */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1615 
1616 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1617 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1618 {
1619 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1620 
1621 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1622 	return kbft;
1623 }
1624 
/*
 * Reclaim the user-declared buflet chain of a packet coming back from
 * userspace: walk the chain indices advertised by the user metadata,
 * pull each buflet out of the upp table, and relink them onto the kernel
 * packet, updating pkt_bufs_cnt to the number actually found.  Returns 0
 * on success, or ERANGE when the user-supplied chain is inconsistent
 * (count exceeds max_frags, an unallocated buflet index, or a chain that
 * terminates early/late versus the advertised count).
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first chained index comes from the (untrusted) user metadata */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no chained buflets; nothing to reclaim */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* validate the user-advertised buflet count before walking */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* buflet 0 counts only if it actually has a buffer attached */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* truncate the chain at the last valid buflet */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* relink the reclaimed buflet onto its predecessor */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
		/* next index again comes from the mirrored user buflet */
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel-side chain at the last buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the advertised count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1688 
1689 struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1690 pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1691 {
1692 	struct __kern_quantum *kqum, *tqum;
1693 	struct kern_pbufpool_u_bkt *bkt;
1694 
1695 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1696 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1697 		if (METADATA_IDX(kqum) == md_idx) {
1698 			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
1699 			    qum_upp_link);
1700 			kqum->qum_pid = (pid_t)-1;
1701 			ASSERT(pp->pp_u_bufinuse != 0);
1702 			pp->pp_u_bufinuse--;
1703 			break;
1704 		}
1705 	}
1706 	if (__probable(kqum != NULL)) {
1707 		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
1708 	} else {
1709 		*err = ERANGE;
1710 	}
1711 	return kqum;
1712 }
1713 
1714 struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1715 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1716 {
1717 	struct __kern_quantum *kqum;
1718 
1719 	PP_LOCK(pp);
1720 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1721 	PP_UNLOCK(pp);
1722 	return kqum;
1723 }
1724 
1725 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1726 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1727 {
1728 	struct __kern_quantum *kqum, *tqum;
1729 	struct kern_pbufpool_u_bkt *bkt;
1730 
1731 	PP_LOCK(pp);
1732 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1733 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1734 		if (METADATA_IDX(kqum) == md_idx) {
1735 			break;
1736 		}
1737 	}
1738 	PP_UNLOCK(pp);
1739 
1740 	return kqum;
1741 }
1742 
/*
 * Purge user-held packets from the upp hash table.  A pid of -1 purges
 * every entry; otherwise only entries owned by that pid are reclaimed.
 * Each reclaimed quantum has its buflet chain detached, its state reset
 * (pid, QUM_F_FINALIZED, ksd), and is returned to its owning pool.
 * Caller holds the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		/* SAFE variant: entries are unlinked mid-iteration */
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* reset ownership/state before handing back to pool */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1776 
/*
 * Purge user-held standalone buflets from the upp buflet hash table.
 * A pid of -1 purges every entry; otherwise only buflets owned by that
 * pid.  Each reclaimed buflet is unlinked, reset, and freed back to its
 * buflet cache via pp_free_buflet().  Caller holds the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		/* SAFE variant: entries are unlinked mid-iteration */
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1805 
/*
 * Purge all user-held packets and standalone buflets belonging to pid
 * (-1 means every owner) from this pool, under the pool lock.
 */
void
pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
{
	PP_LOCK(pp);
	pp_purge_upp_locked(pp, pid);
	pp_purge_upp_bft_locked(pp, pid);
	PP_UNLOCK(pp);
}
1814 
1815 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1816 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1817 {
1818 	int i, err = 0;
1819 
1820 	PP_LOCK_ASSERT_HELD(pp);
1821 	if (pp->pp_u_bft_hash_table != NULL) {
1822 		return 0;
1823 	}
1824 
1825 	/* allocated-address hash table */
1826 	/*
1827 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1828 	 * if we see any performance hit, we can check if this caused it.
1829 	 */
1830 	if (can_block) {
1831 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1832 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1833 			Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1834 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1835 	} else {
1836 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1837 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1838 			Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1839 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1840 	}
1841 	if (pp->pp_u_bft_hash_table == NULL) {
1842 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1843 		err = ENOMEM;
1844 		goto fail;
1845 	}
1846 
1847 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1848 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1849 	}
1850 
1851 fail:
1852 	return err;
1853 }
1854 
/*
 * Tear down the user packet pool's buflet hash table: purge any remaining
 * user-held buflets (pid -1 == all owners), verify the buckets are empty
 * on DEBUG/DEVELOPMENT builds, and free the table.  Caller holds the
 * pool lock.
 */
static void
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
		    pp->pp_u_bft_hash_table_size,
		    pp->pp_u_bft_hash_table);
	}
	ASSERT(pp->pp_u_bftinuse == 0);
}
1875 
/*
 * Insert a buflet into the user packet pool hash on behalf of pid,
 * under the pool lock (locked wrapper for pp_insert_upp_bft_locked).
 */
void
pp_insert_upp_bft(struct kern_pbufpool *pp,
    struct __kern_buflet *kbft, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_bft_locked(pp, kbft, pid);
	PP_UNLOCK(pp);
}
1884 
1885 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1886 pp_isempty_upp(struct kern_pbufpool *pp)
1887 {
1888 	boolean_t isempty;
1889 
1890 	PP_LOCK(pp);
1891 	isempty = (pp->pp_u_bufinuse == 0);
1892 	PP_UNLOCK(pp);
1893 
1894 	return isempty;
1895 }
1896 
/*
 * (Re)initialize a metadata object freshly taken from the pool's metadata
 * cache into a usable kernel quantum/packet: sanitize packet flags,
 * run KPKT_INIT, and initialize up to bufcnt kernel buflets.  When the
 * pool has buffer-on-demand, buflets are attached first via
 * pp_metadata_construct() using objects consumed from *blist.
 * Returns the initialized quantum, or NULL if buflet construction failed
 * (caller frees the remaining lists).
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the quantum lives immediately after the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools have no user shadow */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
	struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
	uint16_t i;

	/* sanitize flags */
	kpkt->pkt_pflags &= PKT_F_INIT_MASK;

	/* the pool-owned opt/flow/tx_compl sidecars must already be attached */
	ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
	    kpkt->pkt_com_opt != NULL);
	ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
	    kpkt->pkt_flow != NULL);
	ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
	    kpkt->pkt_tx_compl != NULL);

	/*
	 * XXX: For now we always set PKT_F_FLOW_DATA;
	 * this is a no-op but done for consistency
	 * with the other PKT_F_*_DATA flags.
	 */
	kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

	/* initialize kernel packet */
	KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

	ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		/* embedded buflet is unused; start at the first external one */
		ASSERT(kbuf->buf_ctl == NULL);
		ASSERT(kbuf->buf_addr == 0);
		/*
		 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
		 * which is unsafe, so we just forge it here.
		 */
		kbuf = __unsafe_forge_single(struct __kern_buflet *,
		    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
	}
	/* initialize kernel buflet */
	for (i = 0; i < bufcnt; i++) {
		ASSERT(kbuf != NULL);
		KBUF_INIT(kbuf);
		kbuf = __unsafe_forge_single(struct __kern_buflet *,
		    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
	}
	/* chain must end exactly at bufcnt (or never start when bufcnt==0) */
	ASSERT((kbuf == NULL) || (bufcnt == 0));

	return kqum;
}
1970 
/*
 * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
 * packet descriptor cache with no buffer attached and a buflet cache with
 * cpu layer caching enabled. While operating in this mode, we can call
 * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
 * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
 * descriptor with no attached buffer from the metadata cache.
 * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
 * from their respective caches and constructs the packet on behalf of the
 * caller.
 *
 * Returns the number of packets actually allocated and stored in array
 * (may be less than num on cache exhaustion or construction failure).
 * When tagged, each stored handle is SK_PTR_ENCODE'd with its metadata
 * type/subtype; cb (if non-NULL) is invoked per packet with its index.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
    alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	array_cp = array;
	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach head from the batch before initializing it */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* init failed: return unused buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if HAS_MTE && CONFIG_KERNEL_TAGGING
		if (__probable(is_mte_enabled)) {
			/* Checking to ensure the object address is tagged */
			ASSERT((vm_offset_t)kqum !=
			    vm_memtag_canonicalize_kernel((vm_offset_t)kqum));
		}
#endif /* HAS_MTE && CONFIG_KERNEL_TAGGING */

		if (tagged) {
			*array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array_cp = (uint64_t)kqum;
		}

		if (cb != NULL) {
			/* (num - need) is the index of this packet */
			(cb)(*array_cp, (num - need), ctx);
		}

		++array_cp;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	return num - need;
}
2059 
2060 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2061 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2062 {
2063 	uint64_t kpkt = 0;
2064 
2065 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2066 	    NULL, NULL, skmflag);
2067 
2068 	return kpkt;
2069 }
2070 
2071 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (* size)array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2072 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2073     uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2074     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2075 {
2076 	uint32_t i, n;
2077 	int err;
2078 
2079 	ASSERT(array != NULL && size > 0);
2080 
2081 	n = *size;
2082 	/*
2083 	 * -fbounds-safety: Originally there was this line here: *size = 0; but
2084 	 * we removed this because array is now __counted_by(*size), so *size =
2085 	 * 0 leads to brk 0x5519. Also, *size is set to i anyway.
2086 	 */
2087 
2088 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2089 	    cb, ctx, skmflag);
2090 	/*
2091 	 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2092 	 * extra careful when *size is updated, like below. Here, we know i will
2093 	 * be less than or equal to the original *size value, so updating *size
2094 	 * is okay.
2095 	 */
2096 	*size = i;
2097 
2098 	if (__probable(i == n)) {
2099 		err = 0;
2100 	} else if (i != 0) {
2101 		err = EAGAIN;
2102 	} else {
2103 		err = ENOMEM;
2104 	}
2105 
2106 	return err;
2107 }
2108 
/*
 * Batch-allocate up to num packets and enqueue them onto pktq.
 * Same construction path as pp_alloc_packet_common(), but results go on a
 * packet queue instead of a handle array.  Returns 0 when all num packets
 * were allocated, ENOMEM when none were, EAGAIN on a partial allocation
 * (the packets that were obtained remain enqueued).
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach head from the batch before initializing it */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* init failed: return unused buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if HAS_MTE && CONFIG_KERNEL_TAGGING
		if (__probable(is_mte_enabled)) {
			/* Checking to ensure the object address is tagged */
			ASSERT((vm_offset_t)kpkt !=
			    vm_memtag_canonicalize_kernel((vm_offset_t)kpkt));
		}
#endif /* HAS_MTE && CONFIG_KERNEL_TAGGING */

		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			/* (num - need) is the index of this packet */
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2188 
2189 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2190 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2191     uint32_t skmflag)
2192 {
2193 	uint32_t bufcnt = pp->pp_max_frags;
2194 	uint64_t kpkt = 0;
2195 
2196 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2197 		bufcnt =
2198 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2199 		ASSERT(bufcnt <= UINT16_MAX);
2200 	}
2201 
2202 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2203 	    NULL, NULL, skmflag);
2204 
2205 	return kpkt;
2206 }
2207 
2208 __attribute__((always_inline))
2209 static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum * kqum,struct kern_pbufpool * pp,struct mbuf ** mp,struct __kern_packet ** kpp,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocahce_large)2210 pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
2211     struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
2212     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
2213     struct skmem_obj **blist_nocahce_large)
2214 {
2215 	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);
2216 	ASSERT(SK_PTR_TAG(kqum) == 0);
2217 	struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);
2218 
2219 	if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
2220 		__packet_perform_tx_completion_callbacks(
2221 			SK_PKT2PH(kpkt), NULL);
2222 	}
2223 	if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
2224 		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
2225 		ASSERT(kpkt->pkt_mbuf != NULL);
2226 		ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
2227 		if (mp != NULL) {
2228 			ASSERT(*mp == NULL);
2229 			*mp = kpkt->pkt_mbuf;
2230 		} else {
2231 			m_freem(kpkt->pkt_mbuf);
2232 		}
2233 		KPKT_CLEAR_MBUF_DATA(kpkt);
2234 	} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
2235 		ASSERT(kpkt->pkt_pkt != NULL);
2236 		ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
2237 		if (kpp != NULL) {
2238 			ASSERT(*kpp == NULL);
2239 			*kpp = kpkt->pkt_pkt;
2240 		} else {
2241 			/* can only recurse once */
2242 			ASSERT((kpkt->pkt_pkt->pkt_pflags &
2243 			    PKT_F_PKT_DATA) == 0);
2244 			pp_free_packet_single(kpkt->pkt_pkt);
2245 		}
2246 		KPKT_CLEAR_PKT_DATA(kpkt);
2247 	}
2248 	kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
2249 	ASSERT(kpkt->pkt_nextpkt == NULL);
2250 	ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
2251 	ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
2252 	ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
2253 
2254 	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
2255 		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
2256 		    blist_large, blist_nocahce_large);
2257 	}
2258 	return mdp;
2259 }
2260 
/*
 * Free a linked chain of packets (pkt_nextpkt) back to their pool in one
 * batch.  All packets in the chain must belong to the same pool as the
 * head.  Attached mbufs and packets are collected into their own chains
 * and freed in bulk afterwards (attached packets recurse once through
 * this function).  If npkt is non-NULL it receives the number of packets
 * freed.
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		/* every packet in the chain must come from the same pool */
		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the finalized metadata to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance the mbuf/packet collection tails if fini attached one */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* attached packets: recurses once (fini asserts no deeper nesting) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2324 
2325 void
pp_free_pktq(struct pktq * pktq)2326 pp_free_pktq(struct pktq *pktq)
2327 {
2328 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2329 		return;
2330 	}
2331 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2332 	pp_free_packet_chain(pkt, NULL);
2333 	KPKTQ_DISPOSE(pktq);
2334 }
2335 
2336 void
pp_drop_pktq(struct pktq * pktq,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2337 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2338     drop_reason_t reason, const char *funcname, uint16_t linenum)
2339 {
2340 	drop_func_t dropfunc;
2341 	struct __kern_packet *kpkt;
2342 
2343 	if (KPKTQ_EMPTY(pktq)) {
2344 		return;
2345 	}
2346 	if (__probable(droptap_total_tap_count == 0)) {
2347 		goto nodroptap;
2348 	}
2349 
2350 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2351 		dropfunc = droptap_output_packet;
2352 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2353 		dropfunc = droptap_input_packet;
2354 	} else {
2355 		goto nodroptap;
2356 	}
2357 
2358 	KPKTQ_FOREACH(kpkt, pktq) {
2359 		dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2360 		    kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2361 	}
2362 
2363 nodroptap:
2364 	pp_free_pktq(pktq);
2365 }
2366 
/*
 * Free num packet handles from array back to pool pp in one batch.
 * Mirrors pp_free_packet_chain() but consumes a handle array: each entry
 * is finalized, its metadata linked onto a batch-free list, and the slot
 * zeroed.  Attached mbufs/packets collected by pp_metadata_fini() are
 * freed in bulk afterwards (attached packets recurse once through
 * pp_free_packet_chain()).
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp = NULL;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		/* every handle must belong to this pool */
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the finalized metadata to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance the mbuf/packet collection tails if fini attached one */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* attached packets: freed via the chain path (recurses once) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2424 
/*
 * Free a single packet handle (kernel quantum pointer as uint64_t)
 * back to pool pp.
 */
void
pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
{
	pp_free_packet_array(pp, &kqum, 1);
}
2430 
/*
 * Free `size` packet handles from array back to pool pp in one batch.
 */
void
pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
{
	pp_free_packet_array(pp, array, size);
}
2436 
/*
 * Free one unchained packet back to its owning pool (taken from
 * pkt_qum.qum_pp); asserts the packet is not linked into a chain.
 */
void
pp_free_packet_single(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_nextpkt == NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *,
	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
}
2444 
/*
 * Report a single packet to droptap (when a tap is active and flags carry
 * a direction), then free it back to its pool.
 *
 * NOTE(review): a zero-length packet returns early WITHOUT being freed —
 * unlike every other path here, which falls through to
 * pp_free_packet_single().  Confirm callers of this function own that
 * case; otherwise this looks like a packet leak.
 */
void
pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
    drop_reason_t reason, const char *funcname, uint16_t linenum)
{
	drop_func_t dropfunc;

	if (pkt->pkt_length == 0) {
		return;
	}
	/* fast path: no droptap attached anywhere */
	if (__probable(droptap_total_tap_count == 0)) {
		goto nodroptap;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		dropfunc = droptap_output_packet;
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		dropfunc = droptap_input_packet;
	} else {
		/* no direction flag: skip reporting, still free */
		goto nodroptap;
	}

	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);

nodroptap:
	pp_free_packet_single(pkt);
}
2472 
/*
 * Allocate one buffer from the pool's default or large buffer cache and
 * fill in its skmem object info (oi).  Returns the buffer address, or 0
 * on failure.  On DEVELOPMENT/DEBUG kernels, a configured MTBF interval
 * injects failures into non-blocking (SKMEM_NOSLEEP) allocations.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	/*
	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
	 * other (safe) pointer type, because IOSkywalkFamily depends on this
	 * being mach_vm_address_t
	 */
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			/* simulate failure: release what we just got */
			skmem_cache_free(skm,
			    __unsafe_forge_single(struct skmem_obj *, baddr));
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp %p",
		    SK_KVA(pp));
		return 0;
	}
	/* populate caller's object info for the returned address */
	skmem_cache_get_obj_info(skm,
	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2518 
2519 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2520 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2521     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2522 {
2523 	struct skmem_obj_info oib;
2524 
2525 	VERIFY(pp != NULL && baddr != NULL);
2526 	VERIFY((seg != NULL) == (idx != NULL));
2527 
2528 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2529 		return ENOTSUP;
2530 	}
2531 
2532 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2533 	if (__improbable(*baddr == 0)) {
2534 		return ENOMEM;
2535 	}
2536 
2537 	if (seg != NULL) {
2538 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2539 		*seg = SKMEM_OBJ_SEG(&oib);
2540 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2541 	}
2542 	return 0;
2543 }
2544 
/*
 * Return a buffer previously obtained via pp_alloc_buffer() to the
 * pool's default buffer cache.
 */
void
pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
{
	ASSERT(pp != NULL && addr != 0);
	skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
		    struct skmem_obj *, addr));
}
2552 
/*
 * Batch-allocate constructed buflets (with buffers attached) from the
 * pool's default or large buflet cache, initialize each via
 * KBUF_EXT_INIT, and store their handles into array.  Returns the number
 * actually allocated, which may be less than num on cache exhaustion.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp,
    uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
    bool large)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *__single list;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* large buflets require the pool to define a large buffer size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);

	if (large) {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
	} else {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
	}

	array_cp = array;
	while (list != NULL) {
		struct skmem_obj *listn;

		/* detach head from the batch before initializing it */
		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;

#if HAS_MTE && CONFIG_KERNEL_TAGGING
		if (__probable(is_mte_enabled)) {
			/* Checking to ensure the object address is tagged */
			ASSERT((vm_offset_t)kbft !=
			    vm_memtag_canonicalize_kernel((vm_offset_t)kbft));
		}
#endif /* HAS_MTE && CONFIG_KERNEL_TAGGING */

		KBUF_EXT_INIT(kbft, pp);
		*array_cp = (uint64_t)kbft;
		++array_cp;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2603 
2604 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,bool large)2605 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2606     bool large)
2607 {
2608 	uint64_t bft;
2609 
2610 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2611 		return ENOMEM;
2612 	}
2613 	*kbft = __unsafe_forge_single(kern_buflet_t, bft);
2614 	return 0;
2615 }
2616 
2617 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * __counted_by (* size)array,uint32_t * size,uint32_t skmflag,bool large)2618 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2619     uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2620     bool large)
2621 {
2622 	uint32_t i, n;
2623 	int err;
2624 
2625 	ASSERT(array != NULL && size > 0);
2626 
2627 	n = *size;
2628 	i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2629 	*size = i;
2630 
2631 	if (__probable(i == n)) {
2632 		err = 0;
2633 	} else if (i != 0) {
2634 		err = EAGAIN;
2635 	} else {
2636 		err = ENOMEM;
2637 	}
2638 
2639 	return err;
2640 }
2641 
/*
 * Free a single buflet back to its originating cache.  Handles both
 * externally-allocated buflets (BUFLET_FLAG_EXTERNAL) and pool-embedded
 * buflets; in either case the attached buffer is only released once its
 * shared use count drops to zero.
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	/* caller must have already unchained this buflet */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		/*
		 * If the underlying buffer is still shared (usecnt > 1),
		 * return only the buflet wrapper via the nocache path so the
		 * buffer itself stays alive; otherwise free normally.  The
		 * cache is chosen to match the buffer size class.
		 */
		if (kbft->buf_ctl->bc_usecnt > 1) {
			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		} else {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		}
	} else if (__probable(kbft->buf_addr != 0)) {
		/* pool-embedded buflet: detach and maybe free the buffer */
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* KBUF_DTOR clears the buflet and yields the prior usecnt */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		/* last reference gone: return the buffer to its cache */
		if (__probable(usecnt == 0)) {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2681 
2682 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2683 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2684 {
2685 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2686 	ASSERT(pp != NULL && kbft != NULL);
2687 	pp_free_buflet_common(pp, kbft);
2688 }
2689 
2690 void
pp_reap_caches(boolean_t purge)2691 pp_reap_caches(boolean_t purge)
2692 {
2693 	skmem_cache_reap_now(pp_opt_cache, purge);
2694 	skmem_cache_reap_now(pp_flow_cache, purge);
2695 	skmem_cache_reap_now(pp_compl_cache, purge);
2696 }
2697