xref: /xnu-11417.140.69/bsd/skywalk/packet/pbufpool.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33 
34 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
35 static void pp_free(struct kern_pbufpool *);
36 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
37     uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
38     const void *, uint32_t);
39 static void pp_free_packet_array(struct kern_pbufpool *,
40     uint64_t *__counted_by(num)array, uint32_t num);
41 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
42     struct skmem_obj_info *, void *, uint32_t);
43 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
44     struct skmem_obj_info *, void *, uint32_t);
45 static void pp_metadata_dtor(void *, void *);
46 static int pp_metadata_construct(struct __kern_quantum *,
47     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
48     uint16_t, bool, struct skmem_obj **);
49 static void pp_metadata_destruct(struct __kern_quantum *,
50     struct kern_pbufpool *, bool);
51 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
52     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
53 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
54     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
55     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
56 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
57 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
58 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_destroy_upp_locked(struct kern_pbufpool *);
60 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
61 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
62 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
63 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
64     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
65 static inline uint32_t
66 pp_alloc_buflet_common(struct kern_pbufpool *pp,
67     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
68     bool large);
69 
70 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
71 
72 #define KERN_BUF_MIN_STRIDING_SIZE      32 * 1024
73 static uint32_t kern_buf_min_striding_size = KERN_BUF_MIN_STRIDING_SIZE;
74 
75 /*
76  * Since the inputs are small (indices to the metadata region), we can use
77  * Knuth's multiplicative hash method which is fast and good enough.  Here
78  * we multiply the input by the golden ratio of 2^32.  See "The Art of
79  * Computer Programming", section 6.4.
80  */
81 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
82 	(((_i) * 2654435761U) & (_m))
83 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
84 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
85 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
86 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
87 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
88 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
89 
90 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
91 
92 #define SKMEM_TAG_PBUFPOOL_HASH  "com.apple.skywalk.pbufpool.hash"
93 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
94 
95 #define SKMEM_TAG_PBUFPOOL_BFT_HASH  "com.apple.skywalk.pbufpool.bft.hash"
96 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
97 
98 
99 struct kern_pbufpool_u_htbl {
100 	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
101 };
102 
103 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
104 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
105 
106 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
107 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
108 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
109 
110 static int __pp_inited = 0;
111 
112 int
pp_init(void)113 pp_init(void)
114 {
115 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
116 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
117 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
118 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
119 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
120 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
121 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
122 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
123 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
124 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
125 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
126 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
127 
128 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
129 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
130 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
131 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
132 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
133 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
134 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
135 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
136 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
137 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
138 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
139 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
140 
141 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
142 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
143 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
144 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
145 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
146 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
147 
148 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
149 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
150 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
151 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
152 
153 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
154 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
155 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
156 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
157 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
158 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
159 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
160 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
161 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
162 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
163 
164 	/*
165 	 * Assert that the value of common packet flags between mbuf and
166 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
167 	 */
168 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
169 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
170 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
171 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
172 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
173 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
174 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
175 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
176 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
177 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
178 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
179 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
180 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
181 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
182 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
183 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
184 	/*
185 	 * Assert packet flags shared with userland.
186 	 */
187 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
188 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
189 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
190 
191 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
192 	    offsetof(struct __kern_packet, pkt_length));
193 
194 	/*
195 	 * Due to the use of tagged pointer, we need the size of
196 	 * the metadata preamble structure to be multiples of 16.
197 	 * See SK_PTR_TAG() definition for details.
198 	 */
199 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
200 	    (sizeof(struct __metadata_preamble) % 16) == 0);
201 
202 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
203 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
204 
205 	/*
206 	 * Batch alloc/free requires linking the objects together;
207 	 * make sure that the fields are at the same offset since
208 	 * we cast the object to struct skmem_obj.
209 	 */
210 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
211 	    offsetof(struct skmem_obj, mo_next));
212 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
213 	    offsetof(struct skmem_obj, mo_next));
214 
215 	SK_LOCK_ASSERT_HELD();
216 	ASSERT(!__pp_inited);
217 
218 	pp_opt_cache = skmem_cache_create("pkt.opt",
219 	    sizeof(struct __packet_opt), sizeof(uint64_t),
220 	    NULL, NULL, NULL, NULL, NULL, 0);
221 	pp_flow_cache = skmem_cache_create("pkt.flow",
222 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
223 	    NULL, NULL, NULL, NULL, NULL, 0);
224 	pp_compl_cache = skmem_cache_create("pkt.compl",
225 	    sizeof(struct __packet_compl), sizeof(uint64_t),
226 	    NULL, NULL, NULL, NULL, NULL, 0);
227 
228 	PE_parse_boot_argn("sk_pp_min_striding_size", &kern_buf_min_striding_size,
229 	    sizeof(kern_buf_min_striding_size));
230 
231 	return 0;
232 }
233 
234 void
pp_fini(void)235 pp_fini(void)
236 {
237 	SK_LOCK_ASSERT_HELD();
238 
239 	if (__pp_inited) {
240 		if (pp_compl_cache != NULL) {
241 			skmem_cache_destroy(pp_compl_cache);
242 			pp_compl_cache = NULL;
243 		}
244 		if (pp_flow_cache != NULL) {
245 			skmem_cache_destroy(pp_flow_cache);
246 			pp_flow_cache = NULL;
247 		}
248 		if (pp_opt_cache != NULL) {
249 			skmem_cache_destroy(pp_opt_cache);
250 			pp_opt_cache = NULL;
251 		}
252 
253 		__pp_inited = 0;
254 	}
255 }
256 
257 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)258 pp_alloc(zalloc_flags_t how)
259 {
260 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
261 
262 	if (pp) {
263 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
264 	}
265 	return pp;
266 }
267 
/*
 * Final teardown of a pool once its last reference is dropped (called
 * from pp_release_locked() with pp_lock held).  Tears down pool state
 * via pp_destroy() (defined elsewhere), then releases the lock before
 * destroying the mutex and returning the descriptor to pp_zone.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	/* must drop pp_lock before the mutex is destroyed below */
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
280 
281 void
pp_retain_locked(struct kern_pbufpool * pp)282 pp_retain_locked(struct kern_pbufpool *pp)
283 {
284 	PP_LOCK_ASSERT_HELD(pp);
285 
286 	pp->pp_refcnt++;
287 	ASSERT(pp->pp_refcnt != 0);
288 }
289 
/*
 * Locking wrapper around pp_retain_locked(): acquires pp_lock for the
 * duration of the reference bump.
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
297 
298 boolean_t
pp_release_locked(struct kern_pbufpool * pp)299 pp_release_locked(struct kern_pbufpool *pp)
300 {
301 	uint32_t oldref = pp->pp_refcnt;
302 
303 	PP_LOCK_ASSERT_HELD(pp);
304 
305 	ASSERT(pp->pp_refcnt != 0);
306 	if (--pp->pp_refcnt == 0) {
307 		pp_free(pp);
308 	}
309 
310 	return oldref == 1;
311 }
312 
313 boolean_t
pp_release(struct kern_pbufpool * pp)314 pp_release(struct kern_pbufpool *pp)
315 {
316 	boolean_t lastref;
317 
318 	PP_LOCK(pp);
319 	if (!(lastref = pp_release_locked(pp))) {
320 		PP_UNLOCK(pp);
321 	}
322 
323 	return lastref;
324 }
325 
326 void
pp_close(struct kern_pbufpool * pp)327 pp_close(struct kern_pbufpool *pp)
328 {
329 	PP_LOCK(pp);
330 	ASSERT(pp->pp_refcnt > 0);
331 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
332 	pp->pp_flags |= PPF_CLOSED;
333 	if (!pp_release_locked(pp)) {
334 		PP_UNLOCK(pp);
335 	}
336 }
337 
338 /*
339  * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
340  * size for the srp_array. This is same as marking it __counted_by(SKMEM_REGIONS)
341  */
342 void
pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)343 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
344     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
345     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
346     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
347 {
348 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
349 	    *lbuf_srp;
350 	uint32_t md_size = 0;
351 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
352 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
353 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
354 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
355 	bool md_magazine_enable = ((flags &
356 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
357 
358 	ASSERT(max_frags != 0);
359 
360 	switch (md_type) {
361 	case NEXUS_META_TYPE_QUANTUM:
362 		md_size = NX_METADATA_QUANTUM_SZ;
363 		break;
364 	case NEXUS_META_TYPE_PACKET:
365 		md_size = NX_METADATA_PACKET_SZ(max_frags);
366 		break;
367 	default:
368 		VERIFY(0);
369 		/* NOTREACHED */
370 		__builtin_unreachable();
371 	}
372 
373 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
374 	case PP_REGION_CONFIG_BUF_IODIR_IN:
375 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
376 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
377 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
378 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
379 		break;
380 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
381 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
382 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
383 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
384 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
385 		break;
386 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
387 	default:
388 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
389 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
390 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
391 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
392 		break;
393 	}
394 
395 	/* add preamble size to metadata obj size */
396 	md_size += METADATA_PREAMBLE_SZ;
397 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
398 
399 	/* configure kernel metadata region */
400 	kmd_srp->srp_md_type = md_type;
401 	kmd_srp->srp_md_subtype = md_subtype;
402 	kmd_srp->srp_r_obj_cnt = md_cnt;
403 	kmd_srp->srp_r_obj_size = md_size;
404 	kmd_srp->srp_max_frags = max_frags;
405 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
406 	if (md_persistent) {
407 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
408 	}
409 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
410 	if (md_magazine_enable) {
411 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
412 	}
413 	skmem_region_params_config(kmd_srp);
414 
415 	/* Sanity check for memtag */
416 	ASSERT(kmd_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
417 
418 	/* configure user metadata region */
419 	srp = &srp_array[SKMEM_REGION_UMD];
420 	if (!kernel_only) {
421 		srp->srp_md_type = kmd_srp->srp_md_type;
422 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
423 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
424 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
425 		srp->srp_max_frags = kmd_srp->srp_max_frags;
426 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
427 		if (md_persistent) {
428 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
429 		}
430 		/*
431 		 * UMD is a mirrored region and object allocation operations
432 		 * are performed on the KMD objects.
433 		 */
434 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
435 		skmem_region_params_config(srp);
436 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
437 	} else {
438 		ASSERT(srp->srp_r_obj_cnt == 0);
439 		ASSERT(srp->srp_r_obj_size == 0);
440 	}
441 
442 	/* configure buffer region */
443 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
444 	buf_srp->srp_r_obj_size = buf_size;
445 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
446 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
447 	if (buf_persistent) {
448 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
449 	}
450 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
451 	if (buf_srp->srp_r_obj_size >= kern_buf_min_striding_size) {
452 		/*
453 		 * A buffer size larger than 32K indicates striding is in use, which
454 		 * means a buffer could be detached from a buflet. In this case, magzine
455 		 * layer should be enabled.
456 		 */
457 		buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
458 	}
459 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
460 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
461 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
462 	}
463 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
464 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
465 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
466 	}
467 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
468 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
469 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
470 	}
471 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
472 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
473 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
474 	}
475 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
476 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
477 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
478 	}
479 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
480 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
481 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
482 	}
483 	if (buf_seg_size != 0) {
484 		buf_srp->srp_r_seg_size = buf_seg_size;
485 	}
486 	skmem_region_params_config(buf_srp);
487 
488 	/* configure large buffer region */
489 	if (large_buf_size != 0) {
490 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
491 		lbuf_srp->srp_r_obj_size = large_buf_size;
492 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
493 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
494 		skmem_region_params_config(lbuf_srp);
495 	}
496 
497 	/* configure kernel buflet region */
498 	if (config_buflet) {
499 		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
500 		/*
501 		 * Ideally we want the number of buflets to be
502 		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
503 		 * so that we have enough buflets when multi-buflet and
504 		 * shared buffer object is used.
505 		 * Currently multi-buflet is being used only by user pool
506 		 * which doesn't support shared buffer object, hence to reduce
507 		 * the number of objects we are restricting the number of
508 		 * buflets to the number of buffers.
509 		 */
510 		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
511 		    lbuf_srp->srp_c_obj_cnt;
512 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
513 		    sizeof(struct __user_buflet));
514 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
515 		skmem_region_params_config(kbft_srp);
516 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
517 		    lbuf_srp->srp_c_obj_cnt);
518 		/* Sanity check for memtag */
519 		ASSERT(kbft_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
520 	} else {
521 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
522 		ASSERT(kbft_srp->srp_r_obj_size == 0);
523 	}
524 
525 	/* configure user buflet region */
526 	srp = &srp_array[SKMEM_REGION_UBFT];
527 	if (config_buflet && !kernel_only) {
528 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
529 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
530 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
531 		skmem_region_params_config(srp);
532 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
533 	} else {
534 		ASSERT(srp->srp_r_obj_cnt == 0);
535 		ASSERT(srp->srp_r_obj_size == 0);
536 	}
537 
538 	/* make sure each metadata can be paired with a buffer */
539 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
540 }
541 
/*
 * Construct a kernel quantum/packet (and mirror into the user metadata
 * `uqum' when non-NULL) at metadata index `midx', then attach `bufcnt'
 * buffers.  For buffer-on-demand pools, pre-constructed buflets (with
 * buffers already attached) are consumed from the caller-supplied
 * *blist; otherwise a single buffer is allocated here and attached to
 * the quantum's native buflet.
 *
 * `raw' distinguishes a freshly allocated metadata object (option/
 * flow/completion blobs must be allocated) from one being re-
 * constructed (the previously attached blobs are reused).
 *
 * Returns 0 on success, or ENOMEM after undoing any partial
 * construction via pp_metadata_destruct().
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets are only valid for buffer-on-demand pools */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *__single opt;
		struct __flow *__single flow;
		struct __packet_compl *__single compl;
		uint64_t pflags;

		if (raw) {
			/* fresh object: allocate companion structures */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reconstruct: reuse previously attached structures */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	/* attach bufcnt buffers, starting from the native buflet */
	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			/* pop the next pre-constructed buflet off *blist */
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}


			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			/* chain the new buflet after the previous one */
			KBUF_EXT_INIT(kbuf, pp);
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	ASSERT(bufcnt != 0 && baddr == 0);
	/* undo whatever was constructed before the failure */
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
665 
/*
 * Common skmem constructor callback for pool metadata objects.
 * `oi0' describes the kernel metadata object being constructed; `oim0'
 * (when non-NULL) describes the mirrored user object.  `no_buflet'
 * requests construction with zero buffers attached; otherwise up to
 * pp_max_frags buflets are batch-allocated (buffer-on-demand pools)
 * and handed to pp_metadata_construct().
 *
 * Returns 0 on success or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *__single blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		/* kernel-only pool: no mirrored user object */
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	if (__improbable(blist != NULL)) {
		/* return any unused pre-allocated buflets to their cache */
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
749 
750 static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)751 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
752     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
753 {
754 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
755 }
756 
757 static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)758 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
759     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
760 {
761 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
762 }
763 
764 __attribute__((always_inline))
765 static void
pp_metadata_destruct_common(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocache_large)766 pp_metadata_destruct_common(struct __kern_quantum *kqum,
767     struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
768     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
769     struct skmem_obj **blist_nocache_large)
770 {
771 	struct __kern_buflet *kbuf, *nbuf;
772 	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
773 	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
774 	struct skmem_obj **pp_blist_def = &p_blist_def;
775 	struct skmem_obj **pp_blist_large = &p_blist_large;
776 	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
777 	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
778 	uint16_t bufcnt, i = 0;
779 	bool first_buflet_empty;
780 
781 	ASSERT(blist_def != NULL);
782 	ASSERT(blist_large != NULL);
783 
784 	switch (pp->pp_md_type) {
785 	case NEXUS_META_TYPE_PACKET: {
786 		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
787 
788 		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
789 		ASSERT(kpkt->pkt_qum.qum_pp == pp);
790 		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
791 		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
792 		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
793 		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
794 		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
795 		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
796 		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
797 		bufcnt = kpkt->pkt_bufs_cnt;
798 		kbuf = &kqum->qum_buf[0];
799 		/*
800 		 * special handling for empty first buflet.
801 		 */
802 		first_buflet_empty = (kbuf->buf_addr == 0);
803 		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
804 		break;
805 	}
806 	default:
807 		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
808 		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
809 		ASSERT(kqum->qum_pp == pp);
810 		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
811 		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
812 		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
813 		ASSERT(kqum->qum_ksd == NULL);
814 		kbuf = &kqum->qum_buf[0];
815 		/*
816 		 * XXX: Special handling for quantum as we don't currently
817 		 * define bufs_{cnt,max} there.  Given that we support at
818 		 * most only 1 buflet for now, check if buf_addr is non-NULL.
819 		 * See related code in pp_metadata_construct().
820 		 */
821 		first_buflet_empty = (kbuf->buf_addr == 0);
822 		bufcnt = first_buflet_empty ? 0 : 1;
823 		break;
824 	}
825 
826 	/*
827 	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
828 	 * unsafe, so we forge it here.
829 	 */
830 	nbuf = __unsafe_forge_single(struct __kern_buflet *,
831 	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
832 	BUF_NBFT_ADDR(kbuf, 0);
833 	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
834 	if (!first_buflet_empty) {
835 		pp_free_buflet_common(pp, kbuf);
836 		++i;
837 	}
838 
839 	while (nbuf != NULL) {
840 		ASSERT(nbuf->buf_ctl != NULL);
841 		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
			/*
			 * bc_usecnt larger than 1 means the buffer has been cloned and
			 * is still being used by other buflets.  In this case, when we
			 * free this buflet we need to explicitly ask for it not to be
			 * cached again in the magazine layer, to prevent immediate
			 * reuse of the buffer and data corruption.
			 */
849 			if (nbuf->buf_ctl->bc_usecnt > 1) {
850 				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
851 				pp_blist_nocache_large =
852 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
853 			} else {
854 				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
855 				pp_blist_large =
856 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
857 			}
858 		} else {
859 			if (nbuf->buf_ctl->bc_usecnt > 1) {
860 				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
861 				pp_blist_nocache_def =
862 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
863 			} else {
864 				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
865 				pp_blist_def =
866 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
867 			}
868 		}
869 		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
870 		nbuf = __unsafe_forge_single(struct __kern_buflet *,
871 		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
872 		++i;
873 	}
874 
875 	ASSERT(i == bufcnt);
876 
877 	if (p_blist_def != NULL) {
878 		*pp_blist_def = *blist_def;
879 		*blist_def = p_blist_def;
880 	}
881 	if (p_blist_large != NULL) {
882 		*pp_blist_large = *blist_large;
883 		*blist_large = p_blist_large;
884 	}
885 	if (p_blist_nocache_def != NULL) {
886 		*pp_blist_nocache_def = *blist_nocache_def;
887 		*blist_nocache_def = p_blist_nocache_def;
888 	}
889 	if (p_blist_nocache_large != NULL) {
890 		*pp_blist_nocache_large = *blist_nocache_large;
891 		*blist_nocache_large = p_blist_nocache_large;
892 	}
893 
894 	/* if we're about to return this object to the slab, clean it up */
895 	if (raw) {
896 		switch (pp->pp_md_type) {
897 		case NEXUS_META_TYPE_PACKET: {
898 			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
899 
900 			ASSERT(kpkt->pkt_com_opt != NULL ||
901 			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
902 			if (kpkt->pkt_com_opt != NULL) {
903 				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
904 				skmem_cache_free(pp_opt_cache,
905 				    kpkt->pkt_com_opt);
906 				kpkt->pkt_com_opt = NULL;
907 			}
908 			ASSERT(kpkt->pkt_flow != NULL ||
909 			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
910 			if (kpkt->pkt_flow != NULL) {
911 				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
912 				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
913 				kpkt->pkt_flow = NULL;
914 			}
915 			ASSERT(kpkt->pkt_tx_compl != NULL ||
916 			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
917 			if (kpkt->pkt_tx_compl != NULL) {
918 				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
919 				skmem_cache_free(pp_compl_cache,
920 				    kpkt->pkt_tx_compl);
921 				kpkt->pkt_tx_compl = NULL;
922 			}
923 			kpkt->pkt_pflags = 0;
924 			break;
925 		}
926 		default:
927 			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
928 			/* nothing to do for quantum (yet) */
929 			break;
930 		}
931 	}
932 }
933 
934 __attribute__((always_inline))
935 static void
pp_free_kbft_list(struct kern_pbufpool * pp,struct skmem_obj * blist_def,struct skmem_obj * blist_nocache_def,struct skmem_obj * blist_large,struct skmem_obj * blist_nocache_large)936 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
937     struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
938 {
939 	if (blist_def != NULL) {
940 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
941 	}
942 	if (blist_large != NULL) {
943 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
944 	}
945 	if (blist_nocache_def != NULL) {
946 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
947 	}
948 	if (blist_nocache_large != NULL) {
949 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
950 	}
951 }
952 
953 __attribute__((always_inline))
954 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)955 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
956     bool raw)
957 {
958 	struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
959 	struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
960 
961 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
962 	    &blist_large, &blist_nocache_large);
963 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
964 }
965 
966 static void
pp_metadata_dtor(void * addr,void * arg)967 pp_metadata_dtor(void *addr, void *arg)
968 {
969 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
970 	    METADATA_PREAMBLE_SZ), arg, TRUE);
971 }
972 
973 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)974 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
975 {
976 	struct kern_pbufpool *__single pp = arg;
977 
978 	if (pp->pp_pbuf_seg_ctor != NULL) {
979 		pp->pp_pbuf_seg_ctor(pp, sg, md);
980 	}
981 }
982 
983 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)984 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
985 {
986 	struct kern_pbufpool *__single pp = arg;
987 
988 	if (pp->pp_pbuf_seg_dtor != NULL) {
989 		pp->pp_pbuf_seg_dtor(pp, sg, md);
990 	}
991 }
992 
993 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large)994 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
995     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
996 {
997 #pragma unused (skmflag)
998 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
999 	struct __kern_buflet *kbft;
1000 	struct __user_buflet *ubft;
1001 	struct skmem_obj_info oib;
1002 	mach_vm_address_t baddr;
1003 	obj_idx_t oi_idx_reg;
1004 
1005 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
1006 	if (__improbable(baddr == 0)) {
1007 		return ENOMEM;
1008 	}
1009 	/*
1010 	 * Note that oi0 and oim0 may be stored inside the object itself;
1011 	 * so copy what is required to local variables before constructing.
1012 	 */
1013 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
1014 	kbft = SKMEM_OBJ_ADDR(oi0);
1015 
1016 	if (__probable(!PP_KERNEL_ONLY(pp))) {
1017 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
1018 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
1019 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
1020 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
1021 		ubft = SKMEM_OBJ_ADDR(oim0);
1022 	} else {
1023 		ASSERT(oim0 == NULL);
1024 		ubft = NULL;
1025 	}
1026 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
1027 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
1028 	return 0;
1029 }
1030 
1031 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1032 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1033     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1034 {
1035 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
1036 }
1037 
1038 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1039 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1040     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1041 {
1042 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
1043 }
1044 
1045 static void
pp_buflet_metadata_dtor(void * addr,void * arg)1046 pp_buflet_metadata_dtor(void *addr, void *arg)
1047 {
1048 	struct __kern_buflet *__single kbft = addr;
1049 	void *objaddr = kbft->buf_objaddr;
1050 	struct kern_pbufpool *__single pp = arg;
1051 	uint32_t usecnt = 0;
1052 	bool large = BUFLET_HAS_LARGE_BUF(kbft);
1053 
1054 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1055 	/*
1056 	 * don't assert for (buf_nbft_addr == 0) here as constructed
1057 	 * buflet may have this field as non-zero. This is because
1058 	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
1059 	 * for chaining the buflets.
1060 	 * To ensure that the frred buflet was not part of a chain we
1061 	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
1062 	 */
1063 	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
1064 	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
1065 	    NULL);
1066 	ASSERT(kbft->buf_addr != 0);
1067 	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
1068 	ASSERT(kbft->buf_ctl != NULL);
1069 
1070 	KBUF_DTOR(kbft, usecnt);
1071 	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
1072 	    SK_KVA(objaddr), usecnt);
1073 	if (__probable(usecnt == 0)) {
1074 		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
1075 		    PP_BUF_CACHE_DEF(pp), objaddr);
1076 	}
1077 }
1078 
/*
 * -fbounds-safety: all callers of pp_create use srp_array with a known size:
 * SKMEM_REGIONS. This is same as marking it __counted_by(SKMEM_REGIONS)
 */
/*
 * Create and initialize a packet buffer pool from the given region
 * parameter array.  Selects the metadata/buffer/buflet regions (combined,
 * RX-only or TX-only), creates the backing skmem regions and object
 * caches, and mirrors kernel regions onto user regions for user-visible
 * pools.  Returns a referenced pool on success; NULL on failure (all
 * partially-created state is torn down via pp_close()).
 *
 * buf_seg_{ctor,dtor} must be both NULL or both non-NULL;
 * ctx/ctx_retain/ctx_release must be all NULL or all non-NULL.
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint32_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	uint32_t buf_def_cflags;
	char cname[64];
	const char *__null_terminated cache_name = NULL;
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Pick the source region set: combined (KMD), else RX-only,
	 * else TX-only (at least one must have objects configured).
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet region is used only for on-demand buffer attachment */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		/*
		 * User-visible pool: the user metadata (and buflet)
		 * regions must exactly mirror their kernel counterparts.
		 */
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;
	def_buf_size = def_buf_obj_size;
	large_buf_size = lbuf_srp->srp_c_obj_size;

#if (DEBUG || DEVELOPMENT)
	/* sanity-check metadata type/size invariants on debug kernels */
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* the pool holds a retained reference on ctx until destroyed */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate PPCREATEF_* creation flags into PPF_* pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}

	pp_retain(pp);

	/* metadata cache creation flags derive from the region's cflags */
	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* user-visible pools mirror kernel regions onto user regions */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 * With buffer-on-demand, metadata is constructed without buflets;
	 * otherwise each metadata carries the maximum number of buflets.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			/* Aggressive memory reclaim flag set to kbft_large for now */
			md_cflags |= SKMEM_CR_RECLAIM;
			cache_name = tsnprintf(cname, sizeof(cname),
			    "kbft_large.%s", name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
	buf_def_cflags = buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES ? SKMEM_CR_NOMAGAZINES : 0;
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
	    def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    buf_def_cflags)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* release the ctx reference first, then unwind via pp_close() */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1415 
/*
 * Tear down a packet buffer pool: purge the UPP hash tables, destroy the
 * metadata/buflet caches and release all regions.  Caller must hold the
 * pool lock.  NOTE: destruction order below is significant (see comment
 * before the buffer cache teardown).
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	/* metadata cache, then its backing user/kernel regions */
	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	/* buflet caches, then their backing user/kernel regions */
	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally drop the client context reference taken in pp_create() */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1491 
1492 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1493 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1494 {
1495 	int i, err = 0;
1496 
1497 	if (pp->pp_u_hash_table != NULL) {
1498 		goto done;
1499 	}
1500 
1501 	/* allocated-address hash table */
1502 	/*
1503 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1504 	 * if we see any performance hit, we can check if this caused it.
1505 	 */
1506 	if (can_block) {
1507 		pp->pp_u_hash_table = sk_alloc_type_array(
1508 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1509 			Z_WAITOK, skmem_tag_pbufpool_hash);
1510 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1511 	} else {
1512 		pp->pp_u_hash_table = sk_alloc_type_array(
1513 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1514 			Z_NOWAIT, skmem_tag_pbufpool_hash);
1515 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1516 	}
1517 	if (pp->pp_u_hash_table == NULL) {
1518 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1519 		err = ENOMEM;
1520 		goto done;
1521 	}
1522 
1523 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1524 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1525 	}
1526 done:
1527 	return err;
1528 }
1529 
/*
 * Free the pool's metadata UPP hash table, purging any entries that are
 * still present.  Caller must hold the pool lock.
 */
static void
pp_destroy_upp_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* every bucket must be empty after the purge */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
		    pp->pp_u_hash_table_size,
		    pp->pp_u_hash_table);
	}
	ASSERT(pp->pp_u_bufinuse == 0);
}
1550 
/*
 * Initialize the pool's userspace packet pool (UPP) state: both the
 * metadata and the buflet allocated-address hash tables.  Takes a
 * reference on the pool on success.  Returns 0 or an errno.
 */
int
pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
{
	int err = 0;

	PP_LOCK(pp);
	err = pp_init_upp_locked(pp, can_block);
	if (err) {
		SK_ERR("packet UPP init failed (%d)", err);
		goto done;
	}
	err = pp_init_upp_bft_locked(pp, can_block);
	if (err) {
		SK_ERR("buflet UPP init failed (%d)", err);
		/* undo the metadata hash table setup done above */
		pp_destroy_upp_locked(pp);
		goto done;
	}
	pp_retain_locked(pp);
done:
	PP_UNLOCK(pp);
	return err;
}
1573 
/*
 * Register an external buflet as held by user process `pid' by inserting
 * it into the pool's buflet UPP hash table (keyed by buf_bft_idx_reg).
 * Caller must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
    struct __kern_buflet *kbft, pid_t pid)
{
	struct kern_pbufpool_u_bft_bkt *bkt;
	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/* must not already be owned by any process */
	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
	kbe->kbe_buf_pid = pid;
	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
	pp->pp_u_bftinuse++;
}
1589 
1590 __attribute__((always_inline))
1591 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1592 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1593     struct __kern_buflet *kbft, pid_t pid)
1594 {
1595 	while (kbft != NULL) {
1596 		pp_insert_upp_bft_locked(pp, kbft, pid);
1597 		kbft = __unsafe_forge_single(struct __kern_buflet *,
1598 		    __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1599 	}
1600 }
1601 
/* Also inserts the attached chain of buflets */
void static inline
pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	struct kern_pbufpool_u_bkt *bkt;
	struct __kern_buflet *kbft;

	/* quantum must not already be owned by any process */
	ASSERT(kqum->qum_pid == (pid_t)-1);
	kqum->qum_pid = pid;

	/* insert the metadata into the UPP hash, keyed by metadata index */
	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
	pp->pp_u_bufinuse++;

	/* if external buflets are chained off the first buflet, track them too */
	kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
	if (kbft != NULL) {
		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
	}
}
1624 
/* As pp_insert_upp(), but with the pool lock already held by the caller. */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1631 
/*
 * Register a quantum (and any attached buflet chain) as held by user
 * process `pid'.  Acquires and releases the pool lock.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1639 
1640 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * __counted_by (num)array,uint32_t num)1641 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1642     uint64_t *__counted_by(num)array, uint32_t num)
1643 {
1644 	uint32_t i = 0;
1645 
1646 	ASSERT(array != NULL && num > 0);
1647 	PP_LOCK(pp);
1648 	while (i < num) {
1649 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1650 
1651 		ASSERT(kqum != NULL);
1652 		pp_insert_upp_common(pp, kqum, pid);
1653 		++i;
1654 	}
1655 	PP_UNLOCK(pp);
1656 }
1657 
/*
 * Find and unlink the buflet with index `bft_idx' from the pool's buflet
 * UPP hash table.  Returns the buflet, or NULL if not found.  Caller
 * must hold the pool lock.
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* mark as no longer owned by any process */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	/* kbft is NULL here if the loop exhausted the bucket without a match */
	return (kern_buflet_t)kbft;
}
1679 
1680 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1681 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1682 {
1683 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1684 
1685 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1686 	return kbft;
1687 }
1688 
/*
 * Walk and unregister the chain of user-attached buflets hanging off
 * kqum, validating the chain against the (untrusted) user-visible
 * packet metadata.  Returns 0 on success, ERANGE when the chain is
 * inconsistent (bad advertised count, or a next-index that was never
 * handed out).  Uses pp_remove_upp_bft_locked(); the pool lock is
 * presumably held by the caller -- confirm at call sites.
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first chained buflet index comes from the user metadata */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no chained buflets; nothing to unregister */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-supplied buflet count is untrusted; bound it first */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* count the embedded buflet when it actually holds a buffer */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		/* detach this index from the user buflet hash */
		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* chain points at a buflet we never handed out */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* relink the kernel-side chain to the verified buflet */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel chain at the last verified buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		/* chain is longer or shorter than the advertised count */
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1752 
/*
 * Look up the quantum registered under md_idx in the user-packet-pool
 * hash table, unlink it, and unregister its chained buflets.  Sets
 * *err to 0 on success, ERANGE when md_idx is not registered or its
 * buflet chain is inconsistent.  _locked variant: caller is expected
 * to hold the pool lock (see pp_remove_upp()).
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			kqum->qum_pid = (pid_t)-1;	/* no longer process-owned */
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL here iff the loop ran off the end of the bucket */
	if (__probable(kqum != NULL)) {
		/* also detach and unregister any chained buflets */
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1777 
1778 struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1779 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1780 {
1781 	struct __kern_quantum *kqum;
1782 
1783 	PP_LOCK(pp);
1784 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1785 	PP_UNLOCK(pp);
1786 	return kqum;
1787 }
1788 
1789 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1790 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1791 {
1792 	struct __kern_quantum *kqum, *tqum;
1793 	struct kern_pbufpool_u_bkt *bkt;
1794 
1795 	PP_LOCK(pp);
1796 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1797 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1798 		if (METADATA_IDX(kqum) == md_idx) {
1799 			break;
1800 		}
1801 	}
1802 	PP_UNLOCK(pp);
1803 
1804 	return kqum;
1805 }
1806 
/*
 * Free every user-pool packet owned by `pid' (pid == -1 matches all
 * owners), across all hash buckets.  Caller holds the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		/* safe iteration: entries are removed while walking */
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;	/* owned by another process */
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* unregister chained buflets before freeing */
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* disown and clear state left by the user side */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1840 
/*
 * Free every user-pool buflet owned by `pid' (pid == -1 matches all
 * owners), across all buflet hash buckets.  Caller holds the pool
 * lock.  Counterpart of pp_purge_upp_locked() for stand-alone buflets.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		/* safe iteration: entries are removed while walking */
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;	/* owned by another process */
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* disown, unlink, return to the buflet cache */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1869 
/*
 * Release all user-pool packets and buflets owned by `pid';
 * pid == -1 purges everything (see the _locked variants above).
 */
void
pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
{
	PP_LOCK(pp);
	pp_purge_upp_locked(pp, pid);
	pp_purge_upp_bft_locked(pp, pid);
	PP_UNLOCK(pp);
}
1878 
1879 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1880 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1881 {
1882 	int i, err = 0;
1883 
1884 	PP_LOCK_ASSERT_HELD(pp);
1885 	if (pp->pp_u_bft_hash_table != NULL) {
1886 		return 0;
1887 	}
1888 
1889 	/* allocated-address hash table */
1890 	/*
1891 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1892 	 * if we see any performance hit, we can check if this caused it.
1893 	 */
1894 	if (can_block) {
1895 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1896 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1897 			Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1898 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1899 	} else {
1900 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1901 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1902 			Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1903 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1904 	}
1905 	if (pp->pp_u_bft_hash_table == NULL) {
1906 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1907 		err = ENOMEM;
1908 		goto fail;
1909 	}
1910 
1911 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1912 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1913 	}
1914 
1915 fail:
1916 	return err;
1917 }
1918 
1919 static void
pp_destroy_upp_bft_locked(struct kern_pbufpool * pp)1920 pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
1921 {
1922 	PP_LOCK_ASSERT_HELD(pp);
1923 	if (pp->pp_u_bft_hash_table != NULL) {
1924 		/* purge anything that's left */
1925 		pp_purge_upp_bft_locked(pp, -1);
1926 
1927 #if (DEBUG || DEVELOPMENT)
1928 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1929 			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
1930 		}
1931 #endif /* DEBUG || DEVELOPMENT */
1932 
1933 		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
1934 		    pp->pp_u_bft_hash_table_size,
1935 		    pp->pp_u_bft_hash_table);
1936 	}
1937 	ASSERT(pp->pp_u_bftinuse == 0);
1938 }
1939 
/*
 * Locked wrapper: register buflet `kbft' in the user buflet hash
 * table as owned by `pid'.
 */
void
pp_insert_upp_bft(struct kern_pbufpool *pp,
    struct __kern_buflet *kbft, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_bft_locked(pp, kbft, pid);
	PP_UNLOCK(pp);
}
1948 
1949 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1950 pp_isempty_upp(struct kern_pbufpool *pp)
1951 {
1952 	boolean_t isempty;
1953 
1954 	PP_LOCK(pp);
1955 	isempty = (pp->pp_u_bufinuse == 0);
1956 	PP_UNLOCK(pp);
1957 
1958 	return isempty;
1959 }
1960 
/*
 * (Re)initialize a metadata object freshly taken from the metadata
 * cache: validates the kernel/user metadata pairing, optionally
 * constructs on-demand buflets consumed from *blist, then (re)builds
 * the kernel packet/quantum state and each attached kernel buflet.
 * Returns the kernel quantum, or NULL when on-demand buflet
 * construction fails (caller frees the remaining lists).
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the kernel quantum lives right after the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools carry no user-visible metadata */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach bufcnt buflets from *blist when buffers are on-demand */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/* embedded buflet is unused; start at the first chained one */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			/*
			 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
			 * which is unsafe, so we just forge it here.
			 */
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* the chain must end exactly after bufcnt buflets */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
2045 
2046 /*
2047  * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
2048  * packet descriptor cache with no buffer attached and a buflet cache with
2049  * cpu layer caching enabled. While operating in this mode, we can call
2050  * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
2051  * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
2052  * descriptor with no attached buffer from the metadata cache.
2053  * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
2054  * from their respective caches and constructs the packet on behalf of the
2055  * caller.
2056  */
/*
 * Batch-allocate up to `num' packets into `array' (see the block
 * comment above for the bufcnt semantics).  Returns the number of
 * packets actually produced; on metadata-init failure the remaining
 * raw objects are returned to their caches and the batch is cut short.
 * `tagged' selects SK_PTR_ENCODE'd pointers; `cb', when non-NULL, is
 * invoked once per produced packet with its index and `ctx'.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
    alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	array_cp = array;
	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head object from the raw list */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* init failed: return unused buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}


		if (tagged) {
			*array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array_cp = (uint64_t)kqum;
		}

		if (cb != NULL) {
			/* (num - need) is the index of this packet */
			(cb)(*array_cp, (num - need), ctx);
		}

		++array_cp;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all on-demand buflets must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	return num - need;
}
2127 
2128 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2129 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2130 {
2131 	uint64_t kpkt = 0;
2132 
2133 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2134 	    NULL, NULL, skmflag);
2135 
2136 	return kpkt;
2137 }
2138 
2139 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (* size)array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2140 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2141     uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2142     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2143 {
2144 	uint32_t i, n;
2145 	int err;
2146 
2147 	ASSERT(array != NULL && size > 0);
2148 
2149 	n = *size;
2150 	/*
2151 	 * -fbounds-safety: Originally there was this line here: *size = 0; but
2152 	 * we removed this because array is now __counted_by(*size), so *size =
2153 	 * 0 leads to brk 0x5519. Also, *size is set to i anyway.
2154 	 */
2155 
2156 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2157 	    cb, ctx, skmflag);
2158 	/*
2159 	 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2160 	 * extra careful when *size is updated, like below. Here, we know i will
2161 	 * be less than or equal to the original *size value, so updating *size
2162 	 * is okay.
2163 	 */
2164 	*size = i;
2165 
2166 	if (__probable(i == n)) {
2167 		err = 0;
2168 	} else if (i != 0) {
2169 		err = EAGAIN;
2170 	} else {
2171 		err = ENOMEM;
2172 	}
2173 
2174 	return err;
2175 }
2176 
/*
 * Batch-allocate `num' packets (bufcnt buflets each) and enqueue them
 * onto `pktq'.  Packet-type pools only.  Returns 0 when all packets
 * were allocated, EAGAIN on a partial allocation, ENOMEM when none
 * could be.  `cb', when non-NULL, is invoked per packet with its index
 * and `ctx'.  Mirrors pp_alloc_packet_common() with a pktq instead of
 * an array.
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head object from the raw list */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* init failed: return unused buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}


		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			/* (num - need) is the index of this packet */
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all on-demand buflets must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2249 
2250 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2251 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2252     uint32_t skmflag)
2253 {
2254 	uint32_t bufcnt = pp->pp_max_frags;
2255 	uint64_t kpkt = 0;
2256 
2257 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2258 		bufcnt =
2259 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2260 		ASSERT(bufcnt <= UINT16_MAX);
2261 	}
2262 
2263 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2264 	    NULL, NULL, skmflag);
2265 
2266 	return kpkt;
2267 }
2268 
2269 __attribute__((always_inline))
2270 static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum * kqum,struct kern_pbufpool * pp,struct mbuf ** mp,struct __kern_packet ** kpp,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocahce_large)2271 pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
2272     struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
2273     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
2274     struct skmem_obj **blist_nocahce_large)
2275 {
2276 	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);
2277 
2278 	ASSERT(SK_PTR_TAG(kqum) == 0);
2279 
2280 	switch (pp->pp_md_type) {
2281 	case NEXUS_META_TYPE_PACKET: {
2282 		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);
2283 
2284 		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
2285 			__packet_perform_tx_completion_callbacks(
2286 				SK_PKT2PH(kpkt), NULL);
2287 		}
2288 		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
2289 			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
2290 			ASSERT(kpkt->pkt_mbuf != NULL);
2291 			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
2292 			if (mp != NULL) {
2293 				ASSERT(*mp == NULL);
2294 				*mp = kpkt->pkt_mbuf;
2295 			} else {
2296 				m_freem(kpkt->pkt_mbuf);
2297 			}
2298 			KPKT_CLEAR_MBUF_DATA(kpkt);
2299 		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
2300 			ASSERT(kpkt->pkt_pkt != NULL);
2301 			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
2302 			if (kpp != NULL) {
2303 				ASSERT(*kpp == NULL);
2304 				*kpp = kpkt->pkt_pkt;
2305 			} else {
2306 				/* can only recurse once */
2307 				ASSERT((kpkt->pkt_pkt->pkt_pflags &
2308 				    PKT_F_PKT_DATA) == 0);
2309 				pp_free_packet_single(kpkt->pkt_pkt);
2310 			}
2311 			KPKT_CLEAR_PKT_DATA(kpkt);
2312 		}
2313 		kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
2314 		ASSERT(kpkt->pkt_nextpkt == NULL);
2315 		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
2316 		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
2317 		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
2318 		break;
2319 	}
2320 	default:
2321 		break;
2322 	}
2323 
2324 	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
2325 		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
2326 		    blist_large, blist_nocahce_large);
2327 	}
2328 	return mdp;
2329 }
2330 
/*
 * Free a chain of packets linked through pkt_nextpkt.  All packets
 * must belong to the same pool as the chain head.  Metadata preambles
 * are threaded into a single list for one batch free; mbufs and
 * packets that were attached to the freed packets are accumulated and
 * released afterwards (attached packets recurse at most one level).
 * If `npkt' is non-NULL it receives the number of packets freed.
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	/* the pool is derived from the head; all packets must match it */
	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		/* fini may deposit an attached mbuf into *mp / packet into *kpp */
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the preamble to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance the accumulator tails past any new attachment */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* free attached packets as their own chain (one level deep) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2394 
2395 void
pp_free_pktq(struct pktq * pktq)2396 pp_free_pktq(struct pktq *pktq)
2397 {
2398 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2399 		return;
2400 	}
2401 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2402 	pp_free_packet_chain(pkt, NULL);
2403 	KPKTQ_DISPOSE(pktq);
2404 }
2405 
2406 void
pp_drop_pktq(struct pktq * pktq,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2407 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2408     drop_reason_t reason, const char *funcname, uint16_t linenum)
2409 {
2410 	drop_func_t dropfunc;
2411 	struct __kern_packet *kpkt;
2412 
2413 	if (KPKTQ_EMPTY(pktq)) {
2414 		return;
2415 	}
2416 	if (__probable(droptap_total_tap_count == 0)) {
2417 		goto nodroptap;
2418 	}
2419 
2420 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2421 		dropfunc = droptap_output_packet;
2422 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2423 		dropfunc = droptap_input_packet;
2424 	} else {
2425 		goto nodroptap;
2426 	}
2427 
2428 	KPKTQ_FOREACH(kpkt, pktq) {
2429 		dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2430 		    kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2431 	}
2432 
2433 nodroptap:
2434 	pp_free_pktq(pktq);
2435 }
2436 
/*
 * Free `num' packets given as tagged/untagged addresses in `array';
 * all must belong to pool `pp'.  Array entries are zeroed as they are
 * consumed.  Same accumulation scheme as pp_free_packet_chain():
 * preambles batch-freed in one call, attached mbufs/packets collected
 * and released at the end.
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp = NULL;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		/* fini may deposit an attached mbuf into *mp / packet into *kpp */
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the preamble to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance the accumulator tails past any new attachment */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* free attached packets as their own chain (one level deep) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2494 
2495 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2496 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2497 {
2498 	pp_free_packet_array(pp, &kqum, 1);
2499 }
2500 
/*
 * Free `size' packets from `array'; entries are zeroed by the callee.
 */
void
pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
{
	pp_free_packet_array(pp, array, size);
}
2506 
/*
 * Free a single, unchained packet; the owning pool is derived from
 * the packet's own metadata.
 */
void
pp_free_packet_single(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_nextpkt == NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *,
	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
}
2514 
/*
 * Report a single dropped packet to droptap (when a tap is active and
 * a direction flag is set) and then free it.
 */
void
pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
    drop_reason_t reason, const char *funcname, uint16_t linenum)
{
	drop_func_t dropfunc;

	/*
	 * NOTE(review): a zero-length packet returns here without being
	 * freed -- confirm callers release such packets themselves.
	 */
	if (pkt->pkt_length == 0) {
		return;
	}
	/* report to droptap only when at least one tap is active */
	if (__probable(droptap_total_tap_count == 0)) {
		goto nodroptap;
	}

	/* pick the tap direction from the caller-provided flags */
	if (flags & DROPTAP_FLAG_DIR_OUT) {
		dropfunc = droptap_output_packet;
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		dropfunc = droptap_input_packet;
	} else {
		goto nodroptap;
	}

	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);

nodroptap:
	pp_free_packet_single(pkt);
}
2542 
/*
 * Allocate one raw buffer from the pool's default or large buffer
 * cache and return its address (0 on failure).  On success, *oi is
 * filled with the skmem object info for the buffer.  DEV/DEBUG builds
 * may inject failures for non-blocking requests per the configured
 * MTBF.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	/*
	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
	 * other (safe) pointer type, because IOSkywalkFamily depends on this
	 * being mach_vm_address_t
	 */
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		/* simulated failure: give the buffer back and report 0 */
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			skmem_cache_free(skm,
			    __unsafe_forge_single(struct skmem_obj *, baddr));
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* fill *oi and sanity-check it describes the allocated address */
	skmem_cache_get_obj_info(skm,
	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2588 
2589 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2590 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2591     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2592 {
2593 	struct skmem_obj_info oib;
2594 
2595 	VERIFY(pp != NULL && baddr != NULL);
2596 	VERIFY((seg != NULL) == (idx != NULL));
2597 
2598 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2599 		return ENOTSUP;
2600 	}
2601 
2602 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2603 	if (__improbable(*baddr == 0)) {
2604 		return ENOMEM;
2605 	}
2606 
2607 	if (seg != NULL) {
2608 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2609 		*seg = SKMEM_OBJ_SEG(&oib);
2610 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2611 	}
2612 	return 0;
2613 }
2614 
/*
 * Return a raw buffer to the pool's default buffer cache.
 */
void
pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
{
	ASSERT(pp != NULL && addr != 0);
	skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
		    struct skmem_obj *, addr));
}
2622 
/*
 * Batch-allocate constructed buflets into `array'; returns the number
 * actually produced (may be less than `num').  `large' selects the
 * large buflet cache and requires a configured large buffer size.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp,
    uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
    bool large)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *__single list;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);

	/* pull raw buflet objects from the selected cache */
	if (large) {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
	} else {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
	}

	array_cp = array;
	while (list != NULL) {
		struct skmem_obj *listn;

		/* detach the head object and initialize it as a buflet */
		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;


		KBUF_EXT_INIT(kbft, pp);
		*array_cp = (uint64_t)kbft;
		++array_cp;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2666 
2667 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,bool large)2668 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2669     bool large)
2670 {
2671 	uint64_t bft;
2672 
2673 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2674 		return ENOMEM;
2675 	}
2676 	*kbft = __unsafe_forge_single(kern_buflet_t, bft);
2677 	return 0;
2678 }
2679 
2680 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * __counted_by (* size)array,uint32_t * size,uint32_t skmflag,bool large)2681 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2682     uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2683     bool large)
2684 {
2685 	uint32_t i, n;
2686 	int err;
2687 
2688 	ASSERT(array != NULL && size > 0);
2689 
2690 	n = *size;
2691 	i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2692 	*size = i;
2693 
2694 	if (__probable(i == n)) {
2695 		err = 0;
2696 	} else if (i != 0) {
2697 		err = EAGAIN;
2698 	} else {
2699 		err = ENOMEM;
2700 	}
2701 
2702 	return err;
2703 }
2704 
/*
 * Release a buflet back to its pool.  Handles both externally
 * allocated buflets (BUFLET_FLAG_EXTERNAL) and pool-native buflets
 * that still hold a buffer; a shared buffer is only returned to the
 * buffer cache once its use count drops to zero.
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	/* caller must have already unlinked this buflet from any chain */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		/* an external buflet must still own a valid buffer */
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not be on the user-packet-pool hash list */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		if (kbft->buf_ctl->bc_usecnt > 1) {
			/*
			 * Buffer is still referenced by other buflets;
			 * presumably the nocache variant frees the buflet
			 * without recycling its attached buffer — confirm
			 * against skmem_cache_free_nocache().
			 */
			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		} else {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		}
	} else if (__probable(kbft->buf_addr != 0)) {
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* detach the buffer; usecnt receives the remaining refs */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		if (__probable(usecnt == 0)) {
			/* last reference dropped; return buffer to cache */
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2744 
2745 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2746 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2747 {
2748 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2749 	ASSERT(pp != NULL && kbft != NULL);
2750 	pp_free_buflet_common(pp, kbft);
2751 }
2752 
2753 void
pp_reap_caches(boolean_t purge)2754 pp_reap_caches(boolean_t purge)
2755 {
2756 	skmem_cache_reap_now(pp_opt_cache, purge);
2757 	skmem_cache_reap_now(pp_flow_cache, purge);
2758 	skmem_cache_reap_now(pp_compl_cache, purge);
2759 }
2760