xref: /xnu-8796.101.5/bsd/skywalk/packet/pbufpool.c (revision aca3beaa3dfbd42498b42c5e5ce20a938e6554e5)
1 /*
2  * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 
33 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
34 static void pp_free(struct kern_pbufpool *);
35 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
36     uint64_t *, uint32_t, boolean_t, alloc_cb_func_t, const void *, uint32_t);
37 static void pp_free_packet_array(struct kern_pbufpool *, uint64_t *, uint32_t);
38 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
39     struct skmem_obj_info *, void *, uint32_t);
40 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
41     struct skmem_obj_info *, void *, uint32_t);
42 static void pp_metadata_dtor(void *, void *);
43 static int pp_metadata_construct(struct __kern_quantum *,
44     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
45     uint16_t, bool, struct skmem_obj **);
46 static void pp_metadata_destruct(struct __kern_quantum *,
47     struct kern_pbufpool *, bool);
48 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
49     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
50 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
51     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
52     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
53 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
54 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
55 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
56 static void pp_destroy_upp_locked(struct kern_pbufpool *);
57 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
58 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
59 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
60 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
61     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
62 static inline uint32_t
63 pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
64     uint32_t num, uint32_t skmflag, uint32_t flags);
65 
66 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
67 #define KERN_BUF_CNT_MULTIPLIER          2
68 
69 /*
70  * Since the inputs are small (indices to the metadata region), we can use
71  * Knuth's multiplicative hash method which is fast and good enough.  Here
72  * we multiply the input by the golden ratio of 2^32.  See "The Art of
73  * Computer Programming", section 6.4.
74  */
75 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
76 	(((_i) * 2654435761U) & (_m))
77 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
78 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
79 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
80 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
81 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
82 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
83 
84 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
85 
86 struct kern_pbufpool_u_htbl {
87 	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
88 };
89 
90 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
91 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
92 
93 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
94 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
95 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
96 
97 static int __pp_inited = 0;
98 
99 int
pp_init(void)100 pp_init(void)
101 {
102 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
103 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
104 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
105 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
106 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
107 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
108 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
109 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
110 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
111 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
112 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
113 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
114 
115 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
116 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
117 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
118 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
119 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
120 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
121 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
122 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
123 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
124 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
125 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
126 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
127 
128 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
129 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
130 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
131 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
132 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
133 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
134 
135 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
136 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
137 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
138 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
139 
140 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
141 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
142 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
143 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
144 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
145 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
146 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
147 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
148 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
149 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
150 
151 	/*
152 	 * Assert that the value of common packet flags between mbuf and
153 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
154 	 */
155 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
156 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
157 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
158 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
159 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
160 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
161 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
162 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
163 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
164 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
165 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
166 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
167 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
168 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
169 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
170 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
171 	/*
172 	 * Assert packet flags shared with userland.
173 	 */
174 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
175 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
176 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
177 
178 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
179 	    offsetof(struct __kern_packet, pkt_length));
180 
181 	/*
182 	 * Due to the use of tagged pointer, we need the size of
183 	 * the metadata preamble structure to be multiples of 16.
184 	 * See SK_PTR_TAG() definition for details.
185 	 */
186 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
187 	    (sizeof(struct __metadata_preamble) % 16) == 0);
188 
189 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
190 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
191 
192 	/*
193 	 * Batch alloc/free requires linking the objects together;
194 	 * make sure that the fields are at the same offset since
195 	 * we cast the object to struct skmem_obj.
196 	 */
197 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
198 	    offsetof(struct skmem_obj, mo_next));
199 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
200 	    offsetof(struct skmem_obj, mo_next));
201 
202 	SK_LOCK_ASSERT_HELD();
203 	ASSERT(!__pp_inited);
204 
205 	pp_opt_cache = skmem_cache_create("pkt.opt",
206 	    sizeof(struct __packet_opt), sizeof(uint64_t),
207 	    NULL, NULL, NULL, NULL, NULL, 0);
208 	pp_flow_cache = skmem_cache_create("pkt.flow",
209 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
210 	    NULL, NULL, NULL, NULL, NULL, 0);
211 	pp_compl_cache = skmem_cache_create("pkt.compl",
212 	    sizeof(struct __packet_compl), sizeof(uint64_t),
213 	    NULL, NULL, NULL, NULL, NULL, 0);
214 
215 	return 0;
216 }
217 
218 void
pp_fini(void)219 pp_fini(void)
220 {
221 	SK_LOCK_ASSERT_HELD();
222 
223 	if (__pp_inited) {
224 		if (pp_compl_cache != NULL) {
225 			skmem_cache_destroy(pp_compl_cache);
226 			pp_compl_cache = NULL;
227 		}
228 		if (pp_flow_cache != NULL) {
229 			skmem_cache_destroy(pp_flow_cache);
230 			pp_flow_cache = NULL;
231 		}
232 		if (pp_opt_cache != NULL) {
233 			skmem_cache_destroy(pp_opt_cache);
234 			pp_opt_cache = NULL;
235 		}
236 
237 		__pp_inited = 0;
238 	}
239 }
240 
241 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)242 pp_alloc(zalloc_flags_t how)
243 {
244 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
245 
246 	if (pp) {
247 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
248 	}
249 	return pp;
250 }
251 
/*
 * Final teardown of a pool whose last reference has been dropped.
 * Entered with pp_lock held; pp_destroy() runs under the lock, which is
 * then released and destroyed before the object returns to its zone.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	/* no other reference exists at this point; safe to tear down the lock */
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
264 
/*
 * Take an additional reference on the pool; caller holds pp_lock.
 * The post-increment assertion catches 32-bit refcount wraparound
 * on DEVELOPMENT/DEBUG kernels.
 */
void
pp_retain_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp->pp_refcnt++;
	ASSERT(pp->pp_refcnt != 0);
}
273 
/* Take a reference on the pool, acquiring pp_lock around the update. */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
281 
282 boolean_t
pp_release_locked(struct kern_pbufpool * pp)283 pp_release_locked(struct kern_pbufpool *pp)
284 {
285 	uint32_t oldref = pp->pp_refcnt;
286 
287 	PP_LOCK_ASSERT_HELD(pp);
288 
289 	ASSERT(pp->pp_refcnt != 0);
290 	if (--pp->pp_refcnt == 0) {
291 		pp_free(pp);
292 	}
293 
294 	return oldref == 1;
295 }
296 
297 boolean_t
pp_release(struct kern_pbufpool * pp)298 pp_release(struct kern_pbufpool *pp)
299 {
300 	boolean_t lastref;
301 
302 	PP_LOCK(pp);
303 	if (!(lastref = pp_release_locked(pp))) {
304 		PP_UNLOCK(pp);
305 	}
306 
307 	return lastref;
308 }
309 
/*
 * Mark the pool closed and drop the caller's reference.  Closing twice
 * is a bug (asserted).  If this was the last reference the pool is
 * freed inside pp_release_locked() — which also unlocks — otherwise
 * unlock explicitly here.
 */
void
pp_close(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	ASSERT(pp->pp_refcnt > 0);
	ASSERT(!(pp->pp_flags & PPF_CLOSED));
	pp->pp_flags |= PPF_CLOSED;
	if (!pp_release_locked(pp)) {
		PP_UNLOCK(pp);
	}
}
321 
322 void
pp_regions_params_adjust(struct skmem_region_params * srp_array,nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)323 pp_regions_params_adjust(struct skmem_region_params *srp_array,
324     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
325     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
326     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
327 {
328 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
329 	    *lbuf_srp;
330 	uint32_t md_size = 0;
331 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
332 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
333 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
334 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
335 	bool md_magazine_enable = ((flags &
336 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
337 	bool config_raw_buflet = (flags & PP_REGION_CONFIG_RAW_BUFLET) != 0;
338 
339 	ASSERT(max_frags != 0);
340 
341 	switch (md_type) {
342 	case NEXUS_META_TYPE_QUANTUM:
343 		md_size = NX_METADATA_QUANTUM_SZ;
344 		break;
345 	case NEXUS_META_TYPE_PACKET:
346 		md_size = NX_METADATA_PACKET_SZ(max_frags);
347 		break;
348 	default:
349 		VERIFY(0);
350 		/* NOTREACHED */
351 		__builtin_unreachable();
352 	}
353 
354 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
355 	case PP_REGION_CONFIG_BUF_IODIR_IN:
356 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
357 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
358 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
359 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
360 		break;
361 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
362 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
363 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
364 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
365 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
366 		break;
367 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
368 	default:
369 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
370 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
371 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
372 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
373 		break;
374 	}
375 
376 	/* add preamble size to metadata obj size */
377 	md_size += METADATA_PREAMBLE_SZ;
378 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
379 
380 	/* configure kernel metadata region */
381 	kmd_srp->srp_md_type = md_type;
382 	kmd_srp->srp_md_subtype = md_subtype;
383 	kmd_srp->srp_r_obj_cnt = md_cnt;
384 	kmd_srp->srp_r_obj_size = md_size;
385 	kmd_srp->srp_max_frags = max_frags;
386 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
387 	if (md_persistent) {
388 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
389 	}
390 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
391 	if (md_magazine_enable) {
392 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
393 	}
394 	skmem_region_params_config(kmd_srp);
395 
396 	/* configure user metadata region */
397 	srp = &srp_array[SKMEM_REGION_UMD];
398 	if (!kernel_only) {
399 		srp->srp_md_type = kmd_srp->srp_md_type;
400 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
401 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
402 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
403 		srp->srp_max_frags = kmd_srp->srp_max_frags;
404 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
405 		if (md_persistent) {
406 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
407 		}
408 		/*
409 		 * UMD is a mirrored region and object allocation operations
410 		 * are performed on the KMD objects.
411 		 */
412 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
413 		skmem_region_params_config(srp);
414 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
415 	} else {
416 		ASSERT(srp->srp_r_obj_cnt == 0);
417 		ASSERT(srp->srp_r_obj_size == 0);
418 	}
419 
420 	/* configure buffer region */
421 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
422 	buf_srp->srp_r_obj_size = buf_size;
423 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
424 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
425 	if (buf_persistent) {
426 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
427 	}
428 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
429 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
430 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
431 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
432 	}
433 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
434 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
435 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
436 	}
437 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
438 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
439 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
440 	}
441 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
442 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
443 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
444 	}
445 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
446 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
447 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
448 	}
449 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
450 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
451 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
452 	}
453 	if (buf_seg_size != 0) {
454 		buf_srp->srp_r_seg_size = buf_seg_size;
455 	}
456 	skmem_region_params_config(buf_srp);
457 
458 	/* configure large buffer region */
459 	if (large_buf_size != 0) {
460 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
461 		lbuf_srp->srp_r_obj_size = large_buf_size;
462 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
463 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
464 		skmem_region_params_config(lbuf_srp);
465 	}
466 
467 	/* configure kernel buflet region */
468 	if (config_buflet) {
469 		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
470 		/*
471 		 * We want to have enough buflets when multi-buflet and
472 		 * shared buffer object is used.
473 		 */
474 		uint32_t r_obj_cnt_multiplier = config_raw_buflet ?
475 		    KERN_BUF_CNT_MULTIPLIER : 1;
476 		kbft_srp->srp_r_obj_cnt =
477 		    (buf_srp->srp_c_obj_cnt + lbuf_srp->srp_c_obj_cnt) *
478 		    r_obj_cnt_multiplier;
479 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
480 		    sizeof(struct __user_buflet));
481 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
482 		skmem_region_params_config(kbft_srp);
483 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
484 		    lbuf_srp->srp_c_obj_cnt);
485 	} else {
486 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
487 		ASSERT(kbft_srp->srp_r_obj_size == 0);
488 	}
489 
490 	/* configure user buflet region */
491 	srp = &srp_array[SKMEM_REGION_UBFT];
492 	if (config_buflet && !kernel_only) {
493 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
494 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
495 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
496 		skmem_region_params_config(srp);
497 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
498 	} else {
499 		ASSERT(srp->srp_r_obj_cnt == 0);
500 		ASSERT(srp->srp_r_obj_size == 0);
501 	}
502 
503 	/* make sure each metadata can be paired with a buffer */
504 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
505 }
506 
/*
 * Construct a kernel (and optionally mirrored user) metadata object:
 * initialize the quantum/packet header, then attach `bufcnt' buffers.
 * For buffer-on-demand pools the buffers arrive as pre-constructed
 * buflets consumed from *blist (ownership of consumed entries moves to
 * the packet; *blist is advanced); otherwise a single buffer is
 * allocated here and attached to the native buflet.
 *
 * Returns 0 on success, ENOMEM if a buffer/buflet could not be
 * obtained (partially-constructed state is destructed before return).
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *opt;
		struct __flow *flow;
		struct __packet_compl *compl;
		uint64_t pflags;

		if (raw) {
			/*
			 * Fresh slab object: allocate the auxiliary
			 * structures and record their ownership in pflags.
			 */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reconstructed object: reuse what it already owns */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}
			/* unlink the head of the caller's list and chain it */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* undo any buffers/buflets already attached above */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
628 
/*
 * Common skmem constructor callback for metadata objects.  Resolves the
 * kernel (and, unless kernel-only, mirrored user) quantum addresses from
 * the skmem object info, seeds the user-metadata redzone, optionally
 * batch-allocates pre-constructed buflets, and hands off to
 * pp_metadata_construct().  Returns 0 or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* the quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets pp_metadata_construct() didn't consume */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
712 
713 static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)714 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
715     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
716 {
717 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
718 }
719 
720 static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)721 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
722     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
723 {
724 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
725 }
726 
/*
 * Detach and reclaim every buffer/buflet attached to a metadata object.
 * The first (native) buflet's buffer is freed directly; chained buflets
 * are sorted onto the caller's three reclaim lists (default, large, and
 * raw buflet caches) for batch freeing by the caller.  When `raw' is
 * set, the object is headed back to the slab, so the packet's auxiliary
 * opt/flow/completion structures are released as well.
 *
 * NOTE(review): blist_raw is dereferenced when raw-cache buflets are
 * found but, unlike blist_def/blist_large, is not asserted non-NULL —
 * presumably all callers pass a valid pointer; confirm.
 */
__attribute__((always_inline))
static void
pp_metadata_destruct_common(struct __kern_quantum *kqum,
    struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
    struct skmem_obj **blist_large, struct skmem_obj **blist_raw)
{
	struct __kern_buflet *kbuf, *nbuf;
	/* local lists; spliced onto the caller's lists at the end */
	struct skmem_obj *p_blist_def = NULL, *p_blist_large = NULL, *p_blist_raw = NULL;
	struct skmem_obj **pp_blist_def = &p_blist_def;
	struct skmem_obj **pp_blist_large = &p_blist_large;
	struct skmem_obj **pp_blist_raw = &p_blist_raw;

	uint16_t bufcnt, i = 0;
	bool first_buflet_empty;

	ASSERT(blist_def != NULL);
	ASSERT(blist_large != NULL);

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kpkt->pkt_qum.qum_pp == pp);
		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		bufcnt = kpkt->pkt_bufs_cnt;
		kbuf = &kqum->qum_buf[0];
		/*
		 * special handling for empty first buflet.
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kqum->qum_pp == pp);
		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
		ASSERT(kqum->qum_ksd == NULL);
		kbuf = &kqum->qum_buf[0];
		/*
		 * XXX: Special handling for quantum as we don't currently
		 * define bufs_{cnt,max} there.  Given that we support at
		 * most only 1 buflet for now, check if buf_addr is non-NULL.
		 * See related code in pp_metadata_construct().
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		bufcnt = first_buflet_empty ? 0 : 1;
		break;
	}

	/* capture the chain head, then sever the native buflet's links */
	nbuf = __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr);
	BUF_NBFT_ADDR(kbuf, 0);
	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
	if (!first_buflet_empty) {
		pp_free_buflet_common(pp, kbuf);
		++i;
	}

	/* walk the chained buflets, routing each to the matching list */
	while (nbuf != NULL) {
		if (BUFLET_FROM_RAW_BFLT_CACHE(nbuf)) {
			/*
			 * Separate the raw buflet and its attached buffer to
			 * reduce usecnt.
			 */
			uint32_t usecnt = 0;
			void *objaddr = nbuf->buf_objaddr;
			KBUF_DTOR(nbuf, usecnt);
			SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
			    SK_KVA(pp), SK_KVA(objaddr), usecnt);
			if (__improbable(usecnt == 0)) {
				skmem_cache_free(BUFLET_HAS_LARGE_BUF(nbuf) ?
				    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
				    objaddr);
			}

			*pp_blist_raw = (struct skmem_obj *)(void *)nbuf;
			pp_blist_raw = &((struct skmem_obj *)(void *)nbuf)->mo_next;
		} else {
			if (BUFLET_HAS_LARGE_BUF(nbuf)) {
				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
				pp_blist_large =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			} else {
				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
				pp_blist_def =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			}
		}
		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
		nbuf = __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr);
		++i;
	}

	/* every recorded buflet must have been accounted for */
	ASSERT(i == bufcnt);

	/* prepend the local lists onto the caller's reclaim lists */
	if (p_blist_def != NULL) {
		*pp_blist_def = *blist_def;
		*blist_def = p_blist_def;
	}
	if (p_blist_large != NULL) {
		*pp_blist_large = *blist_large;
		*blist_large = p_blist_large;
	}
	if (p_blist_raw != NULL) {
		*pp_blist_raw = *blist_raw;
		*blist_raw = p_blist_raw;
	}

	/* if we're about to return this object to the slab, clean it up */
	if (raw) {
		switch (pp->pp_md_type) {
		case NEXUS_META_TYPE_PACKET: {
			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

			ASSERT(kpkt->pkt_com_opt != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
			if (kpkt->pkt_com_opt != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
				skmem_cache_free(pp_opt_cache,
				    kpkt->pkt_com_opt);
				kpkt->pkt_com_opt = NULL;
			}
			ASSERT(kpkt->pkt_flow != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
			if (kpkt->pkt_flow != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
				kpkt->pkt_flow = NULL;
			}
			ASSERT(kpkt->pkt_tx_compl != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
			if (kpkt->pkt_tx_compl != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
				skmem_cache_free(pp_compl_cache,
				    kpkt->pkt_tx_compl);
				kpkt->pkt_tx_compl = NULL;
			}
			kpkt->pkt_pflags = 0;
			break;
		}
		default:
			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
			/* nothing to do for quantum (yet) */
			break;
		}
	}
}
884 
885 __attribute__((always_inline))
886 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)887 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
888     bool raw)
889 {
890 	struct skmem_obj *blist_def = NULL, *blist_large = NULL, *blist_raw = NULL;
891 
892 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_large,
893 	    &blist_raw);
894 	if (blist_def != NULL) {
895 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
896 	}
897 	if (blist_large != NULL) {
898 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
899 	}
900 	if (blist_raw != NULL) {
901 		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
902 	}
903 }
904 
905 static void
pp_metadata_dtor(void * addr,void * arg)906 pp_metadata_dtor(void *addr, void *arg)
907 {
908 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
909 	    METADATA_PREAMBLE_SZ), arg, TRUE);
910 }
911 
912 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)913 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
914 {
915 	struct kern_pbufpool *pp = arg;
916 
917 	if (pp->pp_pbuf_seg_ctor != NULL) {
918 		pp->pp_pbuf_seg_ctor(pp, sg, md);
919 	}
920 }
921 
922 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)923 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
924 {
925 	struct kern_pbufpool *pp = arg;
926 
927 	if (pp->pp_pbuf_seg_dtor != NULL) {
928 		pp->pp_pbuf_seg_dtor(pp, sg, md);
929 	}
930 }
931 
932 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large,bool attach_buf)933 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
934     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large,
935     bool attach_buf)
936 {
937 #pragma unused (skmflag)
938 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
939 	struct __kern_buflet *kbft;
940 	struct __user_buflet *ubft;
941 	struct skmem_obj_info oib;
942 	mach_vm_address_t baddr = 0;
943 	obj_idx_t oi_idx_reg, oib_idx_reg = OBJ_IDX_NONE;
944 	struct skmem_bufctl* oib_bc = NULL;
945 
946 	if (attach_buf) {
947 		baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
948 		if (__improbable(baddr == 0)) {
949 			return ENOMEM;
950 		}
951 		oib_idx_reg = SKMEM_OBJ_IDX_REG(&oib);
952 		oib_bc = SKMEM_OBJ_BUFCTL(&oib);
953 	}
954 	/*
955 	 * Note that oi0 and oim0 may be stored inside the object itself;
956 	 * so copy what is required to local variables before constructing.
957 	 */
958 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
959 	kbft = SKMEM_OBJ_ADDR(oi0);
960 
961 	if (__probable(!PP_KERNEL_ONLY(pp))) {
962 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
963 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
964 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
965 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
966 		ubft = SKMEM_OBJ_ADDR(oim0);
967 	} else {
968 		ASSERT(oim0 == NULL);
969 		ubft = NULL;
970 	}
971 	KBUF_EXT_CTOR(kbft, ubft, baddr, oib_idx_reg, oib_bc,
972 	    oi_idx_reg, pp, large, attach_buf);
973 	return 0;
974 }
975 
976 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)977 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
978     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
979 {
980 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false, true);
981 }
982 
983 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)984 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
985     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
986 {
987 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true, true);
988 }
989 
990 static int
pp_buflet_no_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)991 pp_buflet_no_buffer_metadata_ctor(struct skmem_obj_info *oi0,
992     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
993 {
994 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false, false);
995 }
996 
/*
 * skmem cache destructor for external buflet metadata (`arg' is the
 * owning pool).  Detaches the buffer from the buflet via KBUF_DTOR()
 * and frees the buffer back to its cache once no other buflet holds a
 * reference to it.
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *kbft = addr;
	void *objaddr;
	struct kern_pbufpool *pp = arg;
	uint32_t usecnt = 0;
	/* latch before KBUF_DTOR() below tears down the buflet state */
	bool large = BUFLET_HAS_LARGE_BUF(kbft);

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);

	/*
	 * The raw buflet has never been attached with a buffer or already
	 * cleaned up.
	 */
	if ((kbft->buf_flag & BUFLET_FLAG_RAW) != 0 && kbft->buf_ctl == NULL) {
		return;
	}

	/* from here on the buflet must have a buffer attached */
	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	objaddr = kbft->buf_objaddr;
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	/* free the buffer only when the last reference has been dropped */
	if (__probable(usecnt == 0)) {
		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
		    PP_BUF_CACHE_DEF(pp), objaddr);
	}
}
1040 
/*
 * Create and initialize a packet buffer pool from the supplied region
 * parameter array.  Sets up the metadata (KMD, and UMD unless
 * kernel-only), buflet (KBFT/UBFT, if on-demand buffers are requested)
 * and buffer (default, plus large if populated) regions together with
 * their object caches, as dictated by the `ppcreatef' flags.
 *
 * Returns a retained pool on success; NULL on failure, with any
 * partially-created state released via pp_close().
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params *srp_array,
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint16_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	char cname[64];
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Select the region triplet: prefer the bidirectional KMD set,
	 * else RX-only, else TX-only (exactly one must be populated).
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet regions are only meaningful with on-demand buffers */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	/*
	 * For user-visible pools, the user metadata/buflet regions must
	 * mirror their kernel counterparts exactly (size, count, segment
	 * geometry, type and persistency).
	 */
	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;

	/* buffer sizes are stored as 16-bit; clamp oversize objects */
	if (def_buf_obj_size > UINT16_MAX) {
		def_buf_size = UINT16_MAX;
	} else {
		def_buf_size = (uint16_t)def_buf_obj_size;
	}

	if (lbuf_srp->srp_c_obj_size > UINT16_MAX) {
		large_buf_size = UINT16_MAX;
	} else {
		large_buf_size = (uint16_t)lbuf_srp->srp_c_obj_size;
	}

#if (DEBUG || DEVELOPMENT)
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate PPCREATEF_* request flags into PPF_* pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}
	/* raw buflets require on-demand buffer support */
	if (ppcreatef & PPCREATEF_RAW_BFLT) {
		ASSERT((ppcreatef & PPCREATEF_ONDEMAND_BUF) != 0);
		pp->pp_flags |= PPF_RAW_BUFLT;
	}

	pp_retain(pp);

	/* batching is always enabled for the metadata caches */
	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* mirror kernel regions into the user-mapped ones */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 */
	(void) snprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		(void) snprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cname,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			(void) snprintf(cname, sizeof(cname), "kbft_large.%s",
			    name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cname,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}

		if (PP_HAS_RAW_BFLT(pp)) {
			(void) snprintf(cname, sizeof(cname), "kbft_raw.%s", name);
			pp->pp_raw_kbft_cache = skmem_cache_create(cname,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_no_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
			    md_cflags);

			if (pp->pp_raw_kbft_cache == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
				    pp->pp_name, SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	(void) snprintf(cname, sizeof(cname), "buf_def.%s", name);
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cname, def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    SKMEM_CR_NOMAGAZINES)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		(void) snprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cname,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* pp_close() tears down whatever was created above */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1396 
/*
 * Tear down all caches and regions owned by the pool.  Called with the
 * pool lock held.  Destruction order matters: metadata caches are
 * destroyed before the buffer caches/regions they reference (see the
 * comment ahead of the buffer cache teardown below).
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	/* drop any user-process-mapped packet/buflet bookkeeping first */
	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_raw_kbft_cache != NULL) {
		skmem_cache_destroy(pp->pp_raw_kbft_cache);
		pp->pp_raw_kbft_cache = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally drop the client context, if any */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1477 
1478 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1479 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1480 {
1481 	int i, err = 0;
1482 
1483 	if (pp->pp_u_hash_table != NULL) {
1484 		goto done;
1485 	}
1486 
1487 	/* allocated-address hash table */
1488 	pp->pp_u_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
1489 	    zalloc_noblock(pp_u_htbl_zone);
1490 	if (pp->pp_u_hash_table == NULL) {
1491 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1492 		err = ENOMEM;
1493 		goto done;
1494 	}
1495 
1496 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1497 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1498 	}
1499 done:
1500 	return err;
1501 }
1502 
/*
 * Release the user packet pool hash table, purging any metadata still
 * tracked in it.  Called with the pool lock held.
 */
static void
pp_destroy_upp_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* the purge above must have emptied every bucket */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		zfree(pp_u_htbl_zone, pp->pp_u_hash_table);
		pp->pp_u_hash_table = NULL;
	}
	ASSERT(pp->pp_u_bufinuse == 0);
}
1522 
1523 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1524 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1525 {
1526 	int err = 0;
1527 
1528 	PP_LOCK(pp);
1529 	err = pp_init_upp_locked(pp, can_block);
1530 	if (err) {
1531 		SK_ERR("packet UPP init failed (%d)", err);
1532 		goto done;
1533 	}
1534 	err = pp_init_upp_bft_locked(pp, can_block);
1535 	if (err) {
1536 		SK_ERR("buflet UPP init failed (%d)", err);
1537 		pp_destroy_upp_locked(pp);
1538 		goto done;
1539 	}
1540 	pp_retain_locked(pp);
1541 done:
1542 	PP_UNLOCK(pp);
1543 	return err;
1544 }
1545 
1546 __attribute__((always_inline))
1547 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1548 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1549     struct __kern_buflet *kbft, pid_t pid)
1550 {
1551 	struct kern_pbufpool_u_bft_bkt *bkt;
1552 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1553 
1554 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1555 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1556 	kbe->kbe_buf_pid = pid;
1557 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1558 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1559 	pp->pp_u_bftinuse++;
1560 }
1561 
1562 __attribute__((always_inline))
1563 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1564 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1565     struct __kern_buflet *kbft, pid_t pid)
1566 {
1567 	while (kbft != NULL) {
1568 		pp_insert_upp_bft_locked(pp, kbft, pid);
1569 		kbft = __DECONST(kern_buflet_t, kbft->buf_nbft_addr);
1570 	}
1571 }
1572 
1573 /* Also inserts the attached chain of buflets */
1574 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1575 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1576     pid_t pid)
1577 {
1578 	struct kern_pbufpool_u_bkt *bkt;
1579 	struct __kern_buflet *kbft;
1580 
1581 	ASSERT(kqum->qum_pid == (pid_t)-1);
1582 	kqum->qum_pid = pid;
1583 
1584 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1585 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1586 	pp->pp_u_bufinuse++;
1587 
1588 	kbft = (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr;
1589 	if (kbft != NULL) {
1590 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1591 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1592 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1593 	}
1594 }
1595 
/*
 * Insert a quantum/packet (and its buflet chain) into the UPP hash
 * table.  Caller must already hold the pool lock.
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1602 
/*
 * Locking wrapper around pp_insert_upp_common(): insert a single
 * quantum/packet (and its buflet chain) into the UPP hash table.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1610 
1611 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * array,uint32_t num)1612 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid, uint64_t *array,
1613     uint32_t num)
1614 {
1615 	uint32_t i = 0;
1616 
1617 	ASSERT(array != NULL && num > 0);
1618 	PP_LOCK(pp);
1619 	while (num != 0) {
1620 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1621 
1622 		ASSERT(kqum != NULL);
1623 		pp_insert_upp_common(pp, kqum, pid);
1624 		--num;
1625 		++i;
1626 	}
1627 	PP_UNLOCK(pp);
1628 }
1629 
/*
 * Look up and unlink the buflet with region index `bft_idx' from the
 * buflet UPP hash table.  Returns the buflet, or NULL if no buflet
 * with that index is tracked (SLIST_FOREACH_SAFE leaves the iterator
 * NULL when the list is exhausted without a break).  Pool lock must
 * be held.
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* buflet is no longer owned by any process */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1651 
1652 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1653 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1654 {
1655 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1656 
1657 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1658 	return kbft;
1659 }
1660 
/*
 * Validate and reclaim the chain of buflets attached to a packet being
 * returned from user space.  The chain is walked via the user-visible
 * next-buflet indices, each buflet is removed from the buflet UPP hash
 * table, and the kernel-side next pointers/indices are re-linked from
 * the reclaimed buflets.  The packet's buflet count is rewritten to the
 * number actually found.  Returns 0 on success, ERANGE if the
 * user-supplied count or chain is inconsistent (untrusted input).
 * Pool lock must be held.
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first next-index comes from the (untrusted) user metadata */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no external buflet chain attached */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-claimed buflet count must not exceed the pool's max */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* the embedded first buflet counts only if it holds a buffer */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* chain refers to a buflet we never handed out */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* re-link the kernel-side chain from reclaimed buflets */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = (struct __kern_buflet_ext *)kbft;
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel-side chain at the last buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain length must match the user-claimed buflet count exactly */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1724 
/*
 * Look up and unlink the quantum/packet with metadata index `md_idx'
 * from the UPP hash table, then reclaim its attached buflet chain.
 * Returns the quantum (or NULL) and sets *err to 0 on success, or
 * ERANGE when the metadata is not tracked or its buflet chain is
 * inconsistent.  Pool lock must be held.
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* no longer owned by any process */
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL if the loop ran off the end without a match */
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1749 
struct __kern_quantum *
/*
 * Locking wrapper around pp_remove_upp_locked(); see that function for
 * the removal and error semantics.
 */
pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum;

	PP_LOCK(pp);
	kqum = pp_remove_upp_locked(pp, md_idx, err);
	PP_UNLOCK(pp);
	return kqum;
}
1760 
struct __kern_quantum *
/*
 * Look up (without removing) the user-space-mapped packet with metadata
 * index `md_idx' in the pool's user-packet hash table.  Returns the
 * kernel quantum if found, NULL otherwise.
 *
 * NOTE(review): the returned pointer is used after the pool lock is
 * dropped; presumably the caller guarantees the packet stays allocated —
 * verify against callers.
 */
pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	PP_LOCK(pp);
	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			break;
		}
	}
	PP_UNLOCK(pp);

	/* NULL when the list was exhausted without a match */
	return kqum;
}
1778 
__attribute__((always_inline))
static void
/*
 * Free every user-space-mapped packet belonging to process `pid', or all
 * packets when pid == -1.  Called with the pool lock held.
 *
 * Each matching packet is unlinked from its hash bucket, its buflet chain
 * is detached, its ownership/finalization state is reset, and it is
 * returned to the pool.
 */
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			/* every entry on the list must have an owner */
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* reset ownership and internalization state */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1812 
__attribute__((always_inline))
static void
/*
 * Free every user-space-mapped (external) buflet belonging to process
 * `pid', or all buflets when pid == -1.  Called with the pool lock held.
 * Companion to pp_purge_upp_locked(), operating on the buflet hash table.
 */
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			/* every entry on the list must have an owner */
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			kbft->kbe_buf_pid = (pid_t)-1;
			/* clear stale link before handing back to the cache */
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1841 
void
/*
 * Purge all user-space-mapped packets and buflets belonging to `pid'
 * (or all of them when pid == -1), taking the pool lock around both
 * purge passes.
 */
pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
{
	PP_LOCK(pp);
	pp_purge_upp_locked(pp, pid);
	pp_purge_upp_bft_locked(pp, pid);
	PP_UNLOCK(pp);
}
1850 
static int
/*
 * Lazily allocate and initialize the pool's user buflet hash table.
 * Called with the pool lock held.  Idempotent: returns 0 immediately if
 * the table already exists.  Returns ENOMEM if the zone allocation fails
 * (when !can_block, a non-blocking allocation is attempted).
 */
pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
{
	int i, err = 0;

	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		return 0;
	}

	/* allocated-address hash table */
	pp->pp_u_bft_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
	    zalloc_noblock(pp_u_htbl_zone);
	if (pp->pp_u_bft_hash_table == NULL) {
		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
		err = ENOMEM;
		goto fail;
	}

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
	}

	/* success path also falls through here with err == 0 */
fail:
	return err;
}
1877 
static void
/*
 * Tear down the pool's user buflet hash table: purge any buflets still
 * tracked (regardless of owning pid), then release the table back to its
 * zone.  Called with the pool lock held; safe to call when the table was
 * never allocated.
 */
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		zfree(pp_u_htbl_zone, pp->pp_u_bft_hash_table);
		pp->pp_u_bft_hash_table = NULL;
	}
	ASSERT(pp->pp_u_bftinuse == 0);
}
1897 
void
/*
 * Insert `kbft' into the pool's user buflet hash table on behalf of
 * process `pid', taking the pool lock around the locked variant.
 */
pp_insert_upp_bft(struct kern_pbufpool *pp,
    struct __kern_buflet *kbft, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_bft_locked(pp, kbft, pid);
	PP_UNLOCK(pp);
}
1906 
boolean_t
/*
 * Return TRUE if no user-space-mapped packets are currently outstanding
 * in the pool (pp_u_bufinuse == 0), sampled under the pool lock.
 */
pp_isempty_upp(struct kern_pbufpool *pp)
{
	boolean_t isempty;

	PP_LOCK(pp);
	isempty = (pp->pp_u_bufinuse == 0);
	PP_UNLOCK(pp);

	return isempty;
}
1918 
__attribute__((always_inline))
static inline struct __kern_quantum *
/*
 * (Re)initialize a freshly allocated metadata object for handout.
 *
 * `mdp' is the metadata preamble of an object obtained from the pool's
 * metadata cache; `bufcnt' is the number of buflets to attach (may be 0
 * when the pool supports buffer-on-demand); `*blist' supplies
 * pre-allocated constructed buflets and is advanced as they are consumed.
 *
 * Returns the initialized kernel quantum, or NULL if buflet construction
 * failed (in which case the caller frees the metadata and any remaining
 * buflets in *blist).
 */
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		/* user-visible pool: shadow user metadata must exist */
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach bufcnt buflets from *blist when buffers are on-demand */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/*
			 * The embedded buflet carries no buffer in this
			 * mode; start with the first attached buflet.
			 */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* chain must end exactly at bufcnt buflets */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
1999 
/*
 * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
 * packet descriptor cache with no buffer attached and a buflet cache with
 * cpu layer caching enabled. While operating in this mode, we can call
 * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
 * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
 * descriptor with no attached buffer from the metadata cache.
 * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
 * from their respective caches and constructs the packet on behalf of the
 * caller.
 *
 * Stores up to `num' packet handles into `array' (tagged pointers when
 * `tagged', raw otherwise), invoking `cb(handle, index, ctx)' per packet
 * when non-NULL.  Returns the number of packets actually produced; a
 * mid-loop construction failure frees the remaining metadata and buflets
 * and stops early.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *array, uint32_t num, boolean_t tagged, alloc_cb_func_t cb,
    const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach head from the batch list before initializing it */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* construction failed: release leftovers and stop */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		if (tagged) {
			*array = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array = (uint64_t)kqum;
		}

		if (cb != NULL) {
			/* (num - need) is the zero-based output index */
			(cb)(*array, (num - need), ctx);
		}

		++array;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all pre-allocated buflets must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	return num - need;
}
2078 
uint64_t
/*
 * Allocate a single untagged packet with `bufcnt' buflets.  Returns the
 * packet handle, or 0 on allocation failure.
 */
pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
{
	uint64_t kpkt = 0;

	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
	    NULL, NULL, skmflag);

	/* remains 0 if pp_alloc_packet_common produced nothing */
	return kpkt;
}
2089 
2090 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2091 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2092     uint64_t *array, uint32_t *size, boolean_t tagged, alloc_cb_func_t cb,
2093     const void *ctx, uint32_t skmflag)
2094 {
2095 	uint32_t i, n;
2096 	int err;
2097 
2098 	ASSERT(array != NULL && size > 0);
2099 
2100 	n = *size;
2101 	*size = 0;
2102 
2103 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2104 	    cb, ctx, skmflag);
2105 	*size = i;
2106 
2107 	if (__probable(i == n)) {
2108 		err = 0;
2109 	} else if (i != 0) {
2110 		err = EAGAIN;
2111 	} else {
2112 		err = ENOMEM;
2113 	}
2114 
2115 	return err;
2116 }
2117 
int
/*
 * Batch-allocate `num' packets with `bufcnt' buflets each and enqueue
 * them onto `pktq', invoking `cb' per packet when non-NULL.  Packet-type
 * pools only.  Returns 0 when all `num' were allocated, ENOMEM when none
 * were, EAGAIN on a partial allocation (the partial set remains enqueued).
 */
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach head from the batch list before initializing it */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* construction failed: release leftovers and stop */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			/* (num - need) is the zero-based output index */
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all pre-allocated buflets must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2189 
uint64_t
/*
 * Allocate a single tagged packet large enough to hold `size' bytes.
 * For buffer-on-demand pools the buflet count is derived from `size'
 * rounded up to whole default-size buffers; otherwise pp_max_frags
 * buflets are attached.  Returns the tagged handle, or 0 on failure.
 */
pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
    uint32_t skmflag)
{
	uint32_t bufcnt = pp->pp_max_frags;
	uint64_t kpkt = 0;

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		/* ceil(size / default buffer size) buflets */
		bufcnt =
		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
		ASSERT(bufcnt <= UINT16_MAX);
	}

	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
	    NULL, NULL, skmflag);

	return kpkt;
}
2208 
__attribute__((always_inline))
static inline struct __metadata_preamble *
/*
 * Finalize a quantum prior to returning it to the metadata cache.
 *
 * For packets: runs pending TX-completion callbacks, then disposes of any
 * attached mbuf (PKT_F_MBUF_DATA) or attached packet (PKT_F_PKT_DATA).
 * When `mp'/`kpp' are non-NULL the attachment is handed back through them
 * for the caller to batch-free; otherwise it is freed here.  For
 * buffer-on-demand pools, the buflets are also destructed onto the three
 * freelists.  Returns the metadata preamble for the caller to batch-free.
 */
pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
    struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
    struct skmem_obj **blist_large, struct skmem_obj **blist_raw)
{
	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);

	ASSERT(SK_PTR_TAG(kqum) == 0);

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);

		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
			__packet_perform_tx_completion_callbacks(
				SK_PKT2PH(kpkt), NULL);
		}
		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
			/* mbuf and packet attachments are mutually exclusive */
			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
			ASSERT(kpkt->pkt_mbuf != NULL);
			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
			if (mp != NULL) {
				/* hand the mbuf back to the caller */
				ASSERT(*mp == NULL);
				*mp = kpkt->pkt_mbuf;
			} else {
				m_freem(kpkt->pkt_mbuf);
			}
			KPKT_CLEAR_MBUF_DATA(kpkt);
		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
			ASSERT(kpkt->pkt_pkt != NULL);
			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
			if (kpp != NULL) {
				/* hand the attached packet to the caller */
				ASSERT(*kpp == NULL);
				*kpp = kpkt->pkt_pkt;
			} else {
				/* can only recurse once */
				ASSERT((kpkt->pkt_pkt->pkt_pflags &
				    PKT_F_PKT_DATA) == 0);
				pp_free_packet_single(kpkt->pkt_pkt);
			}
			KPKT_CLEAR_PKT_DATA(kpkt);
		}
		ASSERT(kpkt->pkt_nextpkt == NULL);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
		break;
	}
	default:
		break;
	}

	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
		/* detach buflets onto the per-cache freelists */
		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def,
		    blist_large, blist_raw);
	}
	return mdp;
}
2268 
void
/*
 * Free a chain of packets (linked via pkt_nextpkt) back to their pool in
 * a single batched pass.  All packets in the chain must belong to the
 * same pool as the head.  Attached mbufs and attached packets collected
 * during finalization are freed afterwards in their own batches (the
 * attached-packet pass recurses exactly once; see pp_metadata_fini()).
 * If `npkt' is non-NULL it receives the number of packets freed.
 */
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;
	struct skmem_obj *blist_def = NULL;
	struct skmem_obj *blist_large = NULL;
	struct skmem_obj *blist_raw = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *kptop = NULL;
	struct __kern_packet **kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_large, &blist_raw);

		/* append metadata to the batch-free list */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance tail pointers if an attachment was collected */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist_def != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
		blist_def = NULL;
	}
	if (blist_large != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
		blist_large = NULL;
	}
	if (blist_raw != NULL) {
		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
		blist_raw = NULL;
	}
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* single level of recursion; pp_metadata_fini asserts this */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2342 
void
/*
 * Free every packet queued on `pktq' via the batched chain free, then
 * reset the queue.  No-op on an empty queue.
 */
pp_free_pktq(struct pktq *pktq)
{
	if (__improbable(KPKTQ_EMPTY(pktq))) {
		return;
	}
	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
	pp_free_packet_chain(pkt, NULL);
	KPKTQ_DISPOSE(pktq);
}
2353 
__attribute__((always_inline))
static inline void
/*
 * Free `num' packet handles stored in `array' back to pool `pp' in one
 * batched pass, zeroing each array slot as it is consumed.  All handles
 * must belong to `pp'.  Mbuf and packet attachments collected during
 * finalization are freed afterwards in their own batches (the attached
 * packets go through pp_free_packet_chain(), which recurses once at most).
 */
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;
	struct skmem_obj *blist_def = NULL;
	struct skmem_obj *blist_large = NULL;
	struct skmem_obj *blist_raw = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_large, &blist_raw);

		/* append metadata to the batch-free list; clear the slot */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance tail pointers if an attachment was collected */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist_def != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
		blist_def = NULL;
	}
	if (blist_large != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
		blist_large = NULL;
	}
	if (blist_raw != NULL) {
		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
		blist_raw = NULL;
	}
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2421 
void
/* Free a single packet handle back to pool `pp'. */
pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
{
	pp_free_packet_array(pp, &kqum, 1);
}
2427 
void
/* Free `size' packet handles in `array' back to pool `pp' in one batch. */
pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *array, uint32_t size)
{
	pp_free_packet_array(pp, array, size);
}
2433 
void
/*
 * Free a single unchained packet back to its owning pool (derived from
 * the packet's own qum_pp).
 */
pp_free_packet_single(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_nextpkt == NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *,
	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
}
2441 
static mach_vm_address_t
/*
 * Allocate one buffer from the pool's default or large buffer cache and
 * fill `oi' with its object info.  Returns the buffer address, or 0 on
 * failure.  On DEVELOPMENT/DEBUG kernels, non-blocking allocations may be
 * artificially failed per the skmem MTBF (mean-time-between-failures)
 * fault-injection setting.
 */
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			/* simulate failure: give the buffer back */
			skmem_cache_free(skm, (void *)baddr);
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	skmem_cache_get_obj_info(skm, (void *)baddr, oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2480 
errno_t
/*
 * Allocate a default-size buffer from the pool, returning its address in
 * *baddr and, when `seg'/`idx' are both supplied, its segment handle and
 * index within that segment.  Only supported on buffer-on-demand pools.
 * Returns 0 on success, ENOTSUP, or ENOMEM.
 */
pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
    kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
{
	struct skmem_obj_info oib;

	VERIFY(pp != NULL && baddr != NULL);
	/* seg and idx must be provided together or not at all */
	VERIFY((seg != NULL) == (idx != NULL));

	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
		return ENOTSUP;
	}

	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
	if (__improbable(*baddr == 0)) {
		return ENOMEM;
	}

	if (seg != NULL) {
		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
		*seg = SKMEM_OBJ_SEG(&oib);
		*idx = SKMEM_OBJ_IDX_SEG(&oib);
	}
	return 0;
}
2506 
void
/* Return a default-size buffer at `addr' to the pool's buffer cache. */
pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
{
	ASSERT(pp != NULL && addr != 0);
	skmem_cache_free(PP_BUF_CACHE_DEF(pp), (void *)addr);
}
2513 
__attribute__((always_inline))
static inline uint32_t
/*
 * Batch-allocate up to `num' external buflets into `array'.
 *
 * flags: PP_ALLOC_BFT_ATTACH_BUFFER selects constructed buflets (buffer
 * already attached) from the default or (with PP_ALLOC_BFT_LARGE) large
 * buflet cache; otherwise raw buflets without a buffer come from the raw
 * cache.  Returns the number of buflets stored in `array'.
 */
pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
    uint32_t num, uint32_t skmflag, uint32_t flags)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *list;
	struct skmem_cache *skm = NULL;
	boolean_t attach_buffer = (flags & PP_ALLOC_BFT_ATTACH_BUFFER) != 0;
	boolean_t large = (flags & PP_ALLOC_BFT_LARGE) != 0;

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* large buflets require the pool to define a large buffer size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);
	/* raw allocation requires the raw buflet cache to exist */
	ASSERT(pp->pp_raw_kbft_cache != NULL || attach_buffer);

	if (!attach_buffer) {
		skm = pp->pp_raw_kbft_cache;
	} else {
		skm = large ? PP_KBFT_CACHE_LARGE(pp) :
		    PP_KBFT_CACHE_DEF(pp);
	}
	allocd = skmem_cache_batch_alloc(skm, &list, num, skmflag);

	while (list != NULL) {
		struct skmem_obj *listn;

		/* detach head from the batch list before initializing it */
		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;
		if (attach_buffer) {
			KBUF_EXT_INIT(kbft, pp);
		} else {
			RAW_KBUF_EXT_INIT(kbft);
		}
		*array = (uint64_t)kbft;
		++array;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2560 
errno_t
/*
 * Allocate a single buflet (see pp_alloc_buflet_common() for `flags')
 * and store it in *kbft.  Returns 0 on success, ENOMEM on failure.
 */
pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
    uint32_t flags)
{
	uint64_t bft;

	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, flags))) {
		return ENOMEM;
	}
	*kbft = (kern_buflet_t)bft;
	return 0;
}
2573 
2574 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * array,uint32_t * size,uint32_t skmflag,uint32_t flags)2575 pp_alloc_buflet_batch(struct kern_pbufpool *pp, uint64_t *array,
2576     uint32_t *size, uint32_t skmflag, uint32_t flags)
2577 {
2578 	uint32_t i, n;
2579 	int err;
2580 
2581 	ASSERT(array != NULL && size > 0);
2582 
2583 	n = *size;
2584 	*size = 0;
2585 
2586 	i = pp_alloc_buflet_common(pp, array, n, skmflag, flags);
2587 	*size = i;
2588 
2589 	if (__probable(i == n)) {
2590 		err = 0;
2591 	} else if (i != 0) {
2592 		err = EAGAIN;
2593 	} else {
2594 		err = ENOMEM;
2595 	}
2596 
2597 	return err;
2598 }
2599 
__attribute__((always_inline))
static void
/*
 * Return a buflet to its cache, dropping its buffer reference.
 *
 * External buflets: a raw-cache buflet had a buffer attached after
 * construction, so it is destructed here and the buffer freed when the
 * use count drops to zero; a non-raw external buflet keeps its buffer,
 * which was attached at construction, and only the buflet object is
 * returned to its (default or large) cache.  Embedded buflets with a
 * buffer attached drop the buffer reference and free it at zero use
 * count.  The buflet must already be unlinked (no next-buflet index or
 * address).
 */
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be linked into a user buflet hash bucket */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);

		/* raw buflet has a buffer attached after construction */
		if (BUFLET_FROM_RAW_BFLT_CACHE(kbft)) {
			uint32_t usecnt = 0;
			void *objaddr = kbft->buf_objaddr;
			KBUF_DTOR(kbft, usecnt);
			SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
			    SK_KVA(pp), SK_KVA(objaddr), usecnt);
			if (__improbable(usecnt == 0)) {
				/* last reference: free the shared buffer */
				skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
				    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
				    objaddr);
			}
		}

		/*
		 * non-raw external buflet has buffer attached at construction,
		 * so we don't free the buffer here.
		 */
		skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
		    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
		    (void *)kbft);
	} else if (__probable(kbft->buf_addr != 0)) {
		/* embedded buflet: drop the buffer reference only */
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		if (__probable(usecnt == 0)) {
			/* last reference: free the shared buffer */
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2652 
2653 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2654 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2655 {
2656 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2657 	ASSERT(pp != NULL && kbft != NULL);
2658 	pp_free_buflet_common(pp, kbft);
2659 }
2660 
void
/*
 * Reap the global packet-option, flow, and TX-completion caches; when
 * `purge' is TRUE the caches are purged rather than merely reaped.
 */
pp_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(pp_opt_cache, purge);
	skmem_cache_reap_now(pp_flow_cache, purge);
	skmem_cache_reap_now(pp_compl_cache, purge);
}
2668