xref: /xnu-8792.61.2/bsd/skywalk/packet/pbufpool.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 
/* pool lifecycle */
static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
static void pp_free(struct kern_pbufpool *);
/* packet alloc/free helpers */
static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
    uint64_t *, uint32_t, boolean_t, alloc_cb_func_t, const void *, uint32_t);
static void pp_free_packet_array(struct kern_pbufpool *, uint64_t *, uint32_t);
/* skmem cache constructor/destructor callbacks for metadata objects */
static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
static void pp_metadata_dtor(void *, void *);
static int pp_metadata_construct(struct __kern_quantum *,
    struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
    uint16_t, bool, struct skmem_obj **);
static void pp_metadata_destruct(struct __kern_quantum *,
    struct kern_pbufpool *, bool);
static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
    struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
    struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
    struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
/* user-space packet pool (upp) bookkeeping */
static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
static void pp_destroy_upp_locked(struct kern_pbufpool *);
static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
/* buflet/buffer alloc/free helpers */
static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
    struct skmem_obj_info *oi, uint32_t skmflag, bool large);
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
    uint32_t num, uint32_t skmflag, uint32_t flags);

#define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
#define KERN_BUF_CNT_MULTIPLIER          2

/*
 * Since the inputs are small (indices to the metadata region), we can use
 * Knuth's multiplicative hash method which is fast and good enough.  Here
 * we multiply the input by the golden ratio of 2^32.  See "The Art of
 * Computer Programming", section 6.4.
 */
#define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
	(((_i) * 2654435761U) & (_m))
#define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
#define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
	KERN_PBUFPOOL_U_HASH_SIZE - 1)])

/* zone backing struct kern_pbufpool allocations (zeroed on free) */
static ZONE_DEFINE(pp_zone, SKMEM_ZONE_PREFIX ".mem.pp",
    sizeof(struct kern_pbufpool), ZC_ZFREE_CLEARMEM);

#define PP_U_HTBL_SIZE  \
	(sizeof(struct kern_pbufpool_u_bkt) * KERN_PBUFPOOL_U_HASH_SIZE)
/* zone backing the per-pool user metadata hash tables */
static ZONE_DEFINE(pp_u_htbl_zone, SKMEM_ZONE_PREFIX ".mem.pp.htbl",
    PP_U_HTBL_SIZE, ZC_ZFREE_CLEARMEM);

static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */

/* set once pp_init() has created the caches above; cleared by pp_fini() */
static int __pp_inited = 0;
98 int
pp_init(void)99 pp_init(void)
100 {
101 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
102 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
103 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
104 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
105 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
106 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
107 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
108 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
109 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
110 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
111 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
112 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
113 
114 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
115 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
116 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
117 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
118 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
119 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
120 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
121 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
122 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
123 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
124 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
125 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
126 
127 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
128 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
129 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
130 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
131 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
132 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
133 
134 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
135 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
136 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
137 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
138 
139 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
140 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
141 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
142 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
143 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
144 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
145 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
146 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
147 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
148 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
149 
150 	/*
151 	 * Assert that the value of common packet flags between mbuf and
152 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
153 	 */
154 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
155 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
156 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
157 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
158 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
159 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
160 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
161 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
162 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
163 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
164 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
165 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
166 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
167 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
168 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
169 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
170 	/*
171 	 * Assert packet flags shared with userland.
172 	 */
173 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
174 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
175 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
176 
177 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
178 	    offsetof(struct __kern_packet, pkt_length));
179 
180 	/*
181 	 * Due to the use of tagged pointer, we need the size of
182 	 * the metadata preamble structure to be multiples of 16.
183 	 * See SK_PTR_TAG() definition for details.
184 	 */
185 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
186 	    (sizeof(struct __metadata_preamble) % 16) == 0);
187 
188 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
189 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
190 
191 	/*
192 	 * Batch alloc/free requires linking the objects together;
193 	 * make sure that the fields are at the same offset since
194 	 * we cast the object to struct skmem_obj.
195 	 */
196 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
197 	    offsetof(struct skmem_obj, mo_next));
198 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
199 	    offsetof(struct skmem_obj, mo_next));
200 
201 	SK_LOCK_ASSERT_HELD();
202 	ASSERT(!__pp_inited);
203 
204 	pp_opt_cache = skmem_cache_create("pkt.opt",
205 	    sizeof(struct __packet_opt), sizeof(uint64_t),
206 	    NULL, NULL, NULL, NULL, NULL, 0);
207 	pp_flow_cache = skmem_cache_create("pkt.flow",
208 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
209 	    NULL, NULL, NULL, NULL, NULL, 0);
210 	pp_compl_cache = skmem_cache_create("pkt.compl",
211 	    sizeof(struct __packet_compl), sizeof(uint64_t),
212 	    NULL, NULL, NULL, NULL, NULL, 0);
213 
214 	return 0;
215 }
216 
217 void
pp_fini(void)218 pp_fini(void)
219 {
220 	SK_LOCK_ASSERT_HELD();
221 
222 	if (__pp_inited) {
223 		if (pp_compl_cache != NULL) {
224 			skmem_cache_destroy(pp_compl_cache);
225 			pp_compl_cache = NULL;
226 		}
227 		if (pp_flow_cache != NULL) {
228 			skmem_cache_destroy(pp_flow_cache);
229 			pp_flow_cache = NULL;
230 		}
231 		if (pp_opt_cache != NULL) {
232 			skmem_cache_destroy(pp_opt_cache);
233 			pp_opt_cache = NULL;
234 		}
235 
236 		__pp_inited = 0;
237 	}
238 }
239 
240 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)241 pp_alloc(zalloc_flags_t how)
242 {
243 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
244 
245 	if (pp) {
246 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
247 	}
248 	return pp;
249 }
250 
/*
 * Final teardown of a pool once its last reference is dropped.
 * Caller must hold pp_lock; the lock is released (and then destroyed)
 * here, and the pool structure is returned to pp_zone.  The pool
 * pointer is invalid on return.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* dismantle pool contents while still holding the lock */
	pp_destroy(pp);
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	/* no other references exist; safe to destroy the mutex itself */
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
263 
264 void
pp_retain_locked(struct kern_pbufpool * pp)265 pp_retain_locked(struct kern_pbufpool *pp)
266 {
267 	PP_LOCK_ASSERT_HELD(pp);
268 
269 	pp->pp_refcnt++;
270 	ASSERT(pp->pp_refcnt != 0);
271 }
272 
/*
 * Take an additional reference on the pool; acquires and releases
 * pp_lock around pp_retain_locked().
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
280 
281 boolean_t
pp_release_locked(struct kern_pbufpool * pp)282 pp_release_locked(struct kern_pbufpool *pp)
283 {
284 	uint32_t oldref = pp->pp_refcnt;
285 
286 	PP_LOCK_ASSERT_HELD(pp);
287 
288 	ASSERT(pp->pp_refcnt != 0);
289 	if (--pp->pp_refcnt == 0) {
290 		pp_free(pp);
291 	}
292 
293 	return oldref == 1;
294 }
295 
296 boolean_t
pp_release(struct kern_pbufpool * pp)297 pp_release(struct kern_pbufpool *pp)
298 {
299 	boolean_t lastref;
300 
301 	PP_LOCK(pp);
302 	if (!(lastref = pp_release_locked(pp))) {
303 		PP_UNLOCK(pp);
304 	}
305 
306 	return lastref;
307 }
308 
309 void
pp_close(struct kern_pbufpool * pp)310 pp_close(struct kern_pbufpool *pp)
311 {
312 	PP_LOCK(pp);
313 	ASSERT(pp->pp_refcnt > 0);
314 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
315 	pp->pp_flags |= PPF_CLOSED;
316 	if (!pp_release_locked(pp)) {
317 		PP_UNLOCK(pp);
318 	}
319 }
320 
321 void
pp_regions_params_adjust(struct skmem_region_params * srp_array,nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)322 pp_regions_params_adjust(struct skmem_region_params *srp_array,
323     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
324     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
325     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
326 {
327 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
328 	    *lbuf_srp;
329 	uint32_t md_size = 0;
330 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
331 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
332 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
333 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
334 	bool md_magazine_enable = ((flags &
335 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
336 	bool config_raw_buflet = (flags & PP_REGION_CONFIG_RAW_BUFLET) != 0;
337 
338 	ASSERT(max_frags != 0);
339 
340 	switch (md_type) {
341 	case NEXUS_META_TYPE_QUANTUM:
342 		md_size = NX_METADATA_QUANTUM_SZ;
343 		break;
344 	case NEXUS_META_TYPE_PACKET:
345 		md_size = NX_METADATA_PACKET_SZ(max_frags);
346 		break;
347 	default:
348 		VERIFY(0);
349 		/* NOTREACHED */
350 		__builtin_unreachable();
351 	}
352 
353 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
354 	case PP_REGION_CONFIG_BUF_IODIR_IN:
355 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
356 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
357 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
358 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
359 		break;
360 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
361 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
362 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
363 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
364 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
365 		break;
366 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
367 	default:
368 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
369 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
370 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
371 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
372 		break;
373 	}
374 
375 	/* add preamble size to metadata obj size */
376 	md_size += METADATA_PREAMBLE_SZ;
377 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
378 
379 	/* configure kernel metadata region */
380 	kmd_srp->srp_md_type = md_type;
381 	kmd_srp->srp_md_subtype = md_subtype;
382 	kmd_srp->srp_r_obj_cnt = md_cnt;
383 	kmd_srp->srp_r_obj_size = md_size;
384 	kmd_srp->srp_max_frags = max_frags;
385 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
386 	if (md_persistent) {
387 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
388 	}
389 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
390 	if (md_magazine_enable) {
391 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
392 	}
393 	skmem_region_params_config(kmd_srp);
394 
395 	/* configure user metadata region */
396 	srp = &srp_array[SKMEM_REGION_UMD];
397 	if (!kernel_only) {
398 		srp->srp_md_type = kmd_srp->srp_md_type;
399 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
400 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
401 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
402 		srp->srp_max_frags = kmd_srp->srp_max_frags;
403 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
404 		if (md_persistent) {
405 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
406 		}
407 		/*
408 		 * UMD is a mirrored region and object allocation operations
409 		 * are performed on the KMD objects.
410 		 */
411 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
412 		skmem_region_params_config(srp);
413 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
414 	} else {
415 		ASSERT(srp->srp_r_obj_cnt == 0);
416 		ASSERT(srp->srp_r_obj_size == 0);
417 	}
418 
419 	/* configure buffer region */
420 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
421 	buf_srp->srp_r_obj_size = buf_size;
422 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
423 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
424 	if (buf_persistent) {
425 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
426 	}
427 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
428 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
429 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
430 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
431 	}
432 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
433 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
434 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
435 	}
436 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
437 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
438 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
439 	}
440 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
441 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
442 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
443 	}
444 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
445 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
446 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
447 	}
448 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
449 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
450 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
451 	}
452 	if (buf_seg_size != 0) {
453 		buf_srp->srp_r_seg_size = buf_seg_size;
454 	}
455 	skmem_region_params_config(buf_srp);
456 
457 	/* configure large buffer region */
458 	if (large_buf_size != 0) {
459 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
460 		lbuf_srp->srp_r_obj_size = large_buf_size;
461 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
462 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
463 		skmem_region_params_config(lbuf_srp);
464 	}
465 
466 	/* configure kernel buflet region */
467 	if (config_buflet) {
468 		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
469 		/*
470 		 * We want to have enough buflets when multi-buflet and
471 		 * shared buffer object is used.
472 		 */
473 		uint32_t r_obj_cnt_multiplier = config_raw_buflet ?
474 		    KERN_BUF_CNT_MULTIPLIER : 1;
475 		kbft_srp->srp_r_obj_cnt =
476 		    (buf_srp->srp_c_obj_cnt + lbuf_srp->srp_c_obj_cnt) *
477 		    r_obj_cnt_multiplier;
478 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
479 		    sizeof(struct __user_buflet));
480 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
481 		skmem_region_params_config(kbft_srp);
482 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
483 		    lbuf_srp->srp_c_obj_cnt);
484 	} else {
485 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
486 		ASSERT(kbft_srp->srp_r_obj_size == 0);
487 	}
488 
489 	/* configure user buflet region */
490 	srp = &srp_array[SKMEM_REGION_UBFT];
491 	if (config_buflet && !kernel_only) {
492 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
493 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
494 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
495 		skmem_region_params_config(srp);
496 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
497 	} else {
498 		ASSERT(srp->srp_r_obj_cnt == 0);
499 		ASSERT(srp->srp_r_obj_size == 0);
500 	}
501 
502 	/* make sure each metadata can be paired with a buffer */
503 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
504 }
505 
/*
 * Construct a kernel (and optionally mirrored user) metadata object at
 * slab-construction time: initialize the quantum/packet header and attach
 * bufcnt buffers.  For buffer-on-demand pools, pre-constructed buflets are
 * consumed from *blist (ownership of consumed nodes transfers here; the
 * caller frees whatever remains on *blist).  Returns 0 or ENOMEM; on
 * failure everything attached so far is torn down via
 * pp_metadata_destruct().
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets only make sense with buffer-on-demand pools */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *opt;
		struct __flow *flow;
		struct __packet_compl *compl;
		uint64_t pflags;

		if (raw) {
			/*
			 * Fresh object from the slab: allocate the auxiliary
			 * structures and record their ownership in pflags.
			 */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/*
			 * Reconstructing a previously-constructed object:
			 * the auxiliary structures must still be attached.
			 */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	/* attach bufcnt buffers, chaining buflets off the native one */
	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}
			/* detach the head of the caller's buflet list */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* undo partial construction (buffers attached in earlier passes) */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
627 
/*
 * Common slab constructor for metadata objects.  Resolves the kernel
 * (and mirrored user) quantum addresses from the skmem object info,
 * initializes the user metadata redzone, pre-allocates buflets for
 * buffer-on-demand pools, and delegates to pp_metadata_construct().
 * no_buflet selects a zero-buflet construction (bufcnt == 0).
 * Returns 0 or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		/* mirrored pool: a user counterpart object must exist */
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets pp_metadata_construct() did not consume */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
711 
/*
 * skmem cache constructor callback: construct metadata with no
 * buflets/buffers attached (bufcnt == 0).
 */
static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
}
718 
/*
 * skmem cache constructor callback: construct metadata with the pool's
 * maximum number of buflets (pp_max_frags) attached.
 */
static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
}
725 
/*
 * Deconstruct a metadata object: free/detach all of its buffers and
 * collect its chained buflets onto the caller-supplied lists
 * (blist_def/blist_large/blist_raw, prepending in chain order) so the
 * caller can batch-free them to the right caches.  When raw is true the
 * object is headed back to the slab, so the auxiliary structures
 * (opt/flow/compl) are released as well.
 */
__attribute__((always_inline))
static void
pp_metadata_destruct_common(struct __kern_quantum *kqum,
    struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
    struct skmem_obj **blist_large, struct skmem_obj **blist_raw)
{
	struct __kern_buflet *kbuf, *nbuf;
	/* local lists built first, then spliced onto the caller's lists */
	struct skmem_obj *p_blist_def = NULL, *p_blist_large = NULL, *p_blist_raw = NULL;
	struct skmem_obj **pp_blist_def = &p_blist_def;
	struct skmem_obj **pp_blist_large = &p_blist_large;
	struct skmem_obj **pp_blist_raw = &p_blist_raw;

	uint16_t bufcnt, i = 0;
	bool first_buflet_empty;

	ASSERT(blist_def != NULL);
	ASSERT(blist_large != NULL);

	/* validate the object and determine the buffer count */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kpkt->pkt_qum.qum_pp == pp);
		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		bufcnt = kpkt->pkt_bufs_cnt;
		kbuf = &kqum->qum_buf[0];
		/*
		 * special handling for empty first buflet.
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kqum->qum_pp == pp);
		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
		ASSERT(kqum->qum_ksd == NULL);
		kbuf = &kqum->qum_buf[0];
		/*
		 * XXX: Special handling for quantum as we don't currently
		 * define bufs_{cnt,max} there.  Given that we support at
		 * most only 1 buflet for now, check if buf_addr is non-NULL.
		 * See related code in pp_metadata_construct().
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		bufcnt = first_buflet_empty ? 0 : 1;
		break;
	}

	/* detach the chain from the native (first) buflet */
	nbuf = __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr);
	BUF_NBFT_ADDR(kbuf, 0);
	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
	if (!first_buflet_empty) {
		/* the native buflet's buffer is freed directly */
		pp_free_buflet_common(pp, kbuf);
		++i;
	}

	/* walk the remaining (chained) buflets, sorting them by cache */
	while (nbuf != NULL) {
		if (BUFLET_FROM_RAW_BFLT_CACHE(nbuf)) {
			/*
			 * Separate the raw buflet and its attached buffer to
			 * reduce usecnt.
			 */
			uint32_t usecnt = 0;
			void *objaddr = nbuf->buf_objaddr;
			KBUF_DTOR(nbuf, usecnt);
			SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
			    SK_KVA(pp), SK_KVA(objaddr), usecnt);
			if (__improbable(usecnt == 0)) {
				/* last user of a shared buffer: free it */
				skmem_cache_free(BUFLET_HAS_LARGE_BUF(nbuf) ?
				    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
				    objaddr);
			}

			*pp_blist_raw = (struct skmem_obj *)(void *)nbuf;
			pp_blist_raw = &((struct skmem_obj *)(void *)nbuf)->mo_next;
		} else {
			if (BUFLET_HAS_LARGE_BUF(nbuf)) {
				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
				pp_blist_large =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			} else {
				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
				pp_blist_def =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			}
		}
		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
		nbuf = __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr);
		++i;
	}

	/* every recorded buffer must have been accounted for */
	ASSERT(i == bufcnt);

	/* splice the local lists onto the front of the caller's lists */
	if (p_blist_def != NULL) {
		*pp_blist_def = *blist_def;
		*blist_def = p_blist_def;
	}
	if (p_blist_large != NULL) {
		*pp_blist_large = *blist_large;
		*blist_large = p_blist_large;
	}
	if (p_blist_raw != NULL) {
		*pp_blist_raw = *blist_raw;
		*blist_raw = p_blist_raw;
	}

	/* if we're about to return this object to the slab, clean it up */
	if (raw) {
		switch (pp->pp_md_type) {
		case NEXUS_META_TYPE_PACKET: {
			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

			ASSERT(kpkt->pkt_com_opt != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
			if (kpkt->pkt_com_opt != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
				skmem_cache_free(pp_opt_cache,
				    kpkt->pkt_com_opt);
				kpkt->pkt_com_opt = NULL;
			}
			ASSERT(kpkt->pkt_flow != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
			if (kpkt->pkt_flow != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
				kpkt->pkt_flow = NULL;
			}
			ASSERT(kpkt->pkt_tx_compl != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
			if (kpkt->pkt_tx_compl != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
				skmem_cache_free(pp_compl_cache,
				    kpkt->pkt_tx_compl);
				kpkt->pkt_tx_compl = NULL;
			}
			kpkt->pkt_pflags = 0;
			break;
		}
		default:
			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
			/* nothing to do for quantum (yet) */
			break;
		}
	}
}
883 
884 __attribute__((always_inline))
885 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)886 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
887     bool raw)
888 {
889 	struct skmem_obj *blist_def = NULL, *blist_large = NULL, *blist_raw = NULL;
890 
891 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_large,
892 	    &blist_raw);
893 	if (blist_def != NULL) {
894 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
895 	}
896 	if (blist_large != NULL) {
897 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
898 	}
899 	if (blist_raw != NULL) {
900 		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
901 	}
902 }
903 
904 static void
pp_metadata_dtor(void * addr,void * arg)905 pp_metadata_dtor(void *addr, void *arg)
906 {
907 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
908 	    METADATA_PREAMBLE_SZ), arg, TRUE);
909 }
910 
911 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)912 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
913 {
914 	struct kern_pbufpool *pp = arg;
915 
916 	if (pp->pp_pbuf_seg_ctor != NULL) {
917 		pp->pp_pbuf_seg_ctor(pp, sg, md);
918 	}
919 }
920 
921 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)922 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
923 {
924 	struct kern_pbufpool *pp = arg;
925 
926 	if (pp->pp_pbuf_seg_dtor != NULL) {
927 		pp->pp_pbuf_seg_dtor(pp, sg, md);
928 	}
929 }
930 
931 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large,bool attach_buf)932 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
933     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large,
934     bool attach_buf)
935 {
936 #pragma unused (skmflag)
937 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
938 	struct __kern_buflet *kbft;
939 	struct __user_buflet *ubft;
940 	struct skmem_obj_info oib;
941 	mach_vm_address_t baddr = 0;
942 	obj_idx_t oi_idx_reg, oib_idx_reg = OBJ_IDX_NONE;
943 	struct skmem_bufctl* oib_bc = NULL;
944 
945 	if (attach_buf) {
946 		baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
947 		if (__improbable(baddr == 0)) {
948 			return ENOMEM;
949 		}
950 		oib_idx_reg = SKMEM_OBJ_IDX_REG(&oib);
951 		oib_bc = SKMEM_OBJ_BUFCTL(&oib);
952 	}
953 	/*
954 	 * Note that oi0 and oim0 may be stored inside the object itself;
955 	 * so copy what is required to local variables before constructing.
956 	 */
957 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
958 	kbft = SKMEM_OBJ_ADDR(oi0);
959 
960 	if (__probable(!PP_KERNEL_ONLY(pp))) {
961 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
962 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
963 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
964 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
965 		ubft = SKMEM_OBJ_ADDR(oim0);
966 	} else {
967 		ASSERT(oim0 == NULL);
968 		ubft = NULL;
969 	}
970 	KBUF_EXT_CTOR(kbft, ubft, baddr, oib_idx_reg, oib_bc,
971 	    oi_idx_reg, pp, large, attach_buf);
972 	return 0;
973 }
974 
975 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)976 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
977     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
978 {
979 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false, true);
980 }
981 
982 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)983 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
984     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
985 {
986 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true, true);
987 }
988 
989 static int
pp_buflet_no_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)990 pp_buflet_no_buffer_metadata_ctor(struct skmem_obj_info *oi0,
991     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
992 {
993 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false, false);
994 }
995 
/*
 * Destructor for buflet metadata objects.  Detaches the backing buffer
 * (if any) from the buflet and frees the buffer back to its cache once
 * its use count drops to zero.
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *kbft = addr;
	void *objaddr;
	struct kern_pbufpool *pp = arg;
	uint32_t usecnt = 0;
	/* capture now: KBUF_DTOR() below clears the buflet's fields */
	bool large = BUFLET_HAS_LARGE_BUF(kbft);

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);

	/*
	 * The raw buflet has never been attached with a buffer or already
	 * cleaned up.
	 */
	if ((kbft->buf_flag & BUFLET_FLAG_RAW) != 0 && kbft->buf_ctl == NULL) {
		return;
	}

	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	/* objaddr must also be captured before KBUF_DTOR() wipes it */
	objaddr = kbft->buf_objaddr;
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	/* last reference dropped: return the buffer to its cache */
	if (__probable(usecnt == 0)) {
		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
		    PP_BUF_CACHE_DEF(pp), objaddr);
	}
}
1039 
/*
 * Create and initialize a packet buffer pool.
 *
 * srp_array supplies the region parameters; the pool is built from the
 * generic {KMD,BUF_DEF,BUF_LARGE,KBFT} regions when present, otherwise
 * from the RX- or TX-specific variants.  Depending on ppcreatef, the
 * pool may be kernel-only (no user mirror regions), may allocate
 * buffers on demand (separate buflet caches), and may support raw
 * (buffer-less) buflets and large buffers.
 *
 * buf_seg_ctor/buf_seg_dtor are optional per-segment callbacks (both
 * NULL or both non-NULL); ctx/ctx_retain/ctx_release form an optional
 * refcounted client context (all NULL or all non-NULL).
 *
 * Returns the new pool with a reference held, or NULL on failure (any
 * partially-created state is torn down via pp_close()).
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params *srp_array,
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint16_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	char cname[64];
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Select the source regions: generic first, then RX-specific,
	 * then TX-specific (exactly one set must be populated).
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet regions are only needed for on-demand buffer pools */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	/*
	 * Not kernel-only: the user metadata (and buflet) regions must
	 * mirror the kernel ones exactly in geometry and attributes.
	 */
	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;

	/* buffer sizes are stored in 16-bit fields; clamp if needed */
	if (def_buf_obj_size > UINT16_MAX) {
		def_buf_size = UINT16_MAX;
	} else {
		def_buf_size = (uint16_t)def_buf_obj_size;
	}

	if (lbuf_srp->srp_c_obj_size > UINT16_MAX) {
		large_buf_size = UINT16_MAX;
	} else {
		large_buf_size = (uint16_t)lbuf_srp->srp_c_obj_size;
	}

#if (DEBUG || DEVELOPMENT)
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* take a reference on the client context, if provided */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate creation flags into pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}
	/* raw buflets require on-demand buffer support */
	if (ppcreatef & PPCREATEF_RAW_BFLT) {
		ASSERT((ppcreatef & PPCREATEF_ONDEMAND_BUF) != 0);
		pp->pp_flags |= PPF_RAW_BUFLT;
	}

	pp_retain(pp);

	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	/* create the user metadata region (user-visible pools only) */
	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* pair each kernel region with its user-space mirror */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 */
	(void) snprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		(void) snprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cname,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			(void) snprintf(cname, sizeof(cname), "kbft_large.%s",
			    name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cname,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}

		/* buffer-less buflets share the kernel buflet region */
		if (PP_HAS_RAW_BFLT(pp)) {
			(void) snprintf(cname, sizeof(cname), "kbft_raw.%s", name);
			pp->pp_raw_kbft_cache = skmem_cache_create(cname,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_no_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
			    md_cflags);

			if (pp->pp_raw_kbft_cache == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
				    pp->pp_name, SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	(void) snprintf(cname, sizeof(cname), "buf_def.%s", name);
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cname, def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    SKMEM_CR_NOMAGAZINES)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		(void) snprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cname,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* release the client context first, then unwind via pp_close() */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1395 
/*
 * Tear down a packet buffer pool: purge the user-packet and user-buflet
 * hash tables, then destroy the caches and release the regions.  The
 * metadata caches must go before the buffer caches (see comment below).
 * Caller must hold the pool lock.
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_raw_kbft_cache != NULL) {
		skmem_cache_destroy(pp->pp_raw_kbft_cache);
		pp->pp_raw_kbft_cache = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally drop the client context reference, if any */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1476 
1477 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1478 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1479 {
1480 	int i, err = 0;
1481 
1482 	if (pp->pp_u_hash_table != NULL) {
1483 		goto done;
1484 	}
1485 
1486 	/* allocated-address hash table */
1487 	pp->pp_u_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
1488 	    zalloc_noblock(pp_u_htbl_zone);
1489 	if (pp->pp_u_hash_table == NULL) {
1490 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1491 		err = ENOMEM;
1492 		goto done;
1493 	}
1494 
1495 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1496 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1497 	}
1498 done:
1499 	return err;
1500 }
1501 
1502 static void
pp_destroy_upp_locked(struct kern_pbufpool * pp)1503 pp_destroy_upp_locked(struct kern_pbufpool *pp)
1504 {
1505 	PP_LOCK_ASSERT_HELD(pp);
1506 	if (pp->pp_u_hash_table != NULL) {
1507 		/* purge anything that's left */
1508 		pp_purge_upp_locked(pp, -1);
1509 
1510 #if (DEBUG || DEVELOPMENT)
1511 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1512 			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
1513 		}
1514 #endif /* DEBUG || DEVELOPMENT */
1515 
1516 		zfree(pp_u_htbl_zone, pp->pp_u_hash_table);
1517 		pp->pp_u_hash_table = NULL;
1518 	}
1519 	ASSERT(pp->pp_u_bufinuse == 0);
1520 }
1521 
1522 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1523 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1524 {
1525 	int err = 0;
1526 
1527 	PP_LOCK(pp);
1528 	err = pp_init_upp_locked(pp, can_block);
1529 	if (err) {
1530 		SK_ERR("packet UPP init failed (%d)", err);
1531 		goto done;
1532 	}
1533 	err = pp_init_upp_bft_locked(pp, can_block);
1534 	if (err) {
1535 		SK_ERR("buflet UPP init failed (%d)", err);
1536 		pp_destroy_upp_locked(pp);
1537 		goto done;
1538 	}
1539 	pp_retain_locked(pp);
1540 done:
1541 	PP_UNLOCK(pp);
1542 	return err;
1543 }
1544 
1545 __attribute__((always_inline))
1546 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1547 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1548     struct __kern_buflet *kbft, pid_t pid)
1549 {
1550 	struct kern_pbufpool_u_bft_bkt *bkt;
1551 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1552 
1553 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1554 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1555 	kbe->kbe_buf_pid = pid;
1556 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1557 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1558 	pp->pp_u_bftinuse++;
1559 }
1560 
1561 __attribute__((always_inline))
1562 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1563 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1564     struct __kern_buflet *kbft, pid_t pid)
1565 {
1566 	while (kbft != NULL) {
1567 		pp_insert_upp_bft_locked(pp, kbft, pid);
1568 		kbft = __DECONST(kern_buflet_t, kbft->buf_nbft_addr);
1569 	}
1570 }
1571 
1572 /* Also inserts the attached chain of buflets */
1573 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1574 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1575     pid_t pid)
1576 {
1577 	struct kern_pbufpool_u_bkt *bkt;
1578 	struct __kern_buflet *kbft;
1579 
1580 	ASSERT(kqum->qum_pid == (pid_t)-1);
1581 	kqum->qum_pid = pid;
1582 
1583 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1584 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1585 	pp->pp_u_bufinuse++;
1586 
1587 	kbft = (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr;
1588 	if (kbft != NULL) {
1589 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1590 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1591 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1592 	}
1593 }
1594 
/*
 * Insert a quantum (plus its attached buflet chain) into the pool's
 * allocated-address hash table.  Caller must hold the pool lock.
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1601 
/*
 * Locked wrapper: insert a quantum (plus its attached buflet chain)
 * into the pool's allocated-address hash table.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1609 
1610 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * array,uint32_t num)1611 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid, uint64_t *array,
1612     uint32_t num)
1613 {
1614 	uint32_t i = 0;
1615 
1616 	ASSERT(array != NULL && num > 0);
1617 	PP_LOCK(pp);
1618 	while (num != 0) {
1619 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1620 
1621 		ASSERT(kqum != NULL);
1622 		pp_insert_upp_common(pp, kqum, pid);
1623 		--num;
1624 		++i;
1625 	}
1626 	PP_UNLOCK(pp);
1627 }
1628 
1629 __attribute__((always_inline))
1630 static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool * pp,obj_idx_t bft_idx)1631 pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
1632 {
1633 	struct __kern_buflet_ext *kbft, *tbft;
1634 	struct kern_pbufpool_u_bft_bkt *bkt;
1635 
1636 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
1637 	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
1638 		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
1639 			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
1640 			    kbe_buf_upp_link);
1641 			kbft->kbe_buf_pid = (pid_t)-1;
1642 			kbft->kbe_buf_upp_link.sle_next = NULL;
1643 			ASSERT(pp->pp_u_bftinuse != 0);
1644 			pp->pp_u_bftinuse--;
1645 			break;
1646 		}
1647 	}
1648 	return (kern_buflet_t)kbft;
1649 }
1650 
1651 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1652 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1653 {
1654 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1655 
1656 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1657 	return kbft;
1658 }
1659 
/*
 * Walk and validate the user-visible buflet chain of a quantum being
 * reclaimed from user space, removing each buflet from the user buflet
 * hash table and re-linking the kernel-side chain.  The user-supplied
 * buflet count and next-indices are untrusted and are bounded by
 * pp_max_frags.  Returns 0 on success or ERANGE if the chain is
 * malformed (bad count, unallocated buflet, or premature/overlong
 * termination).  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first next-index comes from the (untrusted) user metadata */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no external buflet chain attached */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-claimed buflet count must not exceed the pool limit */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* the embedded buflet counts iff it has a buffer attached */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* chain references a buflet we never handed out */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* re-link the kernel-side chain from validated buflets */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = (struct __kern_buflet_ext *)kbft;
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel chain at the last validated buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the claimed count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1723 
/*
 * Remove the quantum with the given metadata index from the pool's
 * allocated-address hash table, then validate/remove its buflet chain.
 * Returns the quantum with *err set (0 on success, ERANGE if the index
 * is not allocated or the buflet chain is malformed), or NULL if not
 * found.  Caller must hold the pool lock.
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* no longer owned by any process */
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL here if the loop exhausted the bucket */
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1748 
1749 struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1750 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1751 {
1752 	struct __kern_quantum *kqum;
1753 
1754 	PP_LOCK(pp);
1755 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1756 	PP_UNLOCK(pp);
1757 	return kqum;
1758 }
1759 
1760 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1761 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1762 {
1763 	struct __kern_quantum *kqum, *tqum;
1764 	struct kern_pbufpool_u_bkt *bkt;
1765 
1766 	PP_LOCK(pp);
1767 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1768 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1769 		if (METADATA_IDX(kqum) == md_idx) {
1770 			break;
1771 		}
1772 	}
1773 	PP_UNLOCK(pp);
1774 
1775 	return kqum;
1776 }
1777 
/*
 * Free every user-mapped packet charged to `pid', or to all processes
 * when pid is -1.  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			/* pid == -1 matches every owner */
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* detach buflet chain; return value ignored on purge */
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* scrub ownership/finalization state before freeing */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1811 
/*
 * Free every user-mapped buflet charged to `pid', or to all processes
 * when pid is -1.  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			/* pid == -1 matches every owner */
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			kbft->kbe_buf_pid = (pid_t)-1;
			/* clear stale link before returning to the cache */
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1840 
1841 void
pp_purge_upp(struct kern_pbufpool * pp,pid_t pid)1842 pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
1843 {
1844 	PP_LOCK(pp);
1845 	pp_purge_upp_locked(pp, pid);
1846 	pp_purge_upp_bft_locked(pp, pid);
1847 	PP_UNLOCK(pp);
1848 }
1849 
1850 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1851 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1852 {
1853 	int i, err = 0;
1854 
1855 	PP_LOCK_ASSERT_HELD(pp);
1856 	if (pp->pp_u_bft_hash_table != NULL) {
1857 		return 0;
1858 	}
1859 
1860 	/* allocated-address hash table */
1861 	pp->pp_u_bft_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
1862 	    zalloc_noblock(pp_u_htbl_zone);
1863 	if (pp->pp_u_bft_hash_table == NULL) {
1864 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1865 		err = ENOMEM;
1866 		goto fail;
1867 	}
1868 
1869 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1870 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1871 	}
1872 
1873 fail:
1874 	return err;
1875 }
1876 
1877 static void
pp_destroy_upp_bft_locked(struct kern_pbufpool * pp)1878 pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
1879 {
1880 	PP_LOCK_ASSERT_HELD(pp);
1881 	if (pp->pp_u_bft_hash_table != NULL) {
1882 		/* purge anything that's left */
1883 		pp_purge_upp_bft_locked(pp, -1);
1884 
1885 #if (DEBUG || DEVELOPMENT)
1886 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1887 			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
1888 		}
1889 #endif /* DEBUG || DEVELOPMENT */
1890 
1891 		zfree(pp_u_htbl_zone, pp->pp_u_bft_hash_table);
1892 		pp->pp_u_bft_hash_table = NULL;
1893 	}
1894 	ASSERT(pp->pp_u_bftinuse == 0);
1895 }
1896 
1897 void
pp_insert_upp_bft(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1898 pp_insert_upp_bft(struct kern_pbufpool *pp,
1899     struct __kern_buflet *kbft, pid_t pid)
1900 {
1901 	PP_LOCK(pp);
1902 	pp_insert_upp_bft_locked(pp, kbft, pid);
1903 	PP_UNLOCK(pp);
1904 }
1905 
1906 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1907 pp_isempty_upp(struct kern_pbufpool *pp)
1908 {
1909 	boolean_t isempty;
1910 
1911 	PP_LOCK(pp);
1912 	isempty = (pp->pp_u_bufinuse == 0);
1913 	PP_UNLOCK(pp);
1914 
1915 	return isempty;
1916 }
1917 
/*
 * (Re)initialize a freshly allocated metadata object as a kernel
 * quantum/packet, optionally attaching `bufcnt' buflets drawn from
 * *blist when the pool allocates buffers on demand.  Returns the
 * initialized kernel quantum, or NULL if buflet construction fails
 * (in which case the caller owns the cleanup of mdp and *blist).
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the kernel quantum lives immediately after the preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools have no user-visible shadow */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach bufcnt buflets from *blist (buffer-on-demand pools only) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		/* auxiliary objects must have been attached at construction */
		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/*
			 * Embedded buflet is unused in this mode; start
			 * the walk at the first chained buflet instead.
			 */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* chain must be exactly bufcnt long (or absent if 0) */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
1998 
1999 /*
2000  * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
2001  * packet descriptor cache with no buffer attached and a buflet cache with
2002  * cpu layer caching enabled. While operating in this mode, we can call
2003  * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
2004  * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
2005  * descriptor with no attached buffer from the metadata cache.
2006  * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
2007  * from their respective caches and constructs the packet on behalf of the
2008  * caller.
2009  */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *array, uint32_t num, boolean_t tagged, alloc_cb_func_t cb,
    const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    (allocp * bufcnt), skmflag);
	}

	/*
	 * Walk the metadata freelist, initializing each descriptor and
	 * consuming buflets from blist as needed.  On the first
	 * initialization failure, everything not yet handed out is
	 * returned to its cache and the loop stops.
	 */
	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach head from the freelist before initializing it */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* unwind: free leftover buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		/* store either a tagged handle or the raw pointer */
		if (tagged) {
			*array = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array = (uint64_t)kqum;
		}

		/* per-packet callback receives the output slot index */
		if (cb != NULL) {
			(cb)(*array, (num - need), ctx);
		}

		++array;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	/* number of packets actually produced (may be < num) */
	return num - need;
}
2077 
2078 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2079 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2080 {
2081 	uint64_t kpkt = 0;
2082 
2083 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2084 	    NULL, NULL, skmflag);
2085 
2086 	return kpkt;
2087 }
2088 
2089 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2090 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2091     uint64_t *array, uint32_t *size, boolean_t tagged, alloc_cb_func_t cb,
2092     const void *ctx, uint32_t skmflag)
2093 {
2094 	uint32_t i, n;
2095 	int err;
2096 
2097 	ASSERT(array != NULL && size > 0);
2098 
2099 	n = *size;
2100 	*size = 0;
2101 
2102 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2103 	    cb, ctx, skmflag);
2104 	*size = i;
2105 
2106 	if (__probable(i == n)) {
2107 		err = 0;
2108 	} else if (i != 0) {
2109 		err = EAGAIN;
2110 	} else {
2111 		err = ENOMEM;
2112 	}
2113 
2114 	return err;
2115 }
2116 
/*
 * Batch-allocate `num' packets and enqueue them onto `pktq'.  Packet
 * pools only (not plain quanta).  Returns 0 on full success, EAGAIN
 * if only some packets could be allocated, ENOMEM if none.
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    (allocp * bufcnt), skmflag);
	}

	/*
	 * Initialize each descriptor off the freelist and enqueue it;
	 * on the first failure, return all unconsumed objects to their
	 * caches and stop.
	 */
	while (plist != NULL) {
		struct skmem_obj *plistn;

		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* unwind: free leftover buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		KPKTQ_ENQUEUE(pktq, kpkt);

		/* per-packet callback receives the completion index */
		if (cb != NULL) {
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2188 
2189 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2190 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2191     uint32_t skmflag)
2192 {
2193 	uint32_t bufcnt = pp->pp_max_frags;
2194 	uint64_t kpkt = 0;
2195 
2196 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2197 		bufcnt =
2198 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2199 		ASSERT(bufcnt <= UINT16_MAX);
2200 	}
2201 
2202 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2203 	    NULL, NULL, skmflag);
2204 
2205 	return kpkt;
2206 }
2207 
/*
 * Tear down a kernel quantum prior to returning its metadata to the
 * cache.  Detached mbufs/packets are either handed back through *mp /
 * *kpp (for the caller to batch-free) or freed here.  For
 * buffer-on-demand pools, buflets are accumulated onto the three
 * blist chains for batched freeing by the caller.  Returns the
 * metadata preamble to link onto the caller's freelist.
 */
__attribute__((always_inline))
static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
    struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
    struct skmem_obj **blist_large, struct skmem_obj **blist_raw)
{
	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);

	ASSERT(SK_PTR_TAG(kqum) == 0);

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);

		/* fire any pending TX completion callbacks first */
		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
			__packet_perform_tx_completion_callbacks(
				SK_PKT2PH(kpkt), NULL);
		}
		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
			/* an attached mbuf: pass back via *mp or free now */
			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
			ASSERT(kpkt->pkt_mbuf != NULL);
			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
			if (mp != NULL) {
				ASSERT(*mp == NULL);
				*mp = kpkt->pkt_mbuf;
			} else {
				m_freem(kpkt->pkt_mbuf);
			}
			KPKT_CLEAR_MBUF_DATA(kpkt);
		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
			/* an attached packet: pass back via *kpp or free now */
			ASSERT(kpkt->pkt_pkt != NULL);
			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
			if (kpp != NULL) {
				ASSERT(*kpp == NULL);
				*kpp = kpkt->pkt_pkt;
			} else {
				/* can only recurse once */
				ASSERT((kpkt->pkt_pkt->pkt_pflags &
				    PKT_F_PKT_DATA) == 0);
				pp_free_packet_single(kpkt->pkt_pkt);
			}
			KPKT_CLEAR_PKT_DATA(kpkt);
		}
		ASSERT(kpkt->pkt_nextpkt == NULL);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
		break;
	}
	default:
		break;
	}

	/* detach buflets onto the batch freelists (on-demand pools only) */
	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def,
		    blist_large, blist_raw);
	}
	return mdp;
}
2267 
/*
 * Free a chain of packets (linked via pkt_nextpkt) back to their pool
 * in one batched operation.  Attached mbufs and packets are collected
 * into side lists and freed afterwards.  If npkt is non-NULL it
 * receives the number of packets freed.  All packets in the chain must
 * belong to the same pool as the first one.
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;
	struct skmem_obj *blist_def = NULL;
	struct skmem_obj *blist_large = NULL;
	struct skmem_obj *blist_raw = NULL;
	struct skmem_obj **list = &top;	/* tail pointer of metadata list */
	struct mbuf *mtop = NULL;
	struct mbuf **mp = &mtop;	/* tail pointer of attached-mbuf list */
	struct __kern_packet *kptop = NULL;
	struct __kern_packet **kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		/* finalize; may emit an attached mbuf/packet via mp/kpp */
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_large, &blist_raw);

		/* append the metadata to the batch freelist */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance tail pointers past any newly collected objects */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	/* return all metadata and buflets to their caches in batches */
	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist_def != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
		blist_def = NULL;
	}
	if (blist_large != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
		blist_large = NULL;
	}
	if (blist_raw != NULL) {
		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
		blist_raw = NULL;
	}
	/* free any mbufs that were attached to the packets */
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	/* free attached packets recursively (chain depth is bounded) */
	if (kptop != NULL) {
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2341 
2342 void
pp_free_pktq(struct pktq * pktq)2343 pp_free_pktq(struct pktq *pktq)
2344 {
2345 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2346 		return;
2347 	}
2348 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2349 	pp_free_packet_chain(pkt, NULL);
2350 	KPKTQ_DISPOSE(pktq);
2351 }
2352 
/*
 * Free `num' packets whose handles are stored in `array' back to pool
 * `pp' in one batched operation, zeroing each slot as it is consumed.
 * Attached mbufs and packets are collected into side lists and freed
 * afterwards.  All handles must belong to `pp'.
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;
	struct skmem_obj *blist_def = NULL;
	struct skmem_obj *blist_large = NULL;
	struct skmem_obj *blist_raw = NULL;
	struct skmem_obj **list = &top;	/* tail pointer of metadata list */
	struct mbuf *mtop = NULL;
	struct mbuf **mp = &mtop;	/* tail pointer of attached-mbuf list */
	struct __kern_packet *kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		/* finalize; may emit an attached mbuf/packet via mp/kpp */
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_large, &blist_raw);

		/* append metadata to the batch freelist; clear the slot */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance tail pointers past any newly collected objects */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	/* return all metadata and buflets to their caches in batches */
	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist_def != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
		blist_def = NULL;
	}
	if (blist_large != NULL) {
		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
		blist_large = NULL;
	}
	if (blist_raw != NULL) {
		skmem_cache_batch_free(pp->pp_raw_kbft_cache, blist_raw);
		blist_raw = NULL;
	}
	/* free any mbufs that were attached to the packets */
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	/* free attached packets as a chain */
	if (kptop != NULL) {
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2420 
2421 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2422 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2423 {
2424 	pp_free_packet_array(pp, &kqum, 1);
2425 }
2426 
2427 void
pp_free_packet_batch(const kern_pbufpool_t pp,uint64_t * array,uint32_t size)2428 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *array, uint32_t size)
2429 {
2430 	pp_free_packet_array(pp, array, size);
2431 }
2432 
2433 void
pp_free_packet_single(struct __kern_packet * pkt)2434 pp_free_packet_single(struct __kern_packet *pkt)
2435 {
2436 	ASSERT(pkt->pkt_nextpkt == NULL);
2437 	pp_free_packet(__DECONST(struct kern_pbufpool *,
2438 	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2439 }
2440 
/*
 * Allocate a single buffer from the pool's default or large buffer
 * cache and fill in its object info.  Returns the buffer address, or
 * 0 on failure.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		/* inject a failure: release the buffer we just got */
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			skmem_cache_free(skm, (void *)baddr);
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* populate *oi (segment, index, bufctl) for the caller */
	skmem_cache_get_obj_info(skm, (void *)baddr, oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2479 
2480 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2481 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2482     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2483 {
2484 	struct skmem_obj_info oib;
2485 
2486 	VERIFY(pp != NULL && baddr != NULL);
2487 	VERIFY((seg != NULL) == (idx != NULL));
2488 
2489 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2490 		return ENOTSUP;
2491 	}
2492 
2493 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2494 	if (__improbable(*baddr == 0)) {
2495 		return ENOMEM;
2496 	}
2497 
2498 	if (seg != NULL) {
2499 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2500 		*seg = SKMEM_OBJ_SEG(&oib);
2501 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2502 	}
2503 	return 0;
2504 }
2505 
2506 void
pp_free_buffer(const kern_pbufpool_t pp,mach_vm_address_t addr)2507 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2508 {
2509 	ASSERT(pp != NULL && addr != 0);
2510 	skmem_cache_free(PP_BUF_CACHE_DEF(pp), (void *)addr);
2511 }
2512 
/*
 * Batch-allocate `num' buflets into `array'.  Depending on `flags',
 * the buflets come from the raw cache (no buffer attached), or from
 * the default/large external buflet caches (buffer pre-attached).
 * Returns the number of buflets actually produced (may be < num).
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
    uint32_t num, uint32_t skmflag, uint32_t flags)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *list;
	struct skmem_cache *skm = NULL;
	boolean_t attach_buffer = (flags & PP_ALLOC_BFT_ATTACH_BUFFER) != 0;
	boolean_t large = (flags & PP_ALLOC_BFT_LARGE) != 0;

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* large buflets require the pool to define a large buffer size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);
	ASSERT(pp->pp_raw_kbft_cache != NULL || attach_buffer);

	/* pick the cache matching the requested buflet flavor */
	if (!attach_buffer) {
		skm = pp->pp_raw_kbft_cache;
	} else {
		skm = large ? PP_KBFT_CACHE_LARGE(pp) :
		    PP_KBFT_CACHE_DEF(pp);
	}
	allocd = skmem_cache_batch_alloc(skm, &list, num, skmflag);

	/* initialize each buflet off the freelist and store its handle */
	while (list != NULL) {
		struct skmem_obj *listn;

		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;
		if (attach_buffer) {
			KBUF_EXT_INIT(kbft, pp);
		} else {
			RAW_KBUF_EXT_INIT(kbft);
		}
		*array = (uint64_t)kbft;
		++array;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2559 
2560 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,uint32_t flags)2561 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2562     uint32_t flags)
2563 {
2564 	uint64_t bft;
2565 
2566 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, flags))) {
2567 		return ENOMEM;
2568 	}
2569 	*kbft = (kern_buflet_t)bft;
2570 	return 0;
2571 }
2572 
2573 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * array,uint32_t * size,uint32_t skmflag,uint32_t flags)2574 pp_alloc_buflet_batch(struct kern_pbufpool *pp, uint64_t *array,
2575     uint32_t *size, uint32_t skmflag, uint32_t flags)
2576 {
2577 	uint32_t i, n;
2578 	int err;
2579 
2580 	ASSERT(array != NULL && size > 0);
2581 
2582 	n = *size;
2583 	*size = 0;
2584 
2585 	i = pp_alloc_buflet_common(pp, array, n, skmflag, flags);
2586 	*size = i;
2587 
2588 	if (__probable(i == n)) {
2589 		err = 0;
2590 	} else if (i != 0) {
2591 		err = EAGAIN;
2592 	} else {
2593 		err = ENOMEM;
2594 	}
2595 
2596 	return err;
2597 }
2598 
/*
 * Return a buflet to its cache, releasing its buffer when this was the
 * last reference.  External (chained) buflets from the raw cache own a
 * buffer attached after construction and must release it here; other
 * external buflets keep their construction-time buffer.  Embedded
 * buflets only release their buffer.
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	/* must already be unlinked from any buflet chain */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be on a user-pool hash bucket */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);

		/* raw buflet has a buffer attached after construction */
		if (BUFLET_FROM_RAW_BFLT_CACHE(kbft)) {
			uint32_t usecnt = 0;
			void *objaddr = kbft->buf_objaddr;
			/* drop our reference; usecnt is the remaining count */
			KBUF_DTOR(kbft, usecnt);
			SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
			    SK_KVA(pp), SK_KVA(objaddr), usecnt);
			if (__improbable(usecnt == 0)) {
				skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
				    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
				    objaddr);
			}
		}

		/*
		 * non-raw external buflet has buffer attached at construction,
		 * so we don't free the buffer here.
		 */
		skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
		    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
		    (void *)kbft);
	} else if (__probable(kbft->buf_addr != 0)) {
		/* embedded buflet: only the buffer itself is released */
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		if (__probable(usecnt == 0)) {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2651 
2652 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2653 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2654 {
2655 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2656 	ASSERT(pp != NULL && kbft != NULL);
2657 	pp_free_buflet_common(pp, kbft);
2658 }
2659 
2660 void
pp_reap_caches(boolean_t purge)2661 pp_reap_caches(boolean_t purge)
2662 {
2663 	skmem_cache_reap_now(pp_opt_cache, purge);
2664 	skmem_cache_reap_now(pp_flow_cache, purge);
2665 	skmem_cache_reap_now(pp_compl_cache, purge);
2666 }
2667