xref: /xnu-8020.101.4/bsd/skywalk/packet/pbufpool.c (revision e7776783b89a353188416a9a346c6cdb4928faad)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 
/*
 * Forward declarations for file-local helpers.
 */
static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
static void pp_free(struct kern_pbufpool *);
static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
    uint64_t *, uint32_t, boolean_t, alloc_cb_func_t, const void *, uint32_t);
static void pp_free_packet_array(struct kern_pbufpool *, uint64_t *, uint32_t);
static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
    struct skmem_obj_info *, void *, uint32_t);
static void pp_metadata_dtor(void *, void *);
static int pp_metadata_construct(struct __kern_quantum *,
    struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
    uint16_t, boolean_t, struct skmem_obj **);
static void pp_metadata_destruct(struct __kern_quantum *,
    struct kern_pbufpool *, boolean_t);
static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
    struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
    struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
    struct skmem_obj **);
static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
static void pp_destroy_upp_locked(struct kern_pbufpool *);
static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
    struct skmem_obj_info *oi, uint32_t skmflag);
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
    uint32_t num, uint32_t skmflag);
65 
#define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */

/*
 * Since the inputs are small (indices to the metadata region), we can use
 * Knuth's multiplicative hash method which is fast and good enough.  Here
 * we multiply the input by the golden ratio of 2^32.  See "The Art of
 * Computer Programming", section 6.4.
 */
#define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
	(((_i) * 2654435761U) & (_m))
/* bucket in the user metadata hash table for index _i */
#define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
/* bucket in the user buflet hash table for index _i */
#define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
	KERN_PBUFPOOL_U_HASH_SIZE - 1)])

/* zone for struct kern_pbufpool allocations (zeroed on free) */
static ZONE_DEFINE(pp_zone, SKMEM_ZONE_PREFIX ".mem.pp",
    sizeof(struct kern_pbufpool), ZC_ZFREE_CLEARMEM);

#define PP_U_HTBL_SIZE  \
	(sizeof(struct kern_pbufpool_u_bkt) * KERN_PBUFPOOL_U_HASH_SIZE)
/* zone for the per-pool user metadata/buflet hash tables */
static ZONE_DEFINE(pp_u_htbl_zone, SKMEM_ZONE_PREFIX ".mem.pp.htbl",
    PP_U_HTBL_SIZE, ZC_ZFREE_CLEARMEM);

static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */

/* module init state; manipulated under SK_LOCK by pp_init()/pp_fini() */
static int __pp_inited = 0;
96 
97 int
pp_init(void)98 pp_init(void)
99 {
100 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
101 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
102 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
103 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
104 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
105 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
106 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
107 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
108 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
109 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
110 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
111 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
112 
113 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
114 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
115 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
116 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
117 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
118 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
119 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
120 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
121 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
122 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
123 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
124 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
125 
126 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
127 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
128 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
129 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
130 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
131 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
132 
133 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
134 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
135 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
136 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
137 
138 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
139 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
140 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
141 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
142 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
143 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
144 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
145 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
146 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
147 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
148 
149 	/*
150 	 * Assert that the value of common packet flags between mbuf and
151 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
152 	 */
153 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
154 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
155 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
156 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
157 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
158 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
159 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
160 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
161 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
162 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
163 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
164 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
165 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
166 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
167 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
168 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
169 	/*
170 	 * Assert packet flags shared with userland.
171 	 */
172 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
173 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
174 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT));
175 
176 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
177 	    offsetof(struct __kern_packet, pkt_length));
178 
179 	/*
180 	 * Due to the use of tagged pointer, we need the size of
181 	 * the metadata preamble structure to be multiples of 16.
182 	 * See SK_PTR_TAG() definition for details.
183 	 */
184 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
185 	    (sizeof(struct __metadata_preamble) % 16) == 0);
186 
187 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
188 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
189 
190 	/*
191 	 * Batch alloc/free requires linking the objects together;
192 	 * make sure that the fields are at the same offset since
193 	 * we cast the object to struct skmem_obj.
194 	 */
195 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
196 	    offsetof(struct skmem_obj, mo_next));
197 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
198 	    offsetof(struct skmem_obj, mo_next));
199 
200 	SK_LOCK_ASSERT_HELD();
201 	ASSERT(!__pp_inited);
202 
203 	pp_opt_cache = skmem_cache_create("pkt.opt",
204 	    sizeof(struct __packet_opt), sizeof(uint64_t),
205 	    NULL, NULL, NULL, NULL, NULL, 0);
206 	pp_flow_cache = skmem_cache_create("pkt.flow",
207 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
208 	    NULL, NULL, NULL, NULL, NULL, 0);
209 	pp_compl_cache = skmem_cache_create("pkt.compl",
210 	    sizeof(struct __packet_compl), sizeof(uint64_t),
211 	    NULL, NULL, NULL, NULL, NULL, 0);
212 
213 	return 0;
214 }
215 
216 void
pp_fini(void)217 pp_fini(void)
218 {
219 	SK_LOCK_ASSERT_HELD();
220 
221 	if (__pp_inited) {
222 		if (pp_compl_cache != NULL) {
223 			skmem_cache_destroy(pp_compl_cache);
224 			pp_compl_cache = NULL;
225 		}
226 		if (pp_flow_cache != NULL) {
227 			skmem_cache_destroy(pp_flow_cache);
228 			pp_flow_cache = NULL;
229 		}
230 		if (pp_opt_cache != NULL) {
231 			skmem_cache_destroy(pp_opt_cache);
232 			pp_opt_cache = NULL;
233 		}
234 
235 		__pp_inited = 0;
236 	}
237 }
238 
239 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)240 pp_alloc(zalloc_flags_t how)
241 {
242 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
243 
244 	if (pp) {
245 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
246 	}
247 	return pp;
248 }
249 
/*
 * Final teardown of a pool, reached from pp_release_locked() when the
 * last reference is dropped.  Caller must hold pp_lock on entry:
 * pp_destroy() runs under the lock, the lock is then released and
 * destroyed, and the structure returned to pp_zone.  The caller must
 * not touch pp afterwards.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	/* lock must be destroyed only after it has been dropped above */
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
262 
/*
 * Take a reference on the pool.  Caller must hold pp_lock.  The
 * post-increment assert catches reference-count wraparound.
 */
void
pp_retain_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp->pp_refcnt++;
	ASSERT(pp->pp_refcnt != 0);
}
271 
/*
 * Take a reference on the pool, acquiring and releasing pp_lock
 * around the locked variant.
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
279 
280 boolean_t
pp_release_locked(struct kern_pbufpool * pp)281 pp_release_locked(struct kern_pbufpool *pp)
282 {
283 	uint32_t oldref = pp->pp_refcnt;
284 
285 	PP_LOCK_ASSERT_HELD(pp);
286 
287 	ASSERT(pp->pp_refcnt != 0);
288 	if (--pp->pp_refcnt == 0) {
289 		pp_free(pp);
290 	}
291 
292 	return oldref == 1;
293 }
294 
295 boolean_t
pp_release(struct kern_pbufpool * pp)296 pp_release(struct kern_pbufpool *pp)
297 {
298 	boolean_t lastref;
299 
300 	PP_LOCK(pp);
301 	if (!(lastref = pp_release_locked(pp))) {
302 		PP_UNLOCK(pp);
303 	}
304 
305 	return lastref;
306 }
307 
/*
 * Mark the pool closed and drop the caller's reference.  A pool may
 * be closed only once (asserted).  If this drops the last reference,
 * pp_release_locked() frees the pool and consumes the lock; otherwise
 * we unlock here.
 */
void
pp_close(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	ASSERT(pp->pp_refcnt > 0);
	ASSERT(!(pp->pp_flags & PPF_CLOSED));
	pp->pp_flags |= PPF_CLOSED;
	if (!pp_release_locked(pp)) {
		PP_UNLOCK(pp);
	}
}
319 
/*
 * Compute and configure the skmem region parameters for a packet
 * buffer pool: user/kernel metadata regions (umd/kmd), the buffer
 * region (buf), and optionally the kernel/user buflet regions
 * (kbft/ubft, packet-type pools only).
 *
 * md_type selects quantum vs. packet metadata sizing; md_cnt and
 * max_frags size the metadata objects; buf_size/buf_cnt size the
 * buffer region.  Each *_srp is updated in place and finalized via
 * skmem_region_params_config(), so later regions can be sized from
 * the configured (srp_c_*) values of earlier ones.
 */
void
pp_regions_params_adjust(struct skmem_region_params *buf_srp,
    struct skmem_region_params *kmd_srp, struct skmem_region_params *umd_srp,
    struct skmem_region_params *kbft_srp, struct skmem_region_params *ubft_srp,
    nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
    uint16_t max_frags, uint32_t buf_size, uint32_t buf_cnt)
{
	uint32_t md_size = 0;

	ASSERT(max_frags != 0);

	/* metadata object size depends on the metadata type */
	switch (md_type) {
	case NEXUS_META_TYPE_QUANTUM:
		md_size = NX_METADATA_QUANTUM_SZ;
		break;
	case NEXUS_META_TYPE_PACKET:
		md_size = NX_METADATA_PACKET_SZ(max_frags);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* add preamble size to metadata obj size */
	md_size += METADATA_PREAMBLE_SZ;
	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);

	/* user metadata region: mirrors the kernel metadata region */
	umd_srp->srp_md_type = md_type;
	umd_srp->srp_md_subtype = md_subtype;
	umd_srp->srp_r_obj_cnt = md_cnt;
	umd_srp->srp_r_obj_size = md_size;
	umd_srp->srp_max_frags = max_frags;
	skmem_region_params_config(umd_srp);

	/* kernel metadata region */
	kmd_srp->srp_md_type = md_type;
	kmd_srp->srp_md_subtype = md_subtype;
	kmd_srp->srp_r_obj_cnt = md_cnt;
	kmd_srp->srp_r_obj_size = md_size;
	kmd_srp->srp_max_frags = max_frags;
	skmem_region_params_config(kmd_srp);

	/* buffer region: at least one buffer per configured metadata */
	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
	buf_srp->srp_r_obj_size = buf_size;
	buf_srp->srp_cflags &=
	    ~(SKMEM_REGION_CR_MONOLITHIC | SKMEM_REGION_CR_PERSISTENT);
	skmem_region_params_config(buf_srp);

	if (kbft_srp != NULL) {
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);

		/*
		 * Ideally we want the number of buflets to be
		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
		 * so that we have enough buflets when multi-buflet and
		 * shared buffer object is used.
		 * Currently multi-buflet is being used only by user pool
		 * which doesn't support shared buffer object, hence to reduce
		 * the number of objects we are restricting the number of
		 * buflets to the number of buffers.
		 */
		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt;
		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
		    sizeof(struct __user_buflet));
		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
		skmem_region_params_config(kbft_srp);
		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt);
	}

	/* user buflet region: mirrors the kernel buflet region */
	if (ubft_srp != NULL) {
		ASSERT(kbft_srp != NULL);
		ubft_srp->srp_r_obj_cnt = kbft_srp->srp_r_obj_cnt;
		ubft_srp->srp_r_obj_size = kbft_srp->srp_r_obj_size;
		ubft_srp->srp_cflags = umd_srp->srp_cflags;
		skmem_region_params_config(ubft_srp);
		ASSERT(kbft_srp->srp_c_obj_cnt == ubft_srp->srp_c_obj_cnt);
	}

	/* make sure each metadata can be paired with a buffer */
	ASSERT(kmd_srp->srp_c_obj_cnt == umd_srp->srp_c_obj_cnt);
	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
}
401 
/*
 * Construct the kernel (and, unless kernel-only, user) metadata object
 * at kqum/uqum, then attach bufcnt buffers.  For packet-type pools,
 * "raw" means the object is fresh from the slab: the auxiliary
 * __packet_opt/__flow/__packet_compl structures are allocated here;
 * otherwise the previously attached ones are reused.  Buffers come
 * either from a direct buffer allocation (single native buflet) or,
 * for pools with buffer-on-demand, from the caller-supplied list of
 * pre-constructed buflets (*blist), which is advanced as buflets are
 * consumed.  Returns 0 on success or ENOMEM after destructing kqum.
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    boolean_t raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *opt;
		struct __flow *flow;
		struct __packet_compl *compl;
		uint64_t pflags;

		if (raw) {
			/* fresh slab object: allocate auxiliary structures */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reconstruct: reuse previously attached aux objects */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	/* attach bufcnt buffers, starting at the native buflet */
	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}
			/* unlink the buflet from the caller's list */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			/* chain the new buflet after the previous one */
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* undo any partial construction (buffers attached so far) */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
522 
/*
 * Common slab-constructor path for metadata objects.  oi0 describes
 * the kernel metadata slab object, oim0 the mirrored user object
 * (NULL for kernel-only pools).  When no_buflet is FALSE, up to
 * pp_max_frags pre-constructed buflets are batch-allocated and handed
 * to pp_metadata_construct() for buffer-on-demand pools.  Returns 0
 * or an errno from pp_metadata_construct().
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    boolean_t no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* the kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(pp->pp_kbft_cache, &blist,
		    bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any unused buflets from the batch allocation above */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(pp->pp_kbft_cache, blist);
		blist = NULL;
	}
	return error;
}
606 
/*
 * skmem constructor callback for metadata with no buflets attached
 * (pp_metadata_ctor_common() forces bufcnt to 0 when no_buflet is
 * TRUE).
 */
static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, TRUE);
}
613 
/*
 * skmem constructor callback for metadata with the maximum number of
 * buflets (pp_max_frags) attached at construction time.
 */
static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
{
	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, FALSE);
}
620 
/*
 * Common destructor path for a metadata object: detaches and frees
 * the first (native) buflet's buffer, hands any chained external
 * buflets back to the caller by prepending them to *blist, and --
 * when the object is going back to the slab (raw == TRUE) -- releases
 * the packet's auxiliary opt/flow/compl structures.
 */
__attribute__((always_inline))
static void
pp_metadata_destruct_common(struct __kern_quantum *kqum,
    struct kern_pbufpool *pp, boolean_t raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf, *nbuf, *lbuf = NULL;
	boolean_t first_buflet_empty;
	struct skmem_obj *_blist;
	uint16_t bufcnt, i = 0;

	ASSERT(blist != NULL);
	_blist = *blist;

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kpkt->pkt_qum.qum_pp == pp);
		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		bufcnt = kpkt->pkt_bufs_cnt;
		kbuf = &kqum->qum_buf[0];
		/*
		 * special handling for empty first buflet.
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kqum->qum_pp == pp);
		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
		ASSERT(kqum->qum_ksd == NULL);
		kbuf = &kqum->qum_buf[0];
		/*
		 * XXX: Special handling for quantum as we don't currently
		 * define bufs_{cnt,max} there.  Given that we support at
		 * most only 1 buflet for now, check if buf_addr is non-NULL.
		 * See related code in pp_metadata_construct().
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		bufcnt = first_buflet_empty ? 0 : 1;
		break;
	}

	/* detach the chain of external buflets, handing it to *blist */
	nbuf = __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr);
	if (nbuf != NULL) {
		*blist = (struct skmem_obj *)(void *)nbuf;
	}
	BUF_NBFT_ADDR(kbuf, 0);
	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
	if (!first_buflet_empty) {
		/* only the native first buflet's buffer is freed here */
		pp_free_buflet_common(pp, kbuf);
		++i;
	}
	/* walk the chain to find its tail and count the buflets */
	while (nbuf != NULL) {
		lbuf = nbuf;
		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
		nbuf = __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr);
		++i;
	}
	ASSERT(i == bufcnt);
	/* splice the detached chain onto the caller's original list */
	if (lbuf != NULL) {
		((struct skmem_obj *)(void *)lbuf)->mo_next = _blist;
	}

	/* if we're about to return this object to the slab, clean it up */
	if (raw) {
		switch (pp->pp_md_type) {
		case NEXUS_META_TYPE_PACKET: {
			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

			ASSERT(kpkt->pkt_com_opt != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
			if (kpkt->pkt_com_opt != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
				skmem_cache_free(pp_opt_cache,
				    kpkt->pkt_com_opt);
				kpkt->pkt_com_opt = NULL;
			}
			ASSERT(kpkt->pkt_flow != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
			if (kpkt->pkt_flow != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
				kpkt->pkt_flow = NULL;
			}
			ASSERT(kpkt->pkt_tx_compl != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
			if (kpkt->pkt_tx_compl != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
				skmem_cache_free(pp_compl_cache,
				    kpkt->pkt_tx_compl);
				kpkt->pkt_tx_compl = NULL;
			}
			kpkt->pkt_pflags = 0;
			break;
		}
		default:
			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
			/* nothing to do for quantum (yet) */
			break;
		}
	}
}
736 
/*
 * Destruct a single metadata object and batch-free any external
 * buflet chain that pp_metadata_destruct_common() handed back.
 */
__attribute__((always_inline))
static void
pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
    boolean_t raw)
{
	struct skmem_obj *blist = NULL;

	pp_metadata_destruct_common(kqum, pp, raw, &blist);
	if (blist != NULL) {
		skmem_cache_batch_free(pp->pp_kbft_cache, blist);
	}
}
749 
/*
 * skmem destructor callback: fully destructs (raw == TRUE) the kernel
 * quantum located past the metadata preamble of the slab object.
 * arg is the owning kern_pbufpool.
 */
static void
pp_metadata_dtor(void *addr, void *arg)
{
	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
	    METADATA_PREAMBLE_SZ), arg, TRUE);
}
756 
757 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)758 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
759 {
760 	struct kern_pbufpool *pp = arg;
761 
762 	if (pp->pp_pbuf_seg_ctor != NULL) {
763 		pp->pp_pbuf_seg_ctor(pp, sg, md);
764 	}
765 }
766 
767 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)768 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
769 {
770 	struct kern_pbufpool *pp = arg;
771 
772 	if (pp->pp_pbuf_seg_dtor != NULL) {
773 		pp->pp_pbuf_seg_dtor(pp, sg, md);
774 	}
775 }
776 
777 static int
pp_buflet_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)778 pp_buflet_metadata_ctor(struct skmem_obj_info *oi0,
779     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
780 {
781 #pragma unused (skmflag)
782 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
783 	struct __kern_buflet *kbft;
784 	struct __user_buflet *ubft;
785 	struct skmem_obj_info oib;
786 	mach_vm_address_t baddr;
787 	obj_idx_t oi_idx_reg;
788 
789 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag);
790 	if (__improbable(baddr == 0)) {
791 		return ENOMEM;
792 	}
793 	/*
794 	 * Note that oi0 and oim0 may be stored inside the object itself;
795 	 * so copy what is required to local variables before constructing.
796 	 */
797 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
798 	kbft = SKMEM_OBJ_ADDR(oi0);
799 
800 	if (__probable(!PP_KERNEL_ONLY(pp))) {
801 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
802 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
803 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
804 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
805 		ubft = SKMEM_OBJ_ADDR(oim0);
806 	} else {
807 		ASSERT(oim0 == NULL);
808 		ubft = NULL;
809 	}
810 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
811 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp);
812 	return 0;
813 }
814 
/*
 * skmem destructor for external buflets: detaches the buffer from the
 * buflet via KBUF_DTOR() and returns the buffer object to the buffer
 * cache only when its resulting use count is zero (a non-zero usecnt
 * means the buffer is still referenced elsewhere).
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *kbft = addr;
	void *objaddr = kbft->buf_objaddr;
	struct kern_pbufpool *pp = arg;
	uint32_t usecnt = 0;

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);
	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	/* KBUF_DTOR() detaches the buffer and reports its use count */
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	if (__probable(usecnt == 0)) {
		skmem_cache_free(pp->pp_buf_cache, objaddr);
	}
}
846 
/*
 * Creates a packet buffer pool backed by the supplied skmem region
 * parameters:
 *
 *   buf_srp              buffer region
 *   kmd_srp / umd_srp    kernel/user metadata regions; UMD is skipped
 *                        and KMD left unmirrored for kernel-only pools
 *   kbft_srp / ubft_srp  kernel/user buflet regions; used only when
 *                        PPCREATEF_ONDEMAND_BUF is requested
 *
 * buf_seg_{ctor,dtor} are optional segment callbacks (both NULL or both
 * non-NULL); ctx with ctx_retain/ctx_release form an optional refcounted
 * client context (all NULL or all non-NULL).  ppcreatef is a mask of
 * PPCREATEF_* flags.  Returns a retained pool on success, NULL on
 * failure (partially-built state is torn down via pp_close()).
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params *buf_srp,
    struct skmem_region_params *kmd_srp, struct skmem_region_params *umd_srp,
    struct skmem_region_params *kbft_srp, struct skmem_region_params *ubft_srp,
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	char cname[64];

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/* UMD and KMD regions must describe identically-shaped metadata */
	ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
	ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
	ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
	ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
	ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
	ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
	ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);

	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	buf_size = buf_srp->srp_r_obj_size;

	/* pp_buflet_size is uint16_t, so buffers must fit in 16 bits */
	if (__improbable((buf_size > UINT16_MAX) ||
	    (buf_srp->srp_c_obj_size > UINT16_MAX))) {
		SK_ERR("\"%s\" requested/configured "
		    "(%d/%d) buffer size is too large", name, buf_size,
		    buf_srp->srp_c_obj_size);
		goto failed;
	}

#if (DEBUG || DEVELOPMENT)
	ASSERT(buf_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* take a reference on the client context, if one was supplied */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	pp->pp_buflet_size = (uint16_t)buf_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate PPCREATEF_* request flags into internal PPF_* flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}

	pp_retain(pp);

	/*
	 * Metadata regions {UMD,KMD} magazines layer attribute must match.
	 */
	ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
	    (kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
	/*
	 * Metadata regions {UMD,KMD} persistency attribute must match.
	 */
	ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
	    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));

	md_cflags = ((umd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	/* user metadata region is only needed when userspace can map */
	if (!PP_KERNEL_ONLY(pp) && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	/* buflet regions are required iff buffers are attached on demand */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_r_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_r_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		ASSERT((ubft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((ubft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));

		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* pair each kernel region with its user-visible mirror */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 * With on-demand buffers the ctor attaches no buflets; otherwise
	 * every metadata object is constructed with max_frags buflets.
	 */
	(void) snprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cname, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		(void) snprintf(cname, sizeof(cname), "kbft.%s", name);
		pp->pp_kbft_cache = skmem_cache_create(cname,
		    kbft_srp->srp_c_obj_size, 0, pp_buflet_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (pp->pp_kbft_cache == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	if ((pp->pp_buf_region = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	(void) snprintf(cname, sizeof(cname), "buf.%s", name);
	if ((pp->pp_buf_cache = skmem_cache_create(cname, buf_size, 0,
	    NULL, NULL, NULL, pp, pp->pp_buf_region, SKMEM_CR_NOMAGAZINES)) ==
	    NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	return pp;

failed:
	/* release the client context first, then unwind via pp_close() */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1101 
/*
 * Tears down a packet buffer pool: purges the user packet/buflet hash
 * tables, then destroys the caches and releases the regions created by
 * pp_create().  Caller must hold the pool lock.  NOTE: the teardown
 * order below is deliberate; see the comment ahead of pp_buf_cache.
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (pp->pp_kbft_cache != NULL) {
		skmem_cache_destroy(pp->pp_kbft_cache);
		pp->pp_kbft_cache = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (pp->pp_buf_cache != NULL) {
		skmem_cache_destroy(pp->pp_buf_cache);
		pp->pp_buf_cache = NULL;
	}
	if (pp->pp_buf_region != NULL) {
		skmem_region_release(pp->pp_buf_region);
		pp->pp_buf_region = NULL;
	}

	/* finally, drop the reference on the client-supplied context */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1164 
1165 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1166 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1167 {
1168 	int i, err = 0;
1169 
1170 	if (pp->pp_u_hash_table != NULL) {
1171 		goto done;
1172 	}
1173 
1174 	/* allocated-address hash table */
1175 	pp->pp_u_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
1176 	    zalloc_noblock(pp_u_htbl_zone);
1177 	if (pp->pp_u_hash_table == NULL) {
1178 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1179 		err = ENOMEM;
1180 		goto done;
1181 	}
1182 
1183 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1184 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1185 	}
1186 done:
1187 	return err;
1188 }
1189 
/*
 * Frees the upp hash table after purging any packets still tracked in
 * it (pid -1 purges every owner).  Caller must hold the pool lock.
 */
static void
pp_destroy_upp_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* after the purge, every bucket must be empty */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		zfree(pp_u_htbl_zone, pp->pp_u_hash_table);
		pp->pp_u_hash_table = NULL;
	}
	ASSERT(pp->pp_u_bufinuse == 0);
}
1209 
1210 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1211 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1212 {
1213 	int err = 0;
1214 
1215 	PP_LOCK(pp);
1216 	err = pp_init_upp_locked(pp, can_block);
1217 	if (err) {
1218 		SK_ERR("packet UPP init failed (%d)", err);
1219 		goto done;
1220 	}
1221 	err = pp_init_upp_bft_locked(pp, can_block);
1222 	if (err) {
1223 		SK_ERR("buflet UPP init failed (%d)", err);
1224 		pp_destroy_upp_locked(pp);
1225 		goto done;
1226 	}
1227 	pp_retain_locked(pp);
1228 done:
1229 	PP_UNLOCK(pp);
1230 	return err;
1231 }
1232 
1233 __attribute__((always_inline))
1234 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1235 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1236     struct __kern_buflet *kbft, pid_t pid)
1237 {
1238 	struct kern_pbufpool_u_bft_bkt *bkt;
1239 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1240 
1241 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1242 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1243 	kbe->kbe_buf_pid = pid;
1244 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1245 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1246 	pp->pp_u_bftinuse++;
1247 }
1248 
1249 __attribute__((always_inline))
1250 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1251 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1252     struct __kern_buflet *kbft, pid_t pid)
1253 {
1254 	while (kbft != NULL) {
1255 		pp_insert_upp_bft_locked(pp, kbft, pid);
1256 		kbft = __DECONST(kern_buflet_t, kbft->buf_nbft_addr);
1257 	}
1258 }
1259 
/*
 * Inserts a quantum into the upp hash table, recording the owning pid;
 * also inserts the attached chain of buflets, if any.  Caller must
 * hold the pool lock.
 */
void static inline
pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	struct kern_pbufpool_u_bkt *bkt;
	struct __kern_buflet *kbft;

	/* must not already be owned by a process */
	ASSERT(kqum->qum_pid == (pid_t)-1);
	kqum->qum_pid = pid;

	/* hash on metadata index and link into the bucket */
	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
	pp->pp_u_bufinuse++;

	/* insert any externally-attached (on-demand) buflet chain too */
	kbft = (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr;
	if (kbft != NULL) {
		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
	}
}
1282 
/*
 * Lock-held variant of pp_insert_upp(); caller must hold the pool lock.
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1289 
/*
 * Inserts a quantum (and its buflet chain) into the upp hash table,
 * taking and releasing the pool lock around the insertion.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1297 
1298 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * array,uint32_t num)1299 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid, uint64_t *array,
1300     uint32_t num)
1301 {
1302 	uint32_t i = 0;
1303 
1304 	ASSERT(array != NULL && num > 0);
1305 	PP_LOCK(pp);
1306 	while (num != 0) {
1307 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1308 
1309 		ASSERT(kqum != NULL);
1310 		pp_insert_upp_common(pp, kqum, pid);
1311 		--num;
1312 		++i;
1313 	}
1314 	PP_UNLOCK(pp);
1315 }
1316 
/*
 * Looks up and unlinks the external buflet with region index bft_idx
 * from the upp buflet hash table.  Returns the buflet on success, or
 * NULL if no buflet with that index is tracked (SLIST_FOREACH_SAFE
 * leaves the iterator NULL when the list is exhausted).  Caller must
 * hold the pool lock.
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* relinquish process ownership and unlink */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1338 
1339 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1340 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1341 {
1342 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1343 
1344 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1345 	return kbft;
1346 }
1347 
/*
 * Walks the (user-visible) buflet chain of an externalized packet,
 * removing each buflet from the upp buflet hash table and rebuilding
 * the kernel-side next-buflet links from the validated entries.  The
 * user metadata is untrusted here: the chain length is bounded by the
 * user packet's buflet count (itself capped at pp_max_frags), and any
 * inconsistency (unknown index, short/long chain) terminates the chain
 * and returns ERANGE.  Returns 0 on success or if the packet has no
 * external buflets.  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first chained index comes from the user metadata (untrusted) */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* reject a user-claimed buflet count beyond the pool maximum */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* the embedded buflet counts iff it carries a buffer */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* unknown index: cut the chain at the predecessor */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* relink the kernel-side chain from the validated buflet */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = (struct __kern_buflet_ext *)kbft;
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel-side chain at the last validated buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the claimed count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1411 
/*
 * Looks up and unlinks the quantum with metadata index md_idx from the
 * upp hash table, then detaches and validates its buflet chain.  On
 * success returns the quantum with *err set by the chain removal; if
 * no quantum matches, returns NULL with *err = ERANGE.  Caller must
 * hold the pool lock.
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	/* kqum is NULL after the loop if no entry matched */
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* relinquish process ownership */
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1436 
/*
 * Removes and returns the quantum with metadata index md_idx from the
 * upp hash table, taking the pool lock.  *err is 0 on success, ERANGE
 * on lookup or buflet-chain validation failure.
 */
struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum;

	PP_LOCK(pp);
	kqum = pp_remove_upp_locked(pp, md_idx, err);
	PP_UNLOCK(pp);
	return kqum;
}
1447 
1448 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1449 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1450 {
1451 	struct __kern_quantum *kqum, *tqum;
1452 	struct kern_pbufpool_u_bkt *bkt;
1453 
1454 	PP_LOCK(pp);
1455 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1456 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1457 		if (METADATA_IDX(kqum) == md_idx) {
1458 			break;
1459 		}
1460 	}
1461 	PP_UNLOCK(pp);
1462 
1463 	return kqum;
1464 }
1465 
/*
 * Frees every packet in the upp hash table owned by `pid', or every
 * packet regardless of owner when pid is -1 (pool teardown).  Each
 * packet's buflet chain is detached first, its flags reset, and the
 * packet returned to its owning pool.  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		/* SAFE iterator: entries are removed mid-traversal */
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* reset ownership/state before returning to pool */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			/* free to the packet's own pool, not necessarily pp */
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1499 
/*
 * Frees every external buflet in the upp buflet hash table owned by
 * `pid', or every buflet regardless of owner when pid is -1.  Caller
 * must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		/* SAFE iterator: entries are removed mid-traversal */
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* relinquish ownership and unlink before freeing */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1528 
/*
 * Frees all packets and buflets tracked for process `pid' (or for all
 * processes when pid is -1), taking the pool lock.  Called e.g. when
 * a process with outstanding allocations goes away.
 */
void
pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
{
	PP_LOCK(pp);
	pp_purge_upp_locked(pp, pid);
	pp_purge_upp_bft_locked(pp, pid);
	PP_UNLOCK(pp);
}
1537 
1538 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1539 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1540 {
1541 	int i, err = 0;
1542 
1543 	PP_LOCK_ASSERT_HELD(pp);
1544 	if (pp->pp_u_bft_hash_table != NULL) {
1545 		return 0;
1546 	}
1547 
1548 	/* allocated-address hash table */
1549 	pp->pp_u_bft_hash_table = can_block ? zalloc(pp_u_htbl_zone) :
1550 	    zalloc_noblock(pp_u_htbl_zone);
1551 	if (pp->pp_u_bft_hash_table == NULL) {
1552 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1553 		err = ENOMEM;
1554 		goto fail;
1555 	}
1556 
1557 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1558 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1559 	}
1560 
1561 fail:
1562 	return err;
1563 }
1564 
/*
 * Frees the upp buflet hash table after purging any buflets still
 * tracked in it (pid -1 purges every owner).  Caller must hold the
 * pool lock.
 */
static void
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* after the purge, every bucket must be empty */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		zfree(pp_u_htbl_zone, pp->pp_u_bft_hash_table);
		pp->pp_u_bft_hash_table = NULL;
	}
	ASSERT(pp->pp_u_bftinuse == 0);
}
1584 
/*
 * Inserts a single external buflet into the upp buflet hash table on
 * behalf of `pid', taking the pool lock.
 */
void
pp_insert_upp_bft(struct kern_pbufpool *pp,
    struct __kern_buflet *kbft, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_bft_locked(pp, kbft, pid);
	PP_UNLOCK(pp);
}
1593 
/*
 * Returns TRUE if no packets are currently tracked in the upp hash
 * table (i.e. userspace holds no outstanding packet allocations).
 * Note: only pp_u_bufinuse is consulted, not pp_u_bftinuse.
 */
boolean_t
pp_isempty_upp(struct kern_pbufpool *pp)
{
	boolean_t isempty;

	PP_LOCK(pp);
	isempty = (pp->pp_u_bufinuse == 0);
	PP_UNLOCK(pp);

	return isempty;
}
1605 
/*
 * (Re)initializes a freshly-allocated metadata object (packet or
 * quantum) from its preamble: attaches up to `bufcnt' buflets from
 * *blist when the pool uses on-demand buffers, then resets the kernel
 * metadata and each attached buflet to their initial state.  Returns
 * the kernel quantum on success, or NULL if buflet construction fails
 * (caller frees the metadata and any leftover buflets).
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the kernel quantum lives immediately after the preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools have no user-visible mirror */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach bufcnt buflets from *blist (on-demand pools only) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		/* ctor-allocated companion objects must still be present */
		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/* embedded buflet is empty; chain starts after it */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __DECONST(struct __kern_buflet *,
			    kbuf->buf_nbft_addr);
		}
		/* chain length must match bufcnt exactly */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
1686 
1687 /*
1688  * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
1689  * packet descriptor cache with no buffer attached and a buflet cache with
1690  * cpu layer caching enabled. While operating in this mode, we can call
1691  * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
1692  * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
1693  * descriptor with no attached buffer from the metadata cache.
1694  * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
1695  * from their respective caches and constructs the packet on behalf of the
1696  * caller.
1697  */
/*
 * Batch-allocates up to `num' packets into `array', each with `bufcnt'
 * buflets attached (see the block comment above for on-demand
 * semantics).  When `tagged', entries are SK_PTR_ENCODE'd with
 * type/subtype; otherwise raw pointers.  `cb', if non-NULL, is invoked
 * per packet with its array slot index.  Returns the number of packets
 * actually produced (may be fewer than num, or 0, on allocation
 * failure).
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *array, uint32_t num, boolean_t tagged, alloc_cb_func_t cb,
    const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(pp->pp_kbft_cache, &blist,
		    (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head so it can be initialized standalone */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/*
			 * Init failed (e.g. not enough buflets); return
			 * the leftover buflets and remaining packets to
			 * their caches and stop early.
			 */
			if (blist != NULL) {
				skmem_cache_batch_free(pp->pp_kbft_cache,
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		if (tagged) {
			*array = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array = (uint64_t)kqum;
		}

		/* (num - need) is the index of the slot just filled */
		if (cb != NULL) {
			(cb)(*array, (num - need), ctx);
		}

		++array;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* every allocated buflet must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	return num - need;
}
1765 
/*
 * Allocates a single (untagged) packet with `bufcnt' buflets attached.
 * Returns the packet as a uint64_t handle, or 0 on failure.
 */
uint64_t
pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
{
	uint64_t kpkt = 0;

	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
	    NULL, NULL, skmflag);

	return kpkt;
}
1776 
1777 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)1778 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
1779     uint64_t *array, uint32_t *size, boolean_t tagged, alloc_cb_func_t cb,
1780     const void *ctx, uint32_t skmflag)
1781 {
1782 	uint32_t i, n;
1783 	int err;
1784 
1785 	ASSERT(array != NULL && size > 0);
1786 
1787 	n = *size;
1788 	*size = 0;
1789 
1790 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
1791 	    cb, ctx, skmflag);
1792 	*size = i;
1793 
1794 	if (__probable(i == n)) {
1795 		err = 0;
1796 	} else if (i != 0) {
1797 		err = EAGAIN;
1798 	} else {
1799 		err = ENOMEM;
1800 	}
1801 
1802 	return err;
1803 }
1804 
/*
 * Batch-allocates `num' packets, each with `bufcnt' buflets attached,
 * and enqueues them onto `pktq'.  `cb', if non-NULL, is invoked per
 * packet with its sequence index.  Returns 0 if all were allocated,
 * EAGAIN on a partial allocation, or ENOMEM if none could be.  Only
 * valid for NEXUS_META_TYPE_PACKET pools.
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *plist, *blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist, num,
	    skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(pp->pp_kbft_cache, &blist,
		    (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head so it can be initialized standalone */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/*
			 * Init failed; return leftover buflets and the
			 * remaining packets to their caches, stop early.
			 */
			if (blist != NULL) {
				skmem_cache_batch_free(pp->pp_kbft_cache,
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

		KPKTQ_ENQUEUE(pktq, kpkt);

		/* (num - need) is the sequence index of this packet */
		if (cb != NULL) {
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* every allocated buflet must have been consumed or freed */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
1876 
1877 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)1878 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
1879     uint32_t skmflag)
1880 {
1881 	uint32_t bufcnt = pp->pp_max_frags;
1882 	uint64_t kpkt = 0;
1883 
1884 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1885 		bufcnt =
1886 		    SK_ROUNDUP(size, pp->pp_buflet_size) / pp->pp_buflet_size;
1887 		ASSERT(bufcnt <= UINT16_MAX);
1888 	}
1889 
1890 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
1891 	    NULL, NULL, skmflag);
1892 
1893 	return kpkt;
1894 }
1895 
/*
 * Finalize a quantum's per-packet state prior to returning its metadata
 * to the cache.  Pending TX completion callbacks are run first.  An
 * attached mbuf (PKT_F_MBUF_DATA) or attached packet (PKT_F_PKT_DATA)
 * is either handed back to the caller through "mp"/"kpp" (caller chains
 * and frees them in a batch) or freed here when the corresponding out
 * parameter is NULL.  When the pool uses buffers on demand, buflets are
 * destructed into "*blist" for batch freeing by the caller.  Returns
 * the metadata preamble for "kqum".
 */
__attribute__((always_inline))
static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
    struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist)
{
	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);

	ASSERT(SK_PTR_TAG(kqum) == 0);

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);

		/* run any pending TX completion callbacks first */
		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
			__packet_perform_tx_completion_callbacks(
				SK_PKT2PH(kpkt), NULL);
		}
		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
			/* mbuf and packet attachments are mutually exclusive */
			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
			ASSERT(kpkt->pkt_mbuf != NULL);
			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
			if (mp != NULL) {
				/* hand the mbuf back to the caller */
				ASSERT(*mp == NULL);
				*mp = kpkt->pkt_mbuf;
			} else {
				m_freem(kpkt->pkt_mbuf);
			}
			KPKT_CLEAR_MBUF_DATA(kpkt);
		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
			ASSERT(kpkt->pkt_pkt != NULL);
			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
			if (kpp != NULL) {
				/* hand the attached packet back to the caller */
				ASSERT(*kpp == NULL);
				*kpp = kpkt->pkt_pkt;
			} else {
				/* can only recurse once */
				ASSERT((kpkt->pkt_pkt->pkt_pflags &
				    PKT_F_PKT_DATA) == 0);
				pp_free_packet_single(kpkt->pkt_pkt);
			}
			KPKT_CLEAR_PKT_DATA(kpkt);
		}
		/* all attachments must now be cleared */
		ASSERT(kpkt->pkt_nextpkt == NULL);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
		break;
	}
	default:
		break;
	}

	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
		/* detach buflets into *blist for batch freeing by caller */
		pp_metadata_destruct_common(kqum, pp, FALSE, blist);
	}
	return mdp;
}
1953 
/*
 * Free a chain of packets (linked via pkt_nextpkt) back to their pool
 * in a single batch.  Detached mbufs and attached packets are gathered
 * onto their own chains and freed in batches afterwards; on-demand
 * buflets are collected into "blist".  If "npkt" is non-NULL it is set
 * to the number of packets freed.  All packets must belong to the same
 * pool as the first one (asserted per-packet below).
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;           /* metadata chain head */
	struct skmem_obj *blist = NULL;         /* detached buflet chain */
	struct skmem_obj **list = &top;         /* metadata tail pointer */
	struct mbuf *mtop = NULL;               /* detached mbuf chain */
	struct mbuf **mp = &mtop;               /* mbuf tail pointer */
	struct __kern_packet *kptop = NULL;     /* detached packet chain */
	struct __kern_packet **kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist);

		/* append the finalized metadata to the free list */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance tail pointers if fini detached an mbuf/packet */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist != NULL) {
		skmem_cache_batch_free(pp->pp_kbft_cache, blist);
		blist = NULL;
	}
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		/* use the list variant when more than one mbuf was chained */
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/*
		 * Recurse to free the detached packets; pp_metadata_fini
		 * asserts these carry no further PKT_F_PKT_DATA attachment,
		 * so the recursion terminates after one level.
		 */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2017 
2018 void
pp_free_pktq(struct pktq * pktq)2019 pp_free_pktq(struct pktq *pktq)
2020 {
2021 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2022 		return;
2023 	}
2024 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2025 	pp_free_packet_chain(pkt, NULL);
2026 	KPKTQ_DISPOSE(pktq);
2027 }
2028 
/*
 * Batch-free "num" packet handles from "array" back to pool "pp",
 * zeroing each array slot as it is consumed.  Mirrors
 * pp_free_packet_chain(): detached mbufs/packets are chained up and
 * freed in batches afterwards; on-demand buflets are collected into
 * "blist".  Every handle must belong to "pp" (asserted per-entry).
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *top = NULL;           /* metadata chain head */
	struct skmem_obj *blist = NULL;         /* detached buflet chain */
	struct skmem_obj **list = &top;         /* metadata tail pointer */
	struct mbuf *mtop = NULL;               /* detached mbuf chain */
	struct mbuf **mp = &mtop;               /* mbuf tail pointer */
	struct __kern_packet *kptop = NULL;     /* detached packet chain */
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist);

		/* append the finalized metadata and clear the slot */
		*list = (struct skmem_obj *)mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance tail pointers if fini detached an mbuf/packet */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	if (blist != NULL) {
		skmem_cache_batch_free(pp->pp_kbft_cache, blist);
		blist = NULL;
	}

	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		/* use the list variant when more than one mbuf was chained */
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* detached packets freed via the chain path (one level) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2087 
2088 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2089 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2090 {
2091 	pp_free_packet_array(pp, &kqum, 1);
2092 }
2093 
2094 void
pp_free_packet_batch(const kern_pbufpool_t pp,uint64_t * array,uint32_t size)2095 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *array, uint32_t size)
2096 {
2097 	pp_free_packet_array(pp, array, size);
2098 }
2099 
2100 void
pp_free_packet_single(struct __kern_packet * pkt)2101 pp_free_packet_single(struct __kern_packet *pkt)
2102 {
2103 	ASSERT(pkt->pkt_nextpkt == NULL);
2104 	pp_free_packet(__DECONST(struct kern_pbufpool *,
2105 	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2106 }
2107 
/*
 * Allocate one raw buffer from the pool's buffer cache and fill in
 * "oi" with its object info.  Returns the buffer address, or 0 on
 * failure.  On DEVELOPMENT/DEBUG kernels, non-blocking allocations may
 * be failed artificially based on the configured MTBF to exercise
 * callers' error paths.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag)
{
	mach_vm_address_t baddr;

	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(pp->pp_buf_cache,
	    skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			/* simulate failure: release the buffer we just got */
			skmem_cache_free(pp->pp_buf_cache, (void *)baddr);
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* fetch object info; address reported must match what we got */
	skmem_cache_get_obj_info(pp->pp_buf_cache, (void *)baddr, oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2144 
2145 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2146 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2147     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2148 {
2149 	struct skmem_obj_info oib;
2150 
2151 	VERIFY(pp != NULL && baddr != NULL);
2152 	VERIFY((seg != NULL) == (idx != NULL));
2153 
2154 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2155 		return ENOTSUP;
2156 	}
2157 
2158 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag);
2159 	if (__improbable(*baddr == 0)) {
2160 		return ENOMEM;
2161 	}
2162 
2163 	if (seg != NULL) {
2164 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2165 		*seg = SKMEM_OBJ_SEG(&oib);
2166 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2167 	}
2168 	return 0;
2169 }
2170 
2171 void
pp_free_buffer(const kern_pbufpool_t pp,mach_vm_address_t addr)2172 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2173 {
2174 	ASSERT(pp != NULL && addr != 0);
2175 	skmem_cache_free(pp->pp_buf_cache, (void *)addr);
2176 }
2177 
/*
 * Batch-allocate up to "num" external buflets (with buffers attached)
 * into "array" as 64-bit handles.  Returns the number of buflets
 * actually stored, which may be less than "num" on cache exhaustion.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp, uint64_t *array,
    uint32_t num, uint32_t skmflag)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *list;

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(pp->pp_kbft_cache != NULL);

	allocd = skmem_cache_batch_alloc(pp->pp_kbft_cache, &list, num,
	    skmflag);

	while (list != NULL) {
		struct skmem_obj *listn;

		/* detach from the alloc list, init as an external buflet */
		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;
		KBUF_EXT_INIT(kbft, pp);
		*array = (uint64_t)kbft;
		++array;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	/* stored count must match what the cache reported */
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2210 
2211 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag)2212 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag)
2213 {
2214 	uint64_t bft;
2215 
2216 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag))) {
2217 		return ENOMEM;
2218 	}
2219 	*kbft = (kern_buflet_t)bft;
2220 	return 0;
2221 }
2222 
2223 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * array,uint32_t * size,uint32_t skmflag)2224 pp_alloc_buflet_batch(struct kern_pbufpool *pp, uint64_t *array,
2225     uint32_t *size, uint32_t skmflag)
2226 {
2227 	uint32_t i, n;
2228 	int err;
2229 
2230 	ASSERT(array != NULL && size > 0);
2231 
2232 	n = *size;
2233 	*size = 0;
2234 
2235 	i = pp_alloc_buflet_common(pp, array, n, skmflag);
2236 	*size = i;
2237 
2238 	if (__probable(i == n)) {
2239 		err = 0;
2240 	} else if (i != 0) {
2241 		err = EAGAIN;
2242 	} else {
2243 		err = ENOMEM;
2244 	}
2245 
2246 	return err;
2247 }
2248 
/*
 * Free a single buflet.  External buflets (BUFLET_FLAG_EXTERNAL) keep
 * their buffer attached for the object's lifetime and are returned to
 * the buflet cache whole; otherwise the buflet drops its reference on
 * the buffer and frees it back to the buffer cache once the use count
 * reaches zero.
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	/* must not be chained to a "next" buflet */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be linked on a user-packet-pool list */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		/*
		 * external buflet has buffer attached at construction,
		 * so we don't free the buffer here.
		 */
		skmem_cache_free(pp->pp_kbft_cache, (void *)kbft);
	} else if (__probable(kbft->buf_addr != 0)) {
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* drop this buflet's reference; usecnt is the remainder */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		if (__probable(usecnt == 0)) {
			/* last reference: return buffer to its cache */
			skmem_cache_free(pp->pp_buf_cache, objaddr);
		}
	}
}
2282 
2283 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2284 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2285 {
2286 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2287 	ASSERT(pp != NULL && kbft != NULL);
2288 	pp_free_buflet_common(pp, kbft);
2289 }
2290 
/*
 * Reap the global packet-option, flow, and completion metadata caches,
 * releasing cached objects immediately; "purge" is passed through to
 * skmem_cache_reap_now() (presumably forcing a full purge rather than
 * a normal reap — confirm against skmem_cache_reap_now()).
 */
void
pp_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(pp_opt_cache, purge);
	skmem_cache_reap_now(pp_flow_cache, purge);
	skmem_cache_reap_now(pp_compl_cache, purge);
}
2298