xref: /xnu-11215.81.4/bsd/skywalk/packet/pbufpool.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33 
34 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
35 static void pp_free(struct kern_pbufpool *);
36 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
37     uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
38     const void *, uint32_t);
39 static void pp_free_packet_array(struct kern_pbufpool *,
40     uint64_t *__counted_by(num)array, uint32_t num);
41 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
42     struct skmem_obj_info *, void *, uint32_t);
43 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
44     struct skmem_obj_info *, void *, uint32_t);
45 static void pp_metadata_dtor(void *, void *);
46 static int pp_metadata_construct(struct __kern_quantum *,
47     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
48     uint16_t, bool, struct skmem_obj **);
49 static void pp_metadata_destruct(struct __kern_quantum *,
50     struct kern_pbufpool *, bool);
51 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
52     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
53 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
54     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
55     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
56 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
57 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
58 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_destroy_upp_locked(struct kern_pbufpool *);
60 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
61 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
62 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
63 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
64     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
65 static inline uint32_t
66 pp_alloc_buflet_common(struct kern_pbufpool *pp,
67     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
68     bool large);
69 
70 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
71 
72 #define KERN_BUF_MIN_STRIDING_SIZE      32 * 1024
73 static uint32_t kern_buf_min_striding_size = KERN_BUF_MIN_STRIDING_SIZE;
74 
75 /*
76  * Since the inputs are small (indices to the metadata region), we can use
77  * Knuth's multiplicative hash method which is fast and good enough.  Here
78  * we multiply the input by the golden ratio of 2^32.  See "The Art of
79  * Computer Programming", section 6.4.
80  */
81 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
82 	(((_i) * 2654435761U) & (_m))
83 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
84 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
85 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
86 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
87 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
88 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
89 
90 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
91 
92 #define SKMEM_TAG_PBUFPOOL_HASH  "com.apple.skywalk.pbufpool.hash"
93 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
94 
95 #define SKMEM_TAG_PBUFPOOL_BFT_HASH  "com.apple.skywalk.pbufpool.bft.hash"
96 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
97 
/*
 * Hash table of buckets used to track objects handed out to user space;
 * indexed via KERN_PBUFPOOL_U_HASH() (Knuth multiplicative hash on the
 * metadata object index).
 */
struct kern_pbufpool_u_htbl {
	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
};
101 
102 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
103 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
104 
105 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
106 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
107 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
108 
109 static int __pp_inited = 0;
110 
111 int
pp_init(void)112 pp_init(void)
113 {
114 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
115 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
116 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
117 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
118 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
119 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
120 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
121 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
122 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
123 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
124 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
125 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
126 
127 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
128 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
129 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
130 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
131 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
132 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
133 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
134 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
135 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
136 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
137 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
138 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
139 
140 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
141 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
142 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
143 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
144 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
145 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
146 
147 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
148 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
149 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
150 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
151 
152 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
153 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
154 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
155 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
156 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
157 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
158 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
159 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
160 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
161 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
162 
163 	/*
164 	 * Assert that the value of common packet flags between mbuf and
165 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
166 	 */
167 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
168 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
169 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
170 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
171 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
172 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
173 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
174 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
175 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
176 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
177 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
178 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
179 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
180 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
181 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
182 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
183 	/*
184 	 * Assert packet flags shared with userland.
185 	 */
186 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
187 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
188 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
189 
190 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
191 	    offsetof(struct __kern_packet, pkt_length));
192 
193 	/*
194 	 * Due to the use of tagged pointer, we need the size of
195 	 * the metadata preamble structure to be multiples of 16.
196 	 * See SK_PTR_TAG() definition for details.
197 	 */
198 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
199 	    (sizeof(struct __metadata_preamble) % 16) == 0);
200 
201 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
202 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
203 
204 	/*
205 	 * Batch alloc/free requires linking the objects together;
206 	 * make sure that the fields are at the same offset since
207 	 * we cast the object to struct skmem_obj.
208 	 */
209 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
210 	    offsetof(struct skmem_obj, mo_next));
211 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
212 	    offsetof(struct skmem_obj, mo_next));
213 
214 	SK_LOCK_ASSERT_HELD();
215 	ASSERT(!__pp_inited);
216 
217 	pp_opt_cache = skmem_cache_create("pkt.opt",
218 	    sizeof(struct __packet_opt), sizeof(uint64_t),
219 	    NULL, NULL, NULL, NULL, NULL, 0);
220 	pp_flow_cache = skmem_cache_create("pkt.flow",
221 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
222 	    NULL, NULL, NULL, NULL, NULL, 0);
223 	pp_compl_cache = skmem_cache_create("pkt.compl",
224 	    sizeof(struct __packet_compl), sizeof(uint64_t),
225 	    NULL, NULL, NULL, NULL, NULL, 0);
226 
227 	PE_parse_boot_argn("sk_pp_min_striding_size", &kern_buf_min_striding_size,
228 	    sizeof(kern_buf_min_striding_size));
229 
230 	return 0;
231 }
232 
233 void
pp_fini(void)234 pp_fini(void)
235 {
236 	SK_LOCK_ASSERT_HELD();
237 
238 	if (__pp_inited) {
239 		if (pp_compl_cache != NULL) {
240 			skmem_cache_destroy(pp_compl_cache);
241 			pp_compl_cache = NULL;
242 		}
243 		if (pp_flow_cache != NULL) {
244 			skmem_cache_destroy(pp_flow_cache);
245 			pp_flow_cache = NULL;
246 		}
247 		if (pp_opt_cache != NULL) {
248 			skmem_cache_destroy(pp_opt_cache);
249 			pp_opt_cache = NULL;
250 		}
251 
252 		__pp_inited = 0;
253 	}
254 }
255 
256 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)257 pp_alloc(zalloc_flags_t how)
258 {
259 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
260 
261 	if (pp) {
262 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
263 	}
264 	return pp;
265 }
266 
/*
 * Final teardown of a pool whose last reference has been dropped.
 * Called with pp_lock held; the pool's resources are destroyed while
 * still locked, then the lock is dropped and destroyed before the
 * pool structure itself is returned to its zone.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
279 
280 void
pp_retain_locked(struct kern_pbufpool * pp)281 pp_retain_locked(struct kern_pbufpool *pp)
282 {
283 	PP_LOCK_ASSERT_HELD(pp);
284 
285 	pp->pp_refcnt++;
286 	ASSERT(pp->pp_refcnt != 0);
287 }
288 
void
pp_retain(struct kern_pbufpool *pp)
{
	/* locked-variant wrapper: take the pool lock around the retain */
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
296 
297 boolean_t
pp_release_locked(struct kern_pbufpool * pp)298 pp_release_locked(struct kern_pbufpool *pp)
299 {
300 	uint32_t oldref = pp->pp_refcnt;
301 
302 	PP_LOCK_ASSERT_HELD(pp);
303 
304 	ASSERT(pp->pp_refcnt != 0);
305 	if (--pp->pp_refcnt == 0) {
306 		pp_free(pp);
307 	}
308 
309 	return oldref == 1;
310 }
311 
312 boolean_t
pp_release(struct kern_pbufpool * pp)313 pp_release(struct kern_pbufpool *pp)
314 {
315 	boolean_t lastref;
316 
317 	PP_LOCK(pp);
318 	if (!(lastref = pp_release_locked(pp))) {
319 		PP_UNLOCK(pp);
320 	}
321 
322 	return lastref;
323 }
324 
325 void
pp_close(struct kern_pbufpool * pp)326 pp_close(struct kern_pbufpool *pp)
327 {
328 	PP_LOCK(pp);
329 	ASSERT(pp->pp_refcnt > 0);
330 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
331 	pp->pp_flags |= PPF_CLOSED;
332 	if (!pp_release_locked(pp)) {
333 		PP_UNLOCK(pp);
334 	}
335 }
336 
337 /*
338  * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
339  * size for the srp_array. This is same as marking it __counted_by(SKMEM_REGIONS)
340  */
341 void
pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)342 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
343     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
344     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
345     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
346 {
347 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
348 	    *lbuf_srp;
349 	uint32_t md_size = 0;
350 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
351 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
352 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
353 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
354 	bool md_magazine_enable = ((flags &
355 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
356 
357 	ASSERT(max_frags != 0);
358 
359 	switch (md_type) {
360 	case NEXUS_META_TYPE_QUANTUM:
361 		md_size = NX_METADATA_QUANTUM_SZ;
362 		break;
363 	case NEXUS_META_TYPE_PACKET:
364 		md_size = NX_METADATA_PACKET_SZ(max_frags);
365 		break;
366 	default:
367 		VERIFY(0);
368 		/* NOTREACHED */
369 		__builtin_unreachable();
370 	}
371 
372 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
373 	case PP_REGION_CONFIG_BUF_IODIR_IN:
374 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
375 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
376 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
377 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
378 		break;
379 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
380 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
381 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
382 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
383 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
384 		break;
385 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
386 	default:
387 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
388 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
389 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
390 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
391 		break;
392 	}
393 
394 	/* add preamble size to metadata obj size */
395 	md_size += METADATA_PREAMBLE_SZ;
396 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
397 
398 	/* configure kernel metadata region */
399 	kmd_srp->srp_md_type = md_type;
400 	kmd_srp->srp_md_subtype = md_subtype;
401 	kmd_srp->srp_r_obj_cnt = md_cnt;
402 	kmd_srp->srp_r_obj_size = md_size;
403 	kmd_srp->srp_max_frags = max_frags;
404 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
405 	if (md_persistent) {
406 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
407 	}
408 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
409 	if (md_magazine_enable) {
410 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
411 	}
412 	skmem_region_params_config(kmd_srp);
413 
414 	/* configure user metadata region */
415 	srp = &srp_array[SKMEM_REGION_UMD];
416 	if (!kernel_only) {
417 		srp->srp_md_type = kmd_srp->srp_md_type;
418 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
419 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
420 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
421 		srp->srp_max_frags = kmd_srp->srp_max_frags;
422 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
423 		if (md_persistent) {
424 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
425 		}
426 		/*
427 		 * UMD is a mirrored region and object allocation operations
428 		 * are performed on the KMD objects.
429 		 */
430 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
431 		skmem_region_params_config(srp);
432 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
433 	} else {
434 		ASSERT(srp->srp_r_obj_cnt == 0);
435 		ASSERT(srp->srp_r_obj_size == 0);
436 	}
437 
438 	/* configure buffer region */
439 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
440 	buf_srp->srp_r_obj_size = buf_size;
441 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
442 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
443 	if (buf_persistent) {
444 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
445 	}
446 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
447 	if (buf_srp->srp_r_obj_size >= kern_buf_min_striding_size) {
448 		/*
449 		 * A buffer size larger than 32K indicates striding is in use, which
450 		 * means a buffer could be detached from a buflet. In this case, magzine
451 		 * layer should be enabled.
452 		 */
453 		buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
454 	}
455 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
456 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
457 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
458 	}
459 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
460 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
461 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
462 	}
463 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
464 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
465 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
466 	}
467 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
468 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
469 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
470 	}
471 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
472 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
473 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
474 	}
475 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
476 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
477 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
478 	}
479 	if (buf_seg_size != 0) {
480 		buf_srp->srp_r_seg_size = buf_seg_size;
481 	}
482 	skmem_region_params_config(buf_srp);
483 
484 	/* configure large buffer region */
485 	if (large_buf_size != 0) {
486 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
487 		lbuf_srp->srp_r_obj_size = large_buf_size;
488 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
489 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
490 		skmem_region_params_config(lbuf_srp);
491 	}
492 
493 	/* configure kernel buflet region */
494 	if (config_buflet) {
495 		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
496 		/*
497 		 * Ideally we want the number of buflets to be
498 		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
499 		 * so that we have enough buflets when multi-buflet and
500 		 * shared buffer object is used.
501 		 * Currently multi-buflet is being used only by user pool
502 		 * which doesn't support shared buffer object, hence to reduce
503 		 * the number of objects we are restricting the number of
504 		 * buflets to the number of buffers.
505 		 */
506 		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
507 		    lbuf_srp->srp_c_obj_cnt;
508 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
509 		    sizeof(struct __user_buflet));
510 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
511 		skmem_region_params_config(kbft_srp);
512 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
513 		    lbuf_srp->srp_c_obj_cnt);
514 	} else {
515 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
516 		ASSERT(kbft_srp->srp_r_obj_size == 0);
517 	}
518 
519 	/* configure user buflet region */
520 	srp = &srp_array[SKMEM_REGION_UBFT];
521 	if (config_buflet && !kernel_only) {
522 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
523 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
524 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
525 		skmem_region_params_config(srp);
526 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
527 	} else {
528 		ASSERT(srp->srp_r_obj_cnt == 0);
529 		ASSERT(srp->srp_r_obj_size == 0);
530 	}
531 
532 	/* make sure each metadata can be paired with a buffer */
533 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
534 }
535 
/*
 * Construct one kernel metadata object (quantum or packet) at region index
 * midx, mirroring into the user metadata uqum when the pool is not
 * kernel-only, and attach bufcnt buffers.  For pools without buffer-on-demand
 * the quantum's native buflet receives a freshly allocated buffer; otherwise
 * pre-constructed buflets (with buffers attached) are consumed from *blist,
 * which is advanced past the ones used.  "raw" indicates a first-time
 * construction (allocate opt/flow/compl attachments) as opposed to a
 * re-construction that reuses the packet's existing attachments.
 * Returns 0 on success; on failure everything attached so far is destructed
 * and ENOMEM is returned.
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets are only valid with buffer-on-demand pools */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *__single opt;
		struct __flow *__single flow;
		struct __packet_compl *__single compl;
		uint64_t pflags;

		if (raw) {
			/* first construction: allocate the attachments */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reconstruction: reuse the packet's attachments */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
			/* Checking to ensure the object address is tagged */
			ASSERT((vm_offset_t)kbuf !=
			    vm_memtag_canonicalize_address((vm_offset_t)kbuf));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

			/* detach the head buflet from the freelist */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* unwind whatever was attached so far */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
664 
/*
 * Common skmem constructor body for metadata objects.  oi0 describes the
 * kernel metadata object; oim0, when non-NULL, describes the mirrored user
 * object.  When no_buflet is true the metadata is constructed with zero
 * buffers attached; otherwise pp_max_frags buflets/buffers are attached.
 * Returns 0 on success or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *__single blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* the kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets pp_metadata_construct() did not consume */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
748 
749 static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)750 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
751     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
752 {
753 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
754 }
755 
756 static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)757 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
758     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
759 {
760 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
761 }
762 
763 __attribute__((always_inline))
764 static void
pp_metadata_destruct_common(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocache_large)765 pp_metadata_destruct_common(struct __kern_quantum *kqum,
766     struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
767     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
768     struct skmem_obj **blist_nocache_large)
769 {
770 	struct __kern_buflet *kbuf, *nbuf;
771 	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
772 	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
773 	struct skmem_obj **pp_blist_def = &p_blist_def;
774 	struct skmem_obj **pp_blist_large = &p_blist_large;
775 	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
776 	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
777 	uint16_t bufcnt, i = 0;
778 	bool first_buflet_empty;
779 
780 	ASSERT(blist_def != NULL);
781 	ASSERT(blist_large != NULL);
782 
783 	switch (pp->pp_md_type) {
784 	case NEXUS_META_TYPE_PACKET: {
785 		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
786 
787 		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
788 		ASSERT(kpkt->pkt_qum.qum_pp == pp);
789 		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
790 		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
791 		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
792 		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
793 		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
794 		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
795 		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
796 		bufcnt = kpkt->pkt_bufs_cnt;
797 		kbuf = &kqum->qum_buf[0];
798 		/*
799 		 * special handling for empty first buflet.
800 		 */
801 		first_buflet_empty = (kbuf->buf_addr == 0);
802 		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
803 		break;
804 	}
805 	default:
806 		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
807 		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
808 		ASSERT(kqum->qum_pp == pp);
809 		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
810 		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
811 		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
812 		ASSERT(kqum->qum_ksd == NULL);
813 		kbuf = &kqum->qum_buf[0];
814 		/*
815 		 * XXX: Special handling for quantum as we don't currently
816 		 * define bufs_{cnt,max} there.  Given that we support at
817 		 * most only 1 buflet for now, check if buf_addr is non-NULL.
818 		 * See related code in pp_metadata_construct().
819 		 */
820 		first_buflet_empty = (kbuf->buf_addr == 0);
821 		bufcnt = first_buflet_empty ? 0 : 1;
822 		break;
823 	}
824 
825 	/*
826 	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
827 	 * unsafe, so we forge it here.
828 	 */
829 	nbuf = __unsafe_forge_single(struct __kern_buflet *,
830 	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
831 	BUF_NBFT_ADDR(kbuf, 0);
832 	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
833 	if (!first_buflet_empty) {
834 		pp_free_buflet_common(pp, kbuf);
835 		++i;
836 	}
837 
838 	while (nbuf != NULL) {
839 		ASSERT(nbuf->buf_ctl != NULL);
840 		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
841 			/*
842 			 * bc_usecnt larger than 1 means the buffer has been cloned and is
843 			 * still being used by other bflts. In this case, when we free
844 			 * this bflt we need to explicitly ask for it to not be cached again
845 			 * into magzine layer to prevent immediate reuse of the buffer and
846 			 * data corruption.
847 			 */
848 			if (nbuf->buf_ctl->bc_usecnt > 1) {
849 				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
850 				pp_blist_nocache_large =
851 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
852 			} else {
853 				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
854 				pp_blist_large =
855 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
856 			}
857 		} else {
858 			if (nbuf->buf_ctl->bc_usecnt > 1) {
859 				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
860 				pp_blist_nocache_def =
861 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
862 			} else {
863 				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
864 				pp_blist_def =
865 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
866 			}
867 		}
868 		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
869 		nbuf = __unsafe_forge_single(struct __kern_buflet *,
870 		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
871 		++i;
872 	}
873 
874 	ASSERT(i == bufcnt);
875 
876 	if (p_blist_def != NULL) {
877 		*pp_blist_def = *blist_def;
878 		*blist_def = p_blist_def;
879 	}
880 	if (p_blist_large != NULL) {
881 		*pp_blist_large = *blist_large;
882 		*blist_large = p_blist_large;
883 	}
884 	if (p_blist_nocache_def != NULL) {
885 		*pp_blist_nocache_def = *blist_nocache_def;
886 		*blist_nocache_def = p_blist_nocache_def;
887 	}
888 	if (p_blist_nocache_large != NULL) {
889 		*pp_blist_nocache_large = *blist_nocache_large;
890 		*blist_nocache_large = p_blist_nocache_large;
891 	}
892 
893 	/* if we're about to return this object to the slab, clean it up */
894 	if (raw) {
895 		switch (pp->pp_md_type) {
896 		case NEXUS_META_TYPE_PACKET: {
897 			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
898 
899 			ASSERT(kpkt->pkt_com_opt != NULL ||
900 			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
901 			if (kpkt->pkt_com_opt != NULL) {
902 				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
903 				skmem_cache_free(pp_opt_cache,
904 				    kpkt->pkt_com_opt);
905 				kpkt->pkt_com_opt = NULL;
906 			}
907 			ASSERT(kpkt->pkt_flow != NULL ||
908 			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
909 			if (kpkt->pkt_flow != NULL) {
910 				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
911 				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
912 				kpkt->pkt_flow = NULL;
913 			}
914 			ASSERT(kpkt->pkt_tx_compl != NULL ||
915 			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
916 			if (kpkt->pkt_tx_compl != NULL) {
917 				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
918 				skmem_cache_free(pp_compl_cache,
919 				    kpkt->pkt_tx_compl);
920 				kpkt->pkt_tx_compl = NULL;
921 			}
922 			kpkt->pkt_pflags = 0;
923 			break;
924 		}
925 		default:
926 			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
927 			/* nothing to do for quantum (yet) */
928 			break;
929 		}
930 	}
931 }
932 
933 __attribute__((always_inline))
934 static void
pp_free_kbft_list(struct kern_pbufpool * pp,struct skmem_obj * blist_def,struct skmem_obj * blist_nocache_def,struct skmem_obj * blist_large,struct skmem_obj * blist_nocache_large)935 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
936     struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
937 {
938 	if (blist_def != NULL) {
939 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
940 	}
941 	if (blist_large != NULL) {
942 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
943 	}
944 	if (blist_nocache_def != NULL) {
945 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
946 	}
947 	if (blist_nocache_large != NULL) {
948 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
949 	}
950 }
951 
952 __attribute__((always_inline))
953 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)954 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
955     bool raw)
956 {
957 	struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
958 	struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
959 
960 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
961 	    &blist_large, &blist_nocache_large);
962 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
963 }
964 
965 static void
pp_metadata_dtor(void * addr,void * arg)966 pp_metadata_dtor(void *addr, void *arg)
967 {
968 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
969 	    METADATA_PREAMBLE_SZ), arg, TRUE);
970 }
971 
972 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)973 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
974 {
975 	struct kern_pbufpool *__single pp = arg;
976 
977 	if (pp->pp_pbuf_seg_ctor != NULL) {
978 		pp->pp_pbuf_seg_ctor(pp, sg, md);
979 	}
980 }
981 
982 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)983 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
984 {
985 	struct kern_pbufpool *__single pp = arg;
986 
987 	if (pp->pp_pbuf_seg_dtor != NULL) {
988 		pp->pp_pbuf_seg_dtor(pp, sg, md);
989 	}
990 }
991 
992 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large)993 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
994     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
995 {
996 #pragma unused (skmflag)
997 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
998 	struct __kern_buflet *kbft;
999 	struct __user_buflet *ubft;
1000 	struct skmem_obj_info oib;
1001 	mach_vm_address_t baddr;
1002 	obj_idx_t oi_idx_reg;
1003 
1004 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
1005 	if (__improbable(baddr == 0)) {
1006 		return ENOMEM;
1007 	}
1008 	/*
1009 	 * Note that oi0 and oim0 may be stored inside the object itself;
1010 	 * so copy what is required to local variables before constructing.
1011 	 */
1012 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
1013 	kbft = SKMEM_OBJ_ADDR(oi0);
1014 
1015 	if (__probable(!PP_KERNEL_ONLY(pp))) {
1016 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
1017 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
1018 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
1019 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
1020 		ubft = SKMEM_OBJ_ADDR(oim0);
1021 	} else {
1022 		ASSERT(oim0 == NULL);
1023 		ubft = NULL;
1024 	}
1025 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
1026 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
1027 	return 0;
1028 }
1029 
1030 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1031 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1032     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1033 {
1034 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
1035 }
1036 
1037 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1038 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1039     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1040 {
1041 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
1042 }
1043 
/*
 * Destructor for external buflet metadata: detaches the backing buffer
 * from the buflet and frees the buffer once its last reference drops.
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *__single kbft = addr;
	void *objaddr = kbft->buf_objaddr;
	struct kern_pbufpool *__single pp = arg;
	uint32_t usecnt = 0;
	bool large = BUFLET_HAS_LARGE_BUF(kbft);

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);
	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	/* detach the buffer; usecnt receives the remaining reference count */
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	/* free the buffer only when no other buflet still references it */
	if (__probable(usecnt == 0)) {
		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
		    PP_BUF_CACHE_DEF(pp), objaddr);
	}
}
1077 
1078 /*
1079  * -fbounds-safety: all callers of pp_create use srp_array with a known size:
1080  * SKMEM_REGIONS. This is same as marking it __counted_by(SKMEM_REGIONS)
1081  */
/*
 * Create and initialize a packet buffer pool from the supplied region
 * parameter array.  On success returns a pool holding one reference;
 * on failure, releases the pool context (if any) and tears everything
 * down via pp_close(), returning NULL.
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint32_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	uint32_t buf_def_cflags;
	char cname[64];
	const char *__null_terminated cache_name = NULL;
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Pick the region parameter set to build from: the shared KMD
	 * set when populated, otherwise the RX-only set, otherwise the
	 * TX-only set (which must then be populated).
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet region only matters for on-demand buffer attachment */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	/*
	 * Unless kernel-only, the user metadata (and buflet) regions
	 * must mirror their kernel counterparts exactly.
	 */
	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	/* snapshot metadata/buffer geometry from the chosen region set */
	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;
	def_buf_size = def_buf_obj_size;
	large_buf_size = lbuf_srp->srp_c_obj_size;

#if (DEBUG || DEVELOPMENT)
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		/* quantum supports exactly one buflet */
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* take a reference on the caller-supplied context, if any */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate creation flags into pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}

	pp_retain(pp);

	/* metadata cache flags: honor NOMAGAZINES, always batch-capable */
	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	/* create the user and kernel metadata regions */
	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	/* create the buflet regions (user first, then kernel) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* pair each kernel region with its user-visible mirror */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			/* Aggressive memory reclaim flag set to kbft_large for now */
			md_cflags |= SKMEM_CR_RECLAIM;
			cache_name = tsnprintf(cname, sizeof(cname),
			    "kbft_large.%s", name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	/* create the buffer regions (default and, if present, large) */
	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
	buf_def_cflags = buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES ? SKMEM_CR_NOMAGAZINES : 0;
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
	    def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    buf_def_cflags)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* release the context first, then unwind everything via pp_close */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1414 
/*
 * Tear down all caches and regions owned by the pool.  Caller holds the
 * pool lock.  Destruction order matters: metadata caches first (their
 * destructors free attached buffers), buffer caches/regions last.
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	/* drop the allocated-packet and allocated-buflet tracking tables */
	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally release the caller-supplied context, if any */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1490 
1491 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1492 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1493 {
1494 	int i, err = 0;
1495 
1496 	if (pp->pp_u_hash_table != NULL) {
1497 		goto done;
1498 	}
1499 
1500 	/* allocated-address hash table */
1501 	/*
1502 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1503 	 * if we see any performance hit, we can check if this caused it.
1504 	 */
1505 	if (can_block) {
1506 		pp->pp_u_hash_table = sk_alloc_type_array(
1507 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1508 			Z_WAITOK, skmem_tag_pbufpool_hash);
1509 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1510 	} else {
1511 		pp->pp_u_hash_table = sk_alloc_type_array(
1512 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1513 			Z_NOWAIT, skmem_tag_pbufpool_hash);
1514 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1515 	}
1516 	if (pp->pp_u_hash_table == NULL) {
1517 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1518 		err = ENOMEM;
1519 		goto done;
1520 	}
1521 
1522 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1523 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1524 	}
1525 done:
1526 	return err;
1527 }
1528 
/*
 * Release the packet UPP hash table, purging any packets still tracked
 * in it.  Caller holds the pool lock.  Safe to call when the table was
 * never allocated.
 */
static void
pp_destroy_upp_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* after the purge, every bucket must be empty */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
		    pp->pp_u_hash_table_size,
		    pp->pp_u_hash_table);
	}
	ASSERT(pp->pp_u_bufinuse == 0);
}
1549 
1550 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1551 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1552 {
1553 	int err = 0;
1554 
1555 	PP_LOCK(pp);
1556 	err = pp_init_upp_locked(pp, can_block);
1557 	if (err) {
1558 		SK_ERR("packet UPP init failed (%d)", err);
1559 		goto done;
1560 	}
1561 	err = pp_init_upp_bft_locked(pp, can_block);
1562 	if (err) {
1563 		SK_ERR("buflet UPP init failed (%d)", err);
1564 		pp_destroy_upp_locked(pp);
1565 		goto done;
1566 	}
1567 	pp_retain_locked(pp);
1568 done:
1569 	PP_UNLOCK(pp);
1570 	return err;
1571 }
1572 
1573 __attribute__((always_inline))
1574 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1575 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1576     struct __kern_buflet *kbft, pid_t pid)
1577 {
1578 	struct kern_pbufpool_u_bft_bkt *bkt;
1579 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1580 
1581 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1582 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1583 	kbe->kbe_buf_pid = pid;
1584 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1585 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1586 	pp->pp_u_bftinuse++;
1587 }
1588 
1589 __attribute__((always_inline))
1590 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1591 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1592     struct __kern_buflet *kbft, pid_t pid)
1593 {
1594 	while (kbft != NULL) {
1595 		pp_insert_upp_bft_locked(pp, kbft, pid);
1596 		kbft = __unsafe_forge_single(struct __kern_buflet *,
1597 		    __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1598 	}
1599 }
1600 
1601 /* Also inserts the attached chain of buflets */
1602 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1603 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1604     pid_t pid)
1605 {
1606 	struct kern_pbufpool_u_bkt *bkt;
1607 	struct __kern_buflet *kbft;
1608 
1609 	ASSERT(kqum->qum_pid == (pid_t)-1);
1610 	kqum->qum_pid = pid;
1611 
1612 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1613 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1614 	pp->pp_u_bufinuse++;
1615 
1616 	kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
1617 	if (kbft != NULL) {
1618 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1619 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1620 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1621 	}
1622 }
1623 
1624 void
pp_insert_upp_locked(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1625 pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1626     pid_t pid)
1627 {
1628 	pp_insert_upp_common(pp, kqum, pid);
1629 }
1630 
1631 void
pp_insert_upp(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1632 pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
1633 {
1634 	PP_LOCK(pp);
1635 	pp_insert_upp_common(pp, kqum, pid);
1636 	PP_UNLOCK(pp);
1637 }
1638 
1639 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * __counted_by (num)array,uint32_t num)1640 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1641     uint64_t *__counted_by(num)array, uint32_t num)
1642 {
1643 	uint32_t i = 0;
1644 
1645 	ASSERT(array != NULL && num > 0);
1646 	PP_LOCK(pp);
1647 	while (i < num) {
1648 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1649 
1650 		ASSERT(kqum != NULL);
1651 		pp_insert_upp_common(pp, kqum, pid);
1652 		++i;
1653 	}
1654 	PP_UNLOCK(pp);
1655 }
1656 
/*
 * Look up the buflet with region index bft_idx in the allocated-buflet
 * hash table and detach it.  Returns the buflet, or NULL when no such
 * buflet is tracked (SLIST_FOREACH_SAFE leaves the iterator NULL when
 * the bucket is exhausted without a match).
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* mark unowned and clear the stale link pointer */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1678 
1679 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1680 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1681 {
1682 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1683 
1684 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1685 	return kbft;
1686 }
1687 
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	/*
	 * Walk the buflet chain advertised by the user metadata of an
	 * externalized quantum, reclaim each chained buflet from the
	 * allocated-address buflet hash table, and re-stitch the kernel
	 * side of the chain (next-index and next-address links).
	 *
	 * Returns 0 on success, or ERANGE when the user metadata is
	 * inconsistent (buflet count out of range, an unallocated next
	 * index, or a premature/overlong chain).  On error the kernel
	 * chain is terminated at the point of inconsistency.
	 */
	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no chained buflets; nothing to reclaim */
		return 0;
	}

	/* only packets can carry buflet chains */
	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-claimed buflet count; distrust anything above pp_max_frags */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* count the embedded buflet only if it has a buffer attached */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* chain points at a buflet we never handed out */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* link the previous kernel buflet to this one */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
		/* follow the user-visible next-index to continue the walk */
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel chain at the last reclaimed buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the advertised count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1751 
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	/*
	 * Unlink the quantum registered under md_idx from the pool's
	 * allocated-address hash table and reclaim any buflet chain
	 * attached to it.  Sets *err to 0 on success, ERANGE if the
	 * index is not registered or its buflet chain is inconsistent.
	 *
	 * Caller must hold the pool lock.
	 */
	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL here if the loop ran off the end (index not found) */
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1776 
1777 struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1778 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1779 {
1780 	struct __kern_quantum *kqum;
1781 
1782 	PP_LOCK(pp);
1783 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1784 	PP_UNLOCK(pp);
1785 	return kqum;
1786 }
1787 
1788 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1789 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1790 {
1791 	struct __kern_quantum *kqum, *tqum;
1792 	struct kern_pbufpool_u_bkt *bkt;
1793 
1794 	PP_LOCK(pp);
1795 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1796 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1797 		if (METADATA_IDX(kqum) == md_idx) {
1798 			break;
1799 		}
1800 	}
1801 	PP_UNLOCK(pp);
1802 
1803 	return kqum;
1804 }
1805 
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * Scan every bucket of the allocated-address hash table and free
	 * each quantum owned by `pid' back to its pool; pid == -1 acts
	 * as a wildcard matching every owner (used at pool teardown).
	 *
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* reclaim any chained buflets before freeing */
			pp_remove_upp_bft_chain_locked(pp, kqum);
			kqum->qum_pid = (pid_t)-1;
			/* strip state the free path must not see */
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1839 
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * Buflet counterpart of pp_purge_upp_locked(): release every
	 * buflet still registered to `pid' (or to any pid when pid is
	 * -1) from the allocated-address buflet hash table.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1868 
1869 void
pp_purge_upp(struct kern_pbufpool * pp,pid_t pid)1870 pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
1871 {
1872 	PP_LOCK(pp);
1873 	pp_purge_upp_locked(pp, pid);
1874 	pp_purge_upp_bft_locked(pp, pid);
1875 	PP_UNLOCK(pp);
1876 }
1877 
1878 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1879 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1880 {
1881 	int i, err = 0;
1882 
1883 	PP_LOCK_ASSERT_HELD(pp);
1884 	if (pp->pp_u_bft_hash_table != NULL) {
1885 		return 0;
1886 	}
1887 
1888 	/* allocated-address hash table */
1889 	/*
1890 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1891 	 * if we see any performance hit, we can check if this caused it.
1892 	 */
1893 	if (can_block) {
1894 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1895 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1896 			Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1897 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1898 	} else {
1899 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1900 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1901 			Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1902 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1903 	}
1904 	if (pp->pp_u_bft_hash_table == NULL) {
1905 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1906 		err = ENOMEM;
1907 		goto fail;
1908 	}
1909 
1910 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1911 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1912 	}
1913 
1914 fail:
1915 	return err;
1916 }
1917 
static void
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	/*
	 * Tear down the allocated-address buflet hash table: purge every
	 * remaining registered buflet (wildcard pid -1), then release the
	 * table itself.  Safe to call when the table was never set up.
	 */
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* the purge above must have emptied every bucket */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
		    pp->pp_u_bft_hash_table_size,
		    pp->pp_u_bft_hash_table);
	}
	ASSERT(pp->pp_u_bftinuse == 0);
}
1938 
1939 void
pp_insert_upp_bft(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1940 pp_insert_upp_bft(struct kern_pbufpool *pp,
1941     struct __kern_buflet *kbft, pid_t pid)
1942 {
1943 	PP_LOCK(pp);
1944 	pp_insert_upp_bft_locked(pp, kbft, pid);
1945 	PP_UNLOCK(pp);
1946 }
1947 
1948 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1949 pp_isempty_upp(struct kern_pbufpool *pp)
1950 {
1951 	boolean_t isempty;
1952 
1953 	PP_LOCK(pp);
1954 	isempty = (pp->pp_u_bufinuse == 0);
1955 	PP_UNLOCK(pp);
1956 
1957 	return isempty;
1958 }
1959 
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/*
	 * (Re)initialize a metadata object freshly obtained from the
	 * metadata cache: locate the kernel quantum behind the preamble,
	 * optionally attach `bufcnt' buflets consumed from `*blist'
	 * (buffer-on-demand pools only), then reset kernel-side metadata
	 * and buflet state.
	 *
	 * Returns the initialized quantum, or NULL when buflet
	 * construction failed (caller frees the raw metadata and blist).
	 */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		/* user-visible pool: the shadow user quantum must exist */
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach buflets taken from *blist (buffer-on-demand only) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		/* companion allocations must already be in place */
		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/*
			 * Embedded buflet is unused in on-demand mode;
			 * skip ahead to the first chained buflet.
			 */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			/*
			 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
			 * which is unsafe, so we just forge it here.
			 */
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* chain length must match bufcnt exactly */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
2044 
/*
 * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
 * packet descriptor cache with no buffer attached and a buflet cache with
 * cpu layer caching enabled. While operating in this mode, we can call
 * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
 * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
 * descriptor with no attached buffer from the metadata cache.
 * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
 * from their respective caches and constructs the packet on behalf of the
 * caller.
 *
 * Returns the number of packets actually allocated and stored in `array';
 * may be less than `num' on cache exhaustion or init failure.  The optional
 * callback `cb' is invoked once per packet with its position in the batch.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
    alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	array_cp = array;
	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head of the metadata freelist */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* init failed: return unused buflets and metadata */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kqum !=
		    vm_memtag_canonicalize_address((vm_offset_t)kqum));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		/* hand back either a tagged pointer or the raw address */
		if (tagged) {
			*array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array_cp = (uint64_t)kqum;
		}

		if (cb != NULL) {
			(cb)(*array_cp, (num - need), ctx);
		}

		++array_cp;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all buflets must have been consumed or freed by now */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	return num - need;
}
2131 
2132 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2133 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2134 {
2135 	uint64_t kpkt = 0;
2136 
2137 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2138 	    NULL, NULL, skmflag);
2139 
2140 	return kpkt;
2141 }
2142 
2143 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (* size)array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2144 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2145     uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2146     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2147 {
2148 	uint32_t i, n;
2149 	int err;
2150 
2151 	ASSERT(array != NULL && size > 0);
2152 
2153 	n = *size;
2154 	/*
2155 	 * -fbounds-safety: Originally there was this line here: *size = 0; but
2156 	 * we removed this because array is now __counted_by(*size), so *size =
2157 	 * 0 leads to brk 0x5519. Also, *size is set to i anyway.
2158 	 */
2159 
2160 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2161 	    cb, ctx, skmflag);
2162 	/*
2163 	 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2164 	 * extra careful when *size is updated, like below. Here, we know i will
2165 	 * be less than or equal to the original *size value, so updating *size
2166 	 * is okay.
2167 	 */
2168 	*size = i;
2169 
2170 	if (__probable(i == n)) {
2171 		err = 0;
2172 	} else if (i != 0) {
2173 		err = EAGAIN;
2174 	} else {
2175 		err = ENOMEM;
2176 	}
2177 
2178 	return err;
2179 }
2180 
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	int err;

	/*
	 * Queue-enqueuing variant of pp_alloc_packet_common(): allocate
	 * up to `num' packets (each with `bufcnt' buflets in
	 * buffer-on-demand mode) and append them to `pktq'.
	 *
	 * Returns 0 when all `num' were allocated, EAGAIN on a partial
	 * batch, ENOMEM when none could be allocated.
	 */
	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	while (plist != NULL) {
		struct skmem_obj *plistn;

		/* detach the head of the metadata freelist */
		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* init failed: return unused buflets and metadata */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kpkt !=
		    vm_memtag_canonicalize_address((vm_offset_t)kpkt));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all buflets must have been consumed or freed by now */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2258 
2259 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2260 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2261     uint32_t skmflag)
2262 {
2263 	uint32_t bufcnt = pp->pp_max_frags;
2264 	uint64_t kpkt = 0;
2265 
2266 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2267 		bufcnt =
2268 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2269 		ASSERT(bufcnt <= UINT16_MAX);
2270 	}
2271 
2272 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2273 	    NULL, NULL, skmflag);
2274 
2275 	return kpkt;
2276 }
2277 
__attribute__((always_inline))
static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
    struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
    struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
    struct skmem_obj **blist_nocahce_large)
{
	/*
	 * Tear down a quantum prior to returning it to the metadata cache:
	 * run pending TX-completion callbacks, detach any mbuf or packet
	 * attached as data (handed back via *mp / *kpp when the caller
	 * wants to batch-free them, freed inline otherwise), and, for
	 * buffer-on-demand pools, strip the buflet chain onto the four
	 * blist accumulators.  Returns the metadata preamble for the
	 * caller to batch-free.
	 *
	 * NOTE(review): "blist_nocahce_large" is a typo for
	 * "blist_nocache_large" (positional parameter; harmless).
	 */
	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);

	ASSERT(SK_PTR_TAG(kqum) == 0);

	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);

		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
			__packet_perform_tx_completion_callbacks(
				SK_PKT2PH(kpkt), NULL);
		}
		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
			/* mbuf and packet data attachments are exclusive */
			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
			ASSERT(kpkt->pkt_mbuf != NULL);
			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
			if (mp != NULL) {
				/* hand the mbuf back for batched freeing */
				ASSERT(*mp == NULL);
				*mp = kpkt->pkt_mbuf;
			} else {
				m_freem(kpkt->pkt_mbuf);
			}
			KPKT_CLEAR_MBUF_DATA(kpkt);
		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
			ASSERT(kpkt->pkt_pkt != NULL);
			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
			if (kpp != NULL) {
				/* hand the packet back for batched freeing */
				ASSERT(*kpp == NULL);
				*kpp = kpkt->pkt_pkt;
			} else {
				/* can only recurse once */
				ASSERT((kpkt->pkt_pkt->pkt_pflags &
				    PKT_F_PKT_DATA) == 0);
				pp_free_packet_single(kpkt->pkt_pkt);
			}
			KPKT_CLEAR_PKT_DATA(kpkt);
		}
		kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
		ASSERT(kpkt->pkt_nextpkt == NULL);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
		break;
	}
	default:
		break;
	}

	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
		/* strip buflets onto the per-cache free lists */
		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
		    blist_large, blist_nocahce_large);
	}
	return mdp;
}
2339 
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	/*
	 * Batch-free a pkt_nextpkt-linked chain of packets back to their
	 * pool: finalize each packet, accumulate the metadata on a single
	 * freelist, and collect any attached mbufs/packets so they can be
	 * freed in one pass at the end.  Optionally reports the number of
	 * packets freed via *npkt.
	 *
	 * All packets in the chain must belong to the same pool as the
	 * first one (asserted below).
	 */
	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the finalized metadata to the freelist */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance tail pointers for any detached mbuf/packet */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* attached packets recurse at most once (asserted in fini) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2403 
2404 void
pp_free_pktq(struct pktq * pktq)2405 pp_free_pktq(struct pktq *pktq)
2406 {
2407 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2408 		return;
2409 	}
2410 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2411 	pp_free_packet_chain(pkt, NULL);
2412 	KPKTQ_DISPOSE(pktq);
2413 }
2414 
2415 void
pp_drop_pktq(struct pktq * pktq,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2416 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2417     drop_reason_t reason, const char *funcname, uint16_t linenum)
2418 {
2419 	drop_func_t dropfunc;
2420 	struct __kern_packet *kpkt;
2421 
2422 	if (KPKTQ_EMPTY(pktq)) {
2423 		return;
2424 	}
2425 	if (__probable(droptap_total_tap_count == 0)) {
2426 		goto nodroptap;
2427 	}
2428 
2429 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2430 		dropfunc = droptap_output_packet;
2431 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2432 		dropfunc = droptap_input_packet;
2433 	} else {
2434 		goto nodroptap;
2435 	}
2436 
2437 	KPKTQ_FOREACH(kpkt, pktq) {
2438 		dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2439 		    kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2440 	}
2441 
2442 nodroptap:
2443 	pp_free_pktq(pktq);
2444 }
2445 
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp = NULL;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	/*
	 * Array counterpart of pp_free_packet_chain(): finalize each
	 * packet in `array', accumulate metadata on one freelist and
	 * collect detached mbufs/packets for a single free pass at the
	 * end.  Entries are zeroed as they are consumed.  All entries
	 * must belong to `pp' (asserted below).
	 */
	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the finalized metadata to the freelist */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance tail pointers for any detached mbuf/packet */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* attached packets recurse at most once (asserted in fini) */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2503 
2504 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2505 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2506 {
2507 	pp_free_packet_array(pp, &kqum, 1);
2508 }
2509 
2510 void
pp_free_packet_batch(const kern_pbufpool_t pp,uint64_t * __counted_by (size)array,uint32_t size)2511 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
2512 {
2513 	pp_free_packet_array(pp, array, size);
2514 }
2515 
2516 void
pp_free_packet_single(struct __kern_packet * pkt)2517 pp_free_packet_single(struct __kern_packet *pkt)
2518 {
2519 	ASSERT(pkt->pkt_nextpkt == NULL);
2520 	pp_free_packet(__DECONST(struct kern_pbufpool *,
2521 	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2522 }
2523 
2524 void
pp_drop_packet_single(struct __kern_packet * pkt,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2525 pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
2526     drop_reason_t reason, const char *funcname, uint16_t linenum)
2527 {
2528 	drop_func_t dropfunc;
2529 
2530 	if (pkt->pkt_length == 0) {
2531 		return;
2532 	}
2533 	if (__probable(droptap_total_tap_count == 0)) {
2534 		goto nodroptap;
2535 	}
2536 
2537 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2538 		dropfunc = droptap_output_packet;
2539 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2540 		dropfunc = droptap_input_packet;
2541 	} else {
2542 		goto nodroptap;
2543 	}
2544 
2545 	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
2546 	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2547 
2548 nodroptap:
2549 	pp_free_packet_single(pkt);
2550 }
2551 
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	/*
	 * Allocate one raw buffer from the pool's default or large buffer
	 * cache and fill in *oi with its object info.  Returns the buffer
	 * address, or 0 on failure.
	 *
	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
	 * other (safe) pointer type, because IOSkywalkFamily depends on this
	 * being mach_vm_address_t
	 */
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 * When it fires, deliberately fail the allocation to exercise
	 * callers' error paths.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			skmem_cache_free(skm,
			    __unsafe_forge_single(struct skmem_obj *, baddr));
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* fetch segment/index info for the caller */
	skmem_cache_get_obj_info(skm,
	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2597 
2598 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2599 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2600     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2601 {
2602 	struct skmem_obj_info oib;
2603 
2604 	VERIFY(pp != NULL && baddr != NULL);
2605 	VERIFY((seg != NULL) == (idx != NULL));
2606 
2607 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2608 		return ENOTSUP;
2609 	}
2610 
2611 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2612 	if (__improbable(*baddr == 0)) {
2613 		return ENOMEM;
2614 	}
2615 
2616 	if (seg != NULL) {
2617 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2618 		*seg = SKMEM_OBJ_SEG(&oib);
2619 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2620 	}
2621 	return 0;
2622 }
2623 
2624 void
pp_free_buffer(const kern_pbufpool_t pp,mach_vm_address_t addr)2625 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2626 {
2627 	ASSERT(pp != NULL && addr != 0);
2628 	skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
2629 		    struct skmem_obj *, addr));
2630 }
2631 
/*
 * Batch-allocate up to "num" external buflets from the pool's buflet
 * cache ("large" selects the large-buffer buflet cache) and store their
 * addresses into "array".  Returns the number actually allocated, which
 * may be fewer than requested (or zero) on cache exhaustion.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp,
    uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
    bool large)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *__single list;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* a large-buflet request requires the pool to have a large buf size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);

	/* pull a linked chain of raw objects from the appropriate cache */
	if (large) {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
	} else {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
	}

	/* walk the chain: unlink, initialize, and publish each buflet */
	array_cp = array;
	while (list != NULL) {
		struct skmem_obj *listn;

		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kbft !=
		    vm_memtag_canonicalize_address((vm_offset_t)kbft));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		/* initialize the external buflet against this pool */
		KBUF_EXT_INIT(kbft, pp);
		*array_cp = (uint64_t)kbft;
		++array_cp;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	/* chain length must agree with the cache's reported count */
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2680 
2681 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,bool large)2682 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2683     bool large)
2684 {
2685 	uint64_t bft;
2686 
2687 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2688 		return ENOMEM;
2689 	}
2690 	*kbft = __unsafe_forge_single(kern_buflet_t, bft);
2691 	return 0;
2692 }
2693 
2694 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * __counted_by (* size)array,uint32_t * size,uint32_t skmflag,bool large)2695 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2696     uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2697     bool large)
2698 {
2699 	uint32_t i, n;
2700 	int err;
2701 
2702 	ASSERT(array != NULL && size > 0);
2703 
2704 	n = *size;
2705 	i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2706 	*size = i;
2707 
2708 	if (__probable(i == n)) {
2709 		err = 0;
2710 	} else if (i != 0) {
2711 		err = EAGAIN;
2712 	} else {
2713 		err = ENOMEM;
2714 	}
2715 
2716 	return err;
2717 }
2718 
/*
 * Release a buflet back to its pool.  Two cases:
 *  - BUFLET_FLAG_EXTERNAL: the buflet object itself came from a buflet
 *    cache; free it there (bypassing the cache layer while the attached
 *    buffer still has other users).
 *  - otherwise: the buflet is embedded; run the destructor and free the
 *    underlying buffer object once its use count drops to zero.
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	/* caller must have detached this buflet from any chain */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be linked on the upp hash list */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		if (kbft->buf_ctl->bc_usecnt > 1) {
			/*
			 * Buffer is shared with other buflets: return the
			 * buflet object without letting the cache touch the
			 * (still referenced) buffer.
			 */
			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		} else {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		}
	} else if (__probable(kbft->buf_addr != 0)) {
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* detach the buffer; usecnt receives the remaining refs */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		/* last reference: give the buffer back to its cache */
		if (__probable(usecnt == 0)) {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2758 
2759 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2760 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2761 {
2762 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2763 	ASSERT(pp != NULL && kbft != NULL);
2764 	pp_free_buflet_common(pp, kbft);
2765 }
2766 
2767 void
pp_reap_caches(boolean_t purge)2768 pp_reap_caches(boolean_t purge)
2769 {
2770 	skmem_cache_reap_now(pp_opt_cache, purge);
2771 	skmem_cache_reap_now(pp_flow_cache, purge);
2772 	skmem_cache_reap_now(pp_compl_cache, purge);
2773 }
2774