xref: /xnu-11215.41.3/bsd/skywalk/packet/pbufpool.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33 
34 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
35 static void pp_free(struct kern_pbufpool *);
36 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
37     uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
38     const void *, uint32_t);
39 static void pp_free_packet_array(struct kern_pbufpool *,
40     uint64_t *__counted_by(num)array, uint32_t num);
41 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
42     struct skmem_obj_info *, void *, uint32_t);
43 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
44     struct skmem_obj_info *, void *, uint32_t);
45 static void pp_metadata_dtor(void *, void *);
46 static int pp_metadata_construct(struct __kern_quantum *,
47     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
48     uint16_t, bool, struct skmem_obj **);
49 static void pp_metadata_destruct(struct __kern_quantum *,
50     struct kern_pbufpool *, bool);
51 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
52     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
53 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
54     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
55     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
56 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
57 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
58 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_destroy_upp_locked(struct kern_pbufpool *);
60 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
61 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
62 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
63 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
64     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
65 static inline uint32_t
66 pp_alloc_buflet_common(struct kern_pbufpool *pp,
67     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
68     bool large);
69 
70 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
71 #define KERN_BUF_MIN_STRIDING_SIZE      256 * 1024
72 
73 /*
74  * Since the inputs are small (indices to the metadata region), we can use
75  * Knuth's multiplicative hash method which is fast and good enough.  Here
76  * we multiply the input by the golden ratio of 2^32.  See "The Art of
77  * Computer Programming", section 6.4.
78  */
79 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
80 	(((_i) * 2654435761U) & (_m))
81 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
82 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
83 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
84 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
85 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
86 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
87 
88 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
89 
90 #define SKMEM_TAG_PBUFPOOL_HASH  "com.apple.skywalk.pbufpool.hash"
91 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
92 
93 #define SKMEM_TAG_PBUFPOOL_BFT_HASH  "com.apple.skywalk.pbufpool.bft.hash"
94 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
95 
96 struct kern_pbufpool_u_htbl {
97 	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
98 };
99 
100 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
101 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
102 
103 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
104 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
105 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
106 
107 static int __pp_inited = 0;
108 
109 int
pp_init(void)110 pp_init(void)
111 {
112 	_CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
113 	_CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
114 	_CASSERT(KPKT_SC_BK == MBUF_SC_BK);
115 	_CASSERT(KPKT_SC_BE == MBUF_SC_BE);
116 	_CASSERT(KPKT_SC_RD == MBUF_SC_RD);
117 	_CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
118 	_CASSERT(KPKT_SC_AV == MBUF_SC_AV);
119 	_CASSERT(KPKT_SC_RV == MBUF_SC_RV);
120 	_CASSERT(KPKT_SC_VI == MBUF_SC_VI);
121 	_CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
122 	_CASSERT(KPKT_SC_VO == MBUF_SC_VO);
123 	_CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
124 
125 	_CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
126 	_CASSERT(KPKT_SC_BK == PKT_SC_BK);
127 	_CASSERT(KPKT_SC_BE == PKT_SC_BE);
128 	_CASSERT(KPKT_SC_RD == PKT_SC_RD);
129 	_CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
130 	_CASSERT(KPKT_SC_AV == PKT_SC_AV);
131 	_CASSERT(KPKT_SC_RV == PKT_SC_RV);
132 	_CASSERT(KPKT_SC_VI == PKT_SC_VI);
133 	_CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
134 	_CASSERT(KPKT_SC_VO == PKT_SC_VO);
135 	_CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
136 	_CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
137 
138 	_CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
139 	_CASSERT(KPKT_TC_BE == MBUF_TC_BE);
140 	_CASSERT(KPKT_TC_BK == MBUF_TC_BK);
141 	_CASSERT(KPKT_TC_VI == MBUF_TC_VI);
142 	_CASSERT(KPKT_TC_VO == MBUF_TC_VO);
143 	_CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
144 
145 	_CASSERT(KPKT_TC_BE == PKT_TC_BE);
146 	_CASSERT(KPKT_TC_BK == PKT_TC_BK);
147 	_CASSERT(KPKT_TC_VI == PKT_TC_VI);
148 	_CASSERT(KPKT_TC_VO == PKT_TC_VO);
149 
150 	_CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
151 	_CASSERT(PKT_SCVAL_BK == SCVAL_BK);
152 	_CASSERT(PKT_SCVAL_BE == SCVAL_BE);
153 	_CASSERT(PKT_SCVAL_RD == SCVAL_RD);
154 	_CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
155 	_CASSERT(PKT_SCVAL_AV == SCVAL_AV);
156 	_CASSERT(PKT_SCVAL_RV == SCVAL_RV);
157 	_CASSERT(PKT_SCVAL_VI == SCVAL_VI);
158 	_CASSERT(PKT_SCVAL_VO == SCVAL_VO);
159 	_CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
160 
161 	/*
162 	 * Assert that the value of common packet flags between mbuf and
163 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
164 	 */
165 	_CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
166 	_CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
167 	_CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
168 	_CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
169 	_CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
170 	_CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
171 	_CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
172 	_CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
173 	_CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
174 	_CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
175 	_CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
176 	_CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
177 	_CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
178 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
179 	    PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
180 	    PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
181 	/*
182 	 * Assert packet flags shared with userland.
183 	 */
184 	_CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
185 	    PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
186 	    PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
187 
188 	_CASSERT(offsetof(struct __kern_quantum, qum_len) ==
189 	    offsetof(struct __kern_packet, pkt_length));
190 
191 	/*
192 	 * Due to the use of tagged pointer, we need the size of
193 	 * the metadata preamble structure to be multiples of 16.
194 	 * See SK_PTR_TAG() definition for details.
195 	 */
196 	_CASSERT(sizeof(struct __metadata_preamble) != 0 &&
197 	    (sizeof(struct __metadata_preamble) % 16) == 0);
198 
199 	_CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
200 	    NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
201 
202 	/*
203 	 * Batch alloc/free requires linking the objects together;
204 	 * make sure that the fields are at the same offset since
205 	 * we cast the object to struct skmem_obj.
206 	 */
207 	_CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
208 	    offsetof(struct skmem_obj, mo_next));
209 	_CASSERT(offsetof(struct __buflet, __buflet_next) ==
210 	    offsetof(struct skmem_obj, mo_next));
211 
212 	SK_LOCK_ASSERT_HELD();
213 	ASSERT(!__pp_inited);
214 
215 	pp_opt_cache = skmem_cache_create("pkt.opt",
216 	    sizeof(struct __packet_opt), sizeof(uint64_t),
217 	    NULL, NULL, NULL, NULL, NULL, 0);
218 	pp_flow_cache = skmem_cache_create("pkt.flow",
219 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
220 	    NULL, NULL, NULL, NULL, NULL, 0);
221 	pp_compl_cache = skmem_cache_create("pkt.compl",
222 	    sizeof(struct __packet_compl), sizeof(uint64_t),
223 	    NULL, NULL, NULL, NULL, NULL, 0);
224 
225 	return 0;
226 }
227 
228 void
pp_fini(void)229 pp_fini(void)
230 {
231 	SK_LOCK_ASSERT_HELD();
232 
233 	if (__pp_inited) {
234 		if (pp_compl_cache != NULL) {
235 			skmem_cache_destroy(pp_compl_cache);
236 			pp_compl_cache = NULL;
237 		}
238 		if (pp_flow_cache != NULL) {
239 			skmem_cache_destroy(pp_flow_cache);
240 			pp_flow_cache = NULL;
241 		}
242 		if (pp_opt_cache != NULL) {
243 			skmem_cache_destroy(pp_opt_cache);
244 			pp_opt_cache = NULL;
245 		}
246 
247 		__pp_inited = 0;
248 	}
249 }
250 
251 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)252 pp_alloc(zalloc_flags_t how)
253 {
254 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
255 
256 	if (pp) {
257 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
258 	}
259 	return pp;
260 }
261 
262 static void
pp_free(struct kern_pbufpool * pp)263 pp_free(struct kern_pbufpool *pp)
264 {
265 	PP_LOCK_ASSERT_HELD(pp);
266 
267 	pp_destroy(pp);
268 	PP_UNLOCK(pp);
269 
270 	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
271 	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
272 	zfree(pp_zone, pp);
273 }
274 
275 void
pp_retain_locked(struct kern_pbufpool * pp)276 pp_retain_locked(struct kern_pbufpool *pp)
277 {
278 	PP_LOCK_ASSERT_HELD(pp);
279 
280 	pp->pp_refcnt++;
281 	ASSERT(pp->pp_refcnt != 0);
282 }
283 
/*
 * Convenience wrapper: take a reference on the pool while
 * acquiring and releasing the pool lock around the bump.
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
291 
292 boolean_t
pp_release_locked(struct kern_pbufpool * pp)293 pp_release_locked(struct kern_pbufpool *pp)
294 {
295 	uint32_t oldref = pp->pp_refcnt;
296 
297 	PP_LOCK_ASSERT_HELD(pp);
298 
299 	ASSERT(pp->pp_refcnt != 0);
300 	if (--pp->pp_refcnt == 0) {
301 		pp_free(pp);
302 	}
303 
304 	return oldref == 1;
305 }
306 
307 boolean_t
pp_release(struct kern_pbufpool * pp)308 pp_release(struct kern_pbufpool *pp)
309 {
310 	boolean_t lastref;
311 
312 	PP_LOCK(pp);
313 	if (!(lastref = pp_release_locked(pp))) {
314 		PP_UNLOCK(pp);
315 	}
316 
317 	return lastref;
318 }
319 
320 void
pp_close(struct kern_pbufpool * pp)321 pp_close(struct kern_pbufpool *pp)
322 {
323 	PP_LOCK(pp);
324 	ASSERT(pp->pp_refcnt > 0);
325 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
326 	pp->pp_flags |= PPF_CLOSED;
327 	if (!pp_release_locked(pp)) {
328 		PP_UNLOCK(pp);
329 	}
330 }
331 
332 /*
333  * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
334  * size for the srp_array. This is same as marking it __counted_by(SKMEM_REGIONS)
335  */
336 void
pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)337 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
338     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
339     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
340     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
341 {
342 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
343 	    *lbuf_srp;
344 	uint32_t md_size = 0;
345 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
346 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
347 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
348 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
349 	bool md_magazine_enable = ((flags &
350 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
351 
352 	ASSERT(max_frags != 0);
353 
354 	switch (md_type) {
355 	case NEXUS_META_TYPE_QUANTUM:
356 		md_size = NX_METADATA_QUANTUM_SZ;
357 		break;
358 	case NEXUS_META_TYPE_PACKET:
359 		md_size = NX_METADATA_PACKET_SZ(max_frags);
360 		break;
361 	default:
362 		VERIFY(0);
363 		/* NOTREACHED */
364 		__builtin_unreachable();
365 	}
366 
367 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
368 	case PP_REGION_CONFIG_BUF_IODIR_IN:
369 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
370 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
371 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
372 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
373 		break;
374 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
375 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
376 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
377 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
378 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
379 		break;
380 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
381 	default:
382 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
383 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
384 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
385 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
386 		break;
387 	}
388 
389 	/* add preamble size to metadata obj size */
390 	md_size += METADATA_PREAMBLE_SZ;
391 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
392 
393 	/* configure kernel metadata region */
394 	kmd_srp->srp_md_type = md_type;
395 	kmd_srp->srp_md_subtype = md_subtype;
396 	kmd_srp->srp_r_obj_cnt = md_cnt;
397 	kmd_srp->srp_r_obj_size = md_size;
398 	kmd_srp->srp_max_frags = max_frags;
399 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
400 	if (md_persistent) {
401 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
402 	}
403 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
404 	if (md_magazine_enable) {
405 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
406 	}
407 	skmem_region_params_config(kmd_srp);
408 
409 	/* configure user metadata region */
410 	srp = &srp_array[SKMEM_REGION_UMD];
411 	if (!kernel_only) {
412 		srp->srp_md_type = kmd_srp->srp_md_type;
413 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
414 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
415 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
416 		srp->srp_max_frags = kmd_srp->srp_max_frags;
417 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
418 		if (md_persistent) {
419 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
420 		}
421 		/*
422 		 * UMD is a mirrored region and object allocation operations
423 		 * are performed on the KMD objects.
424 		 */
425 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
426 		skmem_region_params_config(srp);
427 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
428 	} else {
429 		ASSERT(srp->srp_r_obj_cnt == 0);
430 		ASSERT(srp->srp_r_obj_size == 0);
431 	}
432 
433 	/* configure buffer region */
434 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
435 	buf_srp->srp_r_obj_size = buf_size;
436 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
437 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
438 	if (buf_persistent) {
439 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
440 	}
441 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
442 	if (buf_srp->srp_r_obj_size >= KERN_BUF_MIN_STRIDING_SIZE) {
443 		/*
444 		 * A buffer size larger than 256K indicates striding is in use, which
445 		 * means a buffer could be detached from a buflet. In this case, magzine
446 		 * layer should be enabled.
447 		 */
448 		buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
449 	}
450 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
451 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
452 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
453 	}
454 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
455 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
456 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
457 	}
458 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
459 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
460 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
461 	}
462 	ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
463 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
464 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
465 	}
466 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
467 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
468 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
469 	}
470 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
471 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
472 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
473 	}
474 	if (buf_seg_size != 0) {
475 		buf_srp->srp_r_seg_size = buf_seg_size;
476 	}
477 	skmem_region_params_config(buf_srp);
478 
479 	/* configure large buffer region */
480 	if (large_buf_size != 0) {
481 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
482 		lbuf_srp->srp_r_obj_size = large_buf_size;
483 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
484 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
485 		skmem_region_params_config(lbuf_srp);
486 	}
487 
488 	/* configure kernel buflet region */
489 	if (config_buflet) {
490 		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
491 		/*
492 		 * Ideally we want the number of buflets to be
493 		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
494 		 * so that we have enough buflets when multi-buflet and
495 		 * shared buffer object is used.
496 		 * Currently multi-buflet is being used only by user pool
497 		 * which doesn't support shared buffer object, hence to reduce
498 		 * the number of objects we are restricting the number of
499 		 * buflets to the number of buffers.
500 		 */
501 		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
502 		    lbuf_srp->srp_c_obj_cnt;
503 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
504 		    sizeof(struct __user_buflet));
505 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
506 		skmem_region_params_config(kbft_srp);
507 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
508 		    lbuf_srp->srp_c_obj_cnt);
509 	} else {
510 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
511 		ASSERT(kbft_srp->srp_r_obj_size == 0);
512 	}
513 
514 	/* configure user buflet region */
515 	srp = &srp_array[SKMEM_REGION_UBFT];
516 	if (config_buflet && !kernel_only) {
517 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
518 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
519 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
520 		skmem_region_params_config(srp);
521 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
522 	} else {
523 		ASSERT(srp->srp_r_obj_cnt == 0);
524 		ASSERT(srp->srp_r_obj_size == 0);
525 	}
526 
527 	/* make sure each metadata can be paired with a buffer */
528 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
529 }
530 
/*
 * Construct the kernel (and, when uqum != NULL, mirrored user) metadata
 * object at index `midx', attaching `bufcnt' buffers to it.  When the
 * pool uses buffers-on-demand, pre-constructed buflets are consumed from
 * *blist (advanced as they are taken); otherwise a single buffer is
 * allocated here and attached to the quantum's native buflet.
 *
 * `raw' distinguishes a freshly allocated object (allocate the opt/flow/
 * completion companions) from a re-construction that reuses the
 * companions already attached to the packet.
 *
 * Returns 0 on success, or ENOMEM after undoing partial work via
 * pp_metadata_destruct().
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets only make sense with buffer-on-demand pools */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *__single opt;
		struct __flow *__single flow;
		struct __packet_compl *__single compl;
		uint64_t pflags;

		if (raw) {
			/* fresh object: allocate all three companions */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reuse companions already attached to the packet */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	/* attach bufcnt buffers, one per loop iteration */
	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
			/* Checking to ensure the object address is tagged */
			ASSERT((vm_offset_t)kbuf !=
			    vm_memtag_canonicalize_address((vm_offset_t)kbuf));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

			/* unlink the buflet from the caller's list ... */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			/* ... and chain it onto the previous buflet */
			KBUF_EXT_INIT(kbuf, pp);
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* release whatever was attached so far (companions included) */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
659 
/*
 * Common skmem cache constructor body for pool metadata objects.
 * oi0 describes the kernel metadata object being constructed; oim0, if
 * non-NULL, describes the mirrored user metadata object.  `no_buflet'
 * selects between constructing with zero buflets or pp_max_frags of
 * them.  Returns 0 on success or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *__single blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	/* fault injection: periodically fail non-blocking allocations */
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* kernel quantum lives just past the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		/* shared pool: a mirrored user quantum must exist */
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets pp_metadata_construct() did not consume */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
743 
744 static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)745 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
746     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
747 {
748 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
749 }
750 
751 static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)752 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
753     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
754 {
755 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
756 }
757 
758 __attribute__((always_inline))
759 static void
pp_metadata_destruct_common(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocache_large)760 pp_metadata_destruct_common(struct __kern_quantum *kqum,
761     struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
762     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
763     struct skmem_obj **blist_nocache_large)
764 {
765 	struct __kern_buflet *kbuf, *nbuf;
766 	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
767 	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
768 	struct skmem_obj **pp_blist_def = &p_blist_def;
769 	struct skmem_obj **pp_blist_large = &p_blist_large;
770 	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
771 	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
772 	uint16_t bufcnt, i = 0;
773 	bool first_buflet_empty;
774 
775 	ASSERT(blist_def != NULL);
776 	ASSERT(blist_large != NULL);
777 
778 	switch (pp->pp_md_type) {
779 	case NEXUS_META_TYPE_PACKET: {
780 		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
781 
782 		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
783 		ASSERT(kpkt->pkt_qum.qum_pp == pp);
784 		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
785 		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
786 		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
787 		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
788 		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
789 		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
790 		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
791 		bufcnt = kpkt->pkt_bufs_cnt;
792 		kbuf = &kqum->qum_buf[0];
793 		/*
794 		 * special handling for empty first buflet.
795 		 */
796 		first_buflet_empty = (kbuf->buf_addr == 0);
797 		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
798 		break;
799 	}
800 	default:
801 		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
802 		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
803 		ASSERT(kqum->qum_pp == pp);
804 		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
805 		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
806 		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
807 		ASSERT(kqum->qum_ksd == NULL);
808 		kbuf = &kqum->qum_buf[0];
809 		/*
810 		 * XXX: Special handling for quantum as we don't currently
811 		 * define bufs_{cnt,max} there.  Given that we support at
812 		 * most only 1 buflet for now, check if buf_addr is non-NULL.
813 		 * See related code in pp_metadata_construct().
814 		 */
815 		first_buflet_empty = (kbuf->buf_addr == 0);
816 		bufcnt = first_buflet_empty ? 0 : 1;
817 		break;
818 	}
819 
820 	/*
821 	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
822 	 * unsafe, so we forge it here.
823 	 */
824 	nbuf = __unsafe_forge_single(struct __kern_buflet *,
825 	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
826 	BUF_NBFT_ADDR(kbuf, 0);
827 	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
828 	if (!first_buflet_empty) {
829 		pp_free_buflet_common(pp, kbuf);
830 		++i;
831 	}
832 
833 	while (nbuf != NULL) {
834 		ASSERT(nbuf->buf_ctl != NULL);
835 		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
836 			/*
837 			 * bc_usecnt larger than 1 means the buffer has been cloned and is
838 			 * still being used by other bflts. In this case, when we free
839 			 * this bflt we need to explicitly ask for it to not be cached again
840 			 * into magzine layer to prevent immediate reuse of the buffer and
841 			 * data corruption.
842 			 */
843 			if (nbuf->buf_ctl->bc_usecnt > 1) {
844 				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
845 				pp_blist_nocache_large =
846 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
847 			} else {
848 				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
849 				pp_blist_large =
850 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
851 			}
852 		} else {
853 			if (nbuf->buf_ctl->bc_usecnt > 1) {
854 				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
855 				pp_blist_nocache_def =
856 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
857 			} else {
858 				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
859 				pp_blist_def =
860 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
861 			}
862 		}
863 		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
864 		nbuf = __unsafe_forge_single(struct __kern_buflet *,
865 		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
866 		++i;
867 	}
868 
869 	ASSERT(i == bufcnt);
870 
871 	if (p_blist_def != NULL) {
872 		*pp_blist_def = *blist_def;
873 		*blist_def = p_blist_def;
874 	}
875 	if (p_blist_large != NULL) {
876 		*pp_blist_large = *blist_large;
877 		*blist_large = p_blist_large;
878 	}
879 	if (p_blist_nocache_def != NULL) {
880 		*pp_blist_nocache_def = *blist_nocache_def;
881 		*blist_nocache_def = p_blist_nocache_def;
882 	}
883 	if (p_blist_nocache_large != NULL) {
884 		*pp_blist_nocache_large = *blist_nocache_large;
885 		*blist_nocache_large = p_blist_nocache_large;
886 	}
887 
888 	/* if we're about to return this object to the slab, clean it up */
889 	if (raw) {
890 		switch (pp->pp_md_type) {
891 		case NEXUS_META_TYPE_PACKET: {
892 			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
893 
894 			ASSERT(kpkt->pkt_com_opt != NULL ||
895 			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
896 			if (kpkt->pkt_com_opt != NULL) {
897 				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
898 				skmem_cache_free(pp_opt_cache,
899 				    kpkt->pkt_com_opt);
900 				kpkt->pkt_com_opt = NULL;
901 			}
902 			ASSERT(kpkt->pkt_flow != NULL ||
903 			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
904 			if (kpkt->pkt_flow != NULL) {
905 				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
906 				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
907 				kpkt->pkt_flow = NULL;
908 			}
909 			ASSERT(kpkt->pkt_tx_compl != NULL ||
910 			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
911 			if (kpkt->pkt_tx_compl != NULL) {
912 				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
913 				skmem_cache_free(pp_compl_cache,
914 				    kpkt->pkt_tx_compl);
915 				kpkt->pkt_tx_compl = NULL;
916 			}
917 			kpkt->pkt_pflags = 0;
918 			break;
919 		}
920 		default:
921 			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
922 			/* nothing to do for quantum (yet) */
923 			break;
924 		}
925 	}
926 }
927 
928 __attribute__((always_inline))
929 static void
pp_free_kbft_list(struct kern_pbufpool * pp,struct skmem_obj * blist_def,struct skmem_obj * blist_nocache_def,struct skmem_obj * blist_large,struct skmem_obj * blist_nocache_large)930 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
931     struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
932 {
933 	if (blist_def != NULL) {
934 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
935 	}
936 	if (blist_large != NULL) {
937 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
938 	}
939 	if (blist_nocache_def != NULL) {
940 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
941 	}
942 	if (blist_nocache_large != NULL) {
943 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
944 	}
945 }
946 
947 __attribute__((always_inline))
948 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)949 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
950     bool raw)
951 {
952 	struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
953 	struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
954 
955 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
956 	    &blist_large, &blist_nocache_large);
957 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
958 }
959 
960 static void
pp_metadata_dtor(void * addr,void * arg)961 pp_metadata_dtor(void *addr, void *arg)
962 {
963 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
964 	    METADATA_PREAMBLE_SZ), arg, TRUE);
965 }
966 
967 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)968 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
969 {
970 	struct kern_pbufpool *__single pp = arg;
971 
972 	if (pp->pp_pbuf_seg_ctor != NULL) {
973 		pp->pp_pbuf_seg_ctor(pp, sg, md);
974 	}
975 }
976 
977 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)978 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
979 {
980 	struct kern_pbufpool *__single pp = arg;
981 
982 	if (pp->pp_pbuf_seg_dtor != NULL) {
983 		pp->pp_pbuf_seg_dtor(pp, sg, md);
984 	}
985 }
986 
987 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large)988 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
989     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
990 {
991 #pragma unused (skmflag)
992 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
993 	struct __kern_buflet *kbft;
994 	struct __user_buflet *ubft;
995 	struct skmem_obj_info oib;
996 	mach_vm_address_t baddr;
997 	obj_idx_t oi_idx_reg;
998 
999 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
1000 	if (__improbable(baddr == 0)) {
1001 		return ENOMEM;
1002 	}
1003 	/*
1004 	 * Note that oi0 and oim0 may be stored inside the object itself;
1005 	 * so copy what is required to local variables before constructing.
1006 	 */
1007 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
1008 	kbft = SKMEM_OBJ_ADDR(oi0);
1009 
1010 	if (__probable(!PP_KERNEL_ONLY(pp))) {
1011 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
1012 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
1013 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
1014 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
1015 		ubft = SKMEM_OBJ_ADDR(oim0);
1016 	} else {
1017 		ASSERT(oim0 == NULL);
1018 		ubft = NULL;
1019 	}
1020 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
1021 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
1022 	return 0;
1023 }
1024 
1025 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1026 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1027     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1028 {
1029 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
1030 }
1031 
1032 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1033 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1034     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1035 {
1036 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
1037 }
1038 
1039 static void
pp_buflet_metadata_dtor(void * addr,void * arg)1040 pp_buflet_metadata_dtor(void *addr, void *arg)
1041 {
1042 	struct __kern_buflet *__single kbft = addr;
1043 	void *objaddr = kbft->buf_objaddr;
1044 	struct kern_pbufpool *__single pp = arg;
1045 	uint32_t usecnt = 0;
1046 	bool large = BUFLET_HAS_LARGE_BUF(kbft);
1047 
1048 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1049 	/*
1050 	 * don't assert for (buf_nbft_addr == 0) here as constructed
1051 	 * buflet may have this field as non-zero. This is because
1052 	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
1053 	 * for chaining the buflets.
1054 	 * To ensure that the frred buflet was not part of a chain we
1055 	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
1056 	 */
1057 	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
1058 	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
1059 	    NULL);
1060 	ASSERT(kbft->buf_addr != 0);
1061 	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
1062 	ASSERT(kbft->buf_ctl != NULL);
1063 
1064 	KBUF_DTOR(kbft, usecnt);
1065 	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
1066 	    SK_KVA(objaddr), usecnt);
1067 	if (__probable(usecnt == 0)) {
1068 		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
1069 		    PP_BUF_CACHE_DEF(pp), objaddr);
1070 	}
1071 }
1072 
1073 /*
1074  * -fbounds-safety: all callers of pp_create use srp_array with a known size:
1075  * SKMEM_REGIONS. This is same as marking it __counted_by(SKMEM_REGIONS)
1076  */
/*
 * Create a packet buffer pool from the supplied region parameters.
 * Sets up the metadata (KMD/UMD), on-demand buflet (KBFT/UBFT) and
 * buffer (default/large) regions and their object caches, as directed
 * by the PPCREATEF_* bits in "ppcreatef".  Returns a retained pool on
 * success; on failure, partially-created state is unwound via
 * pp_close() and NULL is returned.
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint32_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	uint32_t buf_def_cflags;
	char cname[64];
	const char *__null_terminated cache_name = NULL;
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Select the source region set: the combined pool (KMD), else the
	 * RX-only pool (RXKMD), else the TX-only pool (TXKMD) — whichever
	 * has objects configured, in that order of preference.
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet regions are only needed for on-demand buffer attachment */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	/*
	 * For user-visible pools, the user metadata (and buflet) regions
	 * mirror their kernel counterparts; the shapes must match.
	 */
	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	/* snapshot the pool geometry from the selected region params */
	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;
	def_buf_size = def_buf_obj_size;
	large_buf_size = lbuf_srp->srp_c_obj_size;

#if (DEBUG || DEVELOPMENT)
	/* sanity-check metadata type/size/frag relationships */
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	/* Z_WAITOK allocation; the code below assumes success */
	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* take a reference on the caller-supplied context, if any */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate PPCREATEF_* creation flags into PPF_* pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}

	pp_retain(pp);

	/* derive metadata cache flags; batching is always enabled */
	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	/* user metadata region (user-visible pools only) */
	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	/* kernel metadata region (always present) */
	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	/* user buflet region (user-visible, on-demand pools only) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	/* kernel buflet region (on-demand pools only) */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* pair each kernel region with its user-space mirror */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		/* buflets come later; construct metadata without them */
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			/* Aggressive memory reclaim flag set to kbft_large for now */
			md_cflags |= SKMEM_CR_RECLAIM;
			cache_name = tsnprintf(cname, sizeof(cname),
			    "kbft_large.%s", name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	/* default buffer region, with the client segment ctor/dtor hooks */
	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
	buf_def_cflags = buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES ? SKMEM_CR_NOMAGAZINES : 0;
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
	    def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    buf_def_cflags)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* drop the context reference, then let pp_close() unwind the rest */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1409 
/*
 * Tear down all caches and regions owned by the pool, releasing the
 * context last.  Caller must hold the pool lock.
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	/* purge metadata/buflets still tracked in the user packet pool */
	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer.  Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally drop the caller-supplied context reference */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1485 
1486 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1487 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1488 {
1489 	int i, err = 0;
1490 
1491 	if (pp->pp_u_hash_table != NULL) {
1492 		goto done;
1493 	}
1494 
1495 	/* allocated-address hash table */
1496 	/*
1497 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1498 	 * if we see any performance hit, we can check if this caused it.
1499 	 */
1500 	if (can_block) {
1501 		pp->pp_u_hash_table = sk_alloc_type_array(
1502 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1503 			Z_WAITOK, skmem_tag_pbufpool_hash);
1504 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1505 	} else {
1506 		pp->pp_u_hash_table = sk_alloc_type_array(
1507 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1508 			Z_NOWAIT, skmem_tag_pbufpool_hash);
1509 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1510 	}
1511 	if (pp->pp_u_hash_table == NULL) {
1512 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1513 		err = ENOMEM;
1514 		goto done;
1515 	}
1516 
1517 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1518 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1519 	}
1520 done:
1521 	return err;
1522 }
1523 
1524 static void
pp_destroy_upp_locked(struct kern_pbufpool * pp)1525 pp_destroy_upp_locked(struct kern_pbufpool *pp)
1526 {
1527 	PP_LOCK_ASSERT_HELD(pp);
1528 	if (pp->pp_u_hash_table != NULL) {
1529 		/* purge anything that's left */
1530 		pp_purge_upp_locked(pp, -1);
1531 
1532 #if (DEBUG || DEVELOPMENT)
1533 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1534 			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
1535 		}
1536 #endif /* DEBUG || DEVELOPMENT */
1537 
1538 		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
1539 		    pp->pp_u_hash_table_size,
1540 		    pp->pp_u_hash_table);
1541 	}
1542 	ASSERT(pp->pp_u_bufinuse == 0);
1543 }
1544 
1545 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1546 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1547 {
1548 	int err = 0;
1549 
1550 	PP_LOCK(pp);
1551 	err = pp_init_upp_locked(pp, can_block);
1552 	if (err) {
1553 		SK_ERR("packet UPP init failed (%d)", err);
1554 		goto done;
1555 	}
1556 	err = pp_init_upp_bft_locked(pp, can_block);
1557 	if (err) {
1558 		SK_ERR("buflet UPP init failed (%d)", err);
1559 		pp_destroy_upp_locked(pp);
1560 		goto done;
1561 	}
1562 	pp_retain_locked(pp);
1563 done:
1564 	PP_UNLOCK(pp);
1565 	return err;
1566 }
1567 
1568 __attribute__((always_inline))
1569 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1570 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1571     struct __kern_buflet *kbft, pid_t pid)
1572 {
1573 	struct kern_pbufpool_u_bft_bkt *bkt;
1574 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1575 
1576 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1577 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1578 	kbe->kbe_buf_pid = pid;
1579 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1580 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1581 	pp->pp_u_bftinuse++;
1582 }
1583 
1584 __attribute__((always_inline))
1585 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1586 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1587     struct __kern_buflet *kbft, pid_t pid)
1588 {
1589 	while (kbft != NULL) {
1590 		pp_insert_upp_bft_locked(pp, kbft, pid);
1591 		kbft = __unsafe_forge_single(struct __kern_buflet *,
1592 		    __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1593 	}
1594 }
1595 
1596 /* Also inserts the attached chain of buflets */
1597 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1598 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1599     pid_t pid)
1600 {
1601 	struct kern_pbufpool_u_bkt *bkt;
1602 	struct __kern_buflet *kbft;
1603 
1604 	ASSERT(kqum->qum_pid == (pid_t)-1);
1605 	kqum->qum_pid = pid;
1606 
1607 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1608 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1609 	pp->pp_u_bufinuse++;
1610 
1611 	kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
1612 	if (kbft != NULL) {
1613 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1614 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1615 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1616 	}
1617 }
1618 
/*
 * Insert a metadata object (and its buflet chain) into the UPP hash
 * table; lock-held variant (cf. pp_insert_upp(), which takes PP_LOCK).
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1625 
/*
 * Insert a metadata object (and its buflet chain) into the UPP hash
 * table, taking the pool lock around the operation.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1633 
1634 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * __counted_by (num)array,uint32_t num)1635 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1636     uint64_t *__counted_by(num)array, uint32_t num)
1637 {
1638 	uint32_t i = 0;
1639 
1640 	ASSERT(array != NULL && num > 0);
1641 	PP_LOCK(pp);
1642 	while (i < num) {
1643 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1644 
1645 		ASSERT(kqum != NULL);
1646 		pp_insert_upp_common(pp, kqum, pid);
1647 		++i;
1648 	}
1649 	PP_UNLOCK(pp);
1650 }
1651 
/*
 * Look up the external buflet with register index "bft_idx" in the
 * UPP hash table and, if found, unlink it and clear its ownership.
 * Returns the buflet, or NULL when no buflet with that index is
 * tracked (SLIST_FOREACH_SAFE leaves the iterator NULL when the loop
 * runs off the end without hitting the break).
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* no longer owned by any process */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1673 
1674 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1675 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1676 {
1677 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1678 
1679 	*err = __improbable(kbft != NULL) ? 0 : EINVAL;
1680 	return kbft;
1681 }
1682 
/*
 * Walk the user-visible buflet chain of an externalized packet,
 * reclaiming each buflet from the UPP hash table and re-linking the
 * kernel-side chain as it goes.  The walk is bounded by the user
 * packet's claimed buflet count (itself capped at pp_max_frags) so a
 * corrupt user chain cannot loop forever.  On any inconsistency the
 * chain is terminated at the last good element and ERANGE is
 * returned; returns 0 on success.
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* the chain starts at the user packet's first next-buflet index */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no chained buflets; nothing to reclaim */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-claimed buflet count: untrusted, validate against the cap */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* the embedded first buflet counts only if it holds a buffer */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		/* reclaim the next buflet from the UPP table */
		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* chain points at a buflet we never handed out */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* re-link the kernel-side chain to the reclaimed buflet */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
		/* follow the user shadow's notion of the next index */
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the kernel chain at the last reclaimed buflet */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the claimed count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1746 
/*
 * Removes the metadata object with index md_idx from the user packet
 * hash table, disowns it, and detaches its buflet chain.  Returns the
 * object with *err set to the chain-detach result, or NULL with
 * *err = ERANGE when md_idx is not registered.  Caller (pp_remove_upp)
 * holds the pool lock.
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* no longer owned by any process */
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL here iff the loop ran off the end without a match */
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1771 
1772 struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1773 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1774 {
1775 	struct __kern_quantum *kqum;
1776 
1777 	PP_LOCK(pp);
1778 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1779 	PP_UNLOCK(pp);
1780 	return kqum;
1781 }
1782 
1783 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1784 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1785 {
1786 	struct __kern_quantum *kqum, *tqum;
1787 	struct kern_pbufpool_u_bkt *bkt;
1788 
1789 	PP_LOCK(pp);
1790 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1791 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1792 		if (METADATA_IDX(kqum) == md_idx) {
1793 			break;
1794 		}
1795 	}
1796 	PP_UNLOCK(pp);
1797 
1798 	return kqum;
1799 }
1800 
/*
 * Reclaims every metadata object registered to pid, returning each to
 * its pool; a pid of -1 matches all owners.  For each object the
 * buflet chain is detached, the finalized flag and slot descriptor are
 * cleared, and the packet is freed.  Caller holds the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* disown and scrub state before returning to pool */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1834 
/*
 * Reclaims every externally registered buflet owned by pid, returning
 * each to the buflet cache; a pid of -1 matches all owners.  Caller
 * holds the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* disown, unlink, then return to the buflet cache */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1863 
1864 void
pp_purge_upp(struct kern_pbufpool * pp,pid_t pid)1865 pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
1866 {
1867 	PP_LOCK(pp);
1868 	pp_purge_upp_locked(pp, pid);
1869 	pp_purge_upp_bft_locked(pp, pid);
1870 	PP_UNLOCK(pp);
1871 }
1872 
1873 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1874 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1875 {
1876 	int i, err = 0;
1877 
1878 	PP_LOCK_ASSERT_HELD(pp);
1879 	if (pp->pp_u_bft_hash_table != NULL) {
1880 		return 0;
1881 	}
1882 
1883 	/* allocated-address hash table */
1884 	/*
1885 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1886 	 * if we see any performance hit, we can check if this caused it.
1887 	 */
1888 	if (can_block) {
1889 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1890 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1891 			Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1892 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1893 	} else {
1894 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1895 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1896 			Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1897 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1898 	}
1899 	if (pp->pp_u_bft_hash_table == NULL) {
1900 		SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1901 		err = ENOMEM;
1902 		goto fail;
1903 	}
1904 
1905 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1906 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1907 	}
1908 
1909 fail:
1910 	return err;
1911 }
1912 
/*
 * Tears down the user buflet hash table: purges any buflets still
 * registered (pid -1 matches every owner), then frees the table.
 * Safe to call when the table was never allocated.  Called with the
 * pool lock held.
 */
static void
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
		    pp->pp_u_bft_hash_table_size,
		    pp->pp_u_bft_hash_table);
	}
	ASSERT(pp->pp_u_bftinuse == 0);
}
1933 
1934 void
pp_insert_upp_bft(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1935 pp_insert_upp_bft(struct kern_pbufpool *pp,
1936     struct __kern_buflet *kbft, pid_t pid)
1937 {
1938 	PP_LOCK(pp);
1939 	pp_insert_upp_bft_locked(pp, kbft, pid);
1940 	PP_UNLOCK(pp);
1941 }
1942 
1943 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1944 pp_isempty_upp(struct kern_pbufpool *pp)
1945 {
1946 	boolean_t isempty;
1947 
1948 	PP_LOCK(pp);
1949 	isempty = (pp->pp_u_bufinuse == 0);
1950 	PP_UNLOCK(pp);
1951 
1952 	return isempty;
1953 }
1954 
/*
 * (Re)constructs the kernel — and, for shared pools, user — view of a
 * metadata object freshly taken from the metadata cache.  For pools
 * with buffers on demand, attaches bufcnt buflets pulled from *blist.
 * Returns the initialized quantum/packet, or NULL when buflet
 * construction fails (caller frees the metadata and remaining blist).
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the metadata object proper lives just past the preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum =  __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools carry no user shadow object */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach on-demand buflets before touching the metadata body */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/* embedded buflet is unused; chain starts at next */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			/*
			 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
			 * which is unsafe, so we just forge it here.
			 */
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* the chain must end exactly after bufcnt buflets */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
2039 
2040 /*
2041  * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
2042  * packet descriptor cache with no buffer attached and a buflet cache with
2043  * cpu layer caching enabled. While operating in this mode, we can call
2044  * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
2045  * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
2046  * descriptor with no attached buffer from the metadata cache.
2047  * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
2048  * from their respective caches and constructs the packet on behalf of the
2049  * caller.
2050  */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
    uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
    alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_quantum *kqum = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	/* initialize each metadata object, consuming buflets from blist */
	array_cp = array;
	while (plist != NULL) {
		struct skmem_obj *plistn;

		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
		if (kqum == NULL) {
			/* init failed: return unconsumed buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kqum !=
		    vm_memtag_canonicalize_address((vm_offset_t)kqum));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		if (tagged) {
			*array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
			    METADATA_SUBTYPE(kqum));
		} else {
			*array_cp = (uint64_t)kqum;
		}

		/* invoke the per-packet callback with its output index */
		if (cb != NULL) {
			(cb)(*array_cp, (num - need), ctx);
		}

		++array_cp;
		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kqum == NULL);

	/* number of packets actually delivered to the caller */
	return num - need;
}
2126 
2127 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2128 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2129 {
2130 	uint64_t kpkt = 0;
2131 
2132 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2133 	    NULL, NULL, skmflag);
2134 
2135 	return kpkt;
2136 }
2137 
2138 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (* size)array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2139 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2140     uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2141     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2142 {
2143 	uint32_t i, n;
2144 	int err;
2145 
2146 	ASSERT(array != NULL && size > 0);
2147 
2148 	n = *size;
2149 	/*
2150 	 * -fbounds-safety: Originally there was this line here: *size = 0; but
2151 	 * we removed this because array is now __counted_by(*size), so *size =
2152 	 * 0 leads to brk 0x5519. Also, *size is set to i anyway.
2153 	 */
2154 
2155 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2156 	    cb, ctx, skmflag);
2157 	/*
2158 	 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2159 	 * extra careful when *size is updated, like below. Here, we know i will
2160 	 * be less than or equal to the original *size value, so updating *size
2161 	 * is okay.
2162 	 */
2163 	*size = i;
2164 
2165 	if (__probable(i == n)) {
2166 		err = 0;
2167 	} else if (i != 0) {
2168 		err = EAGAIN;
2169 	} else {
2170 		err = ENOMEM;
2171 	}
2172 
2173 	return err;
2174 }
2175 
/*
 * Batch-allocates num packets (each with bufcnt buffers) and enqueues
 * them onto pktq.  Returns 0 when all were allocated, EAGAIN on a
 * partial allocation, ENOMEM when none could be allocated.  The
 * optional cb is invoked per packet with its allocation index.
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	/* initialize each packet, consuming buflets from blist */
	while (plist != NULL) {
		struct skmem_obj *plistn;

		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* init failed: return unconsumed buflets and packets */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kpkt !=
		    vm_memtag_canonicalize_address((vm_offset_t)kpkt));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2253 
2254 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2255 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2256     uint32_t skmflag)
2257 {
2258 	uint32_t bufcnt = pp->pp_max_frags;
2259 	uint64_t kpkt = 0;
2260 
2261 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2262 		bufcnt =
2263 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2264 		ASSERT(bufcnt <= UINT16_MAX);
2265 	}
2266 
2267 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2268 	    NULL, NULL, skmflag);
2269 
2270 	return kpkt;
2271 }
2272 
2273 __attribute__((always_inline))
2274 static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum * kqum,struct kern_pbufpool * pp,struct mbuf ** mp,struct __kern_packet ** kpp,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocahce_large)2275 pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
2276     struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
2277     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
2278     struct skmem_obj **blist_nocahce_large)
2279 {
2280 	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);
2281 
2282 	ASSERT(SK_PTR_TAG(kqum) == 0);
2283 
2284 	switch (pp->pp_md_type) {
2285 	case NEXUS_META_TYPE_PACKET: {
2286 		struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);
2287 
2288 		if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
2289 			__packet_perform_tx_completion_callbacks(
2290 				SK_PKT2PH(kpkt), NULL);
2291 		}
2292 		if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
2293 			ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
2294 			ASSERT(kpkt->pkt_mbuf != NULL);
2295 			ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
2296 			if (mp != NULL) {
2297 				ASSERT(*mp == NULL);
2298 				*mp = kpkt->pkt_mbuf;
2299 			} else {
2300 				m_freem(kpkt->pkt_mbuf);
2301 			}
2302 			KPKT_CLEAR_MBUF_DATA(kpkt);
2303 		} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
2304 			ASSERT(kpkt->pkt_pkt != NULL);
2305 			ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
2306 			if (kpp != NULL) {
2307 				ASSERT(*kpp == NULL);
2308 				*kpp = kpkt->pkt_pkt;
2309 			} else {
2310 				/* can only recurse once */
2311 				ASSERT((kpkt->pkt_pkt->pkt_pflags &
2312 				    PKT_F_PKT_DATA) == 0);
2313 				pp_free_packet_single(kpkt->pkt_pkt);
2314 			}
2315 			KPKT_CLEAR_PKT_DATA(kpkt);
2316 		}
2317 		kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
2318 		ASSERT(kpkt->pkt_nextpkt == NULL);
2319 		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
2320 		ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
2321 		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
2322 		break;
2323 	}
2324 	default:
2325 		break;
2326 	}
2327 
2328 	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
2329 		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
2330 		    blist_large, blist_nocahce_large);
2331 	}
2332 	return mdp;
2333 }
2334 
/*
 * Batch-frees a chain of packets (linked via pkt_nextpkt), all of
 * which must belong to the same pool as the chain head.  Attached
 * mbufs and attached packets collected during teardown are freed in
 * bulk afterwards (attached-packet chains recurse exactly once).  When
 * npkt is non-NULL it receives the number of packets freed.
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		/* every packet in the chain must come from the same pool */
		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the preamble to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance the mbuf/packet tails when fini attached one */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* recurse (once) to free the attached-packet chain */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2398 
2399 void
pp_free_pktq(struct pktq * pktq)2400 pp_free_pktq(struct pktq *pktq)
2401 {
2402 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2403 		return;
2404 	}
2405 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2406 	pp_free_packet_chain(pkt, NULL);
2407 	KPKTQ_DISPOSE(pktq);
2408 }
2409 
2410 void
pp_drop_pktq(struct pktq * pktq,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2411 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2412     drop_reason_t reason, const char *funcname, uint16_t linenum)
2413 {
2414 	drop_func_t dropfunc;
2415 	struct __kern_packet *kpkt;
2416 
2417 	if (KPKTQ_EMPTY(pktq)) {
2418 		return;
2419 	}
2420 	if (__probable(droptap_total_tap_count == 0)) {
2421 		goto nodroptap;
2422 	}
2423 
2424 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2425 		dropfunc = droptap_output_packet;
2426 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2427 		dropfunc = droptap_input_packet;
2428 	} else {
2429 		goto nodroptap;
2430 	}
2431 
2432 	KPKTQ_FOREACH(kpkt, pktq) {
2433 		dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2434 		    kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2435 	}
2436 
2437 nodroptap:
2438 	pp_free_pktq(pktq);
2439 }
2440 
/*
 * Batch-frees num packets from array (zeroing each slot), all of which
 * must belong to pp.  Attached mbufs and attached packets collected
 * during teardown are freed in bulk afterwards (attached-packet chains
 * go through pp_free_packet_chain, which recurses once).
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp = NULL;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		/* every packet in the array must come from this pool */
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* append the preamble to the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		array[i] = 0;

		/* advance the mbuf/packet tails when fini attached one */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* free the attached-packet chain collected above */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2498 
2499 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2500 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2501 {
2502 	pp_free_packet_array(pp, &kqum, 1);
2503 }
2504 
2505 void
pp_free_packet_batch(const kern_pbufpool_t pp,uint64_t * __counted_by (size)array,uint32_t size)2506 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
2507 {
2508 	pp_free_packet_array(pp, array, size);
2509 }
2510 
2511 void
pp_free_packet_single(struct __kern_packet * pkt)2512 pp_free_packet_single(struct __kern_packet *pkt)
2513 {
2514 	ASSERT(pkt->pkt_nextpkt == NULL);
2515 	pp_free_packet(__DECONST(struct kern_pbufpool *,
2516 	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2517 }
2518 
/*
 * Reports a single packet to droptap (when a tap is active and a
 * direction flag is set) and frees it.
 *
 * NOTE(review): a packet with pkt_length == 0 returns early WITHOUT
 * being freed — confirm callers never rely on this path to release
 * zero-length packets, otherwise this leaks.
 */
void
pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
    drop_reason_t reason, const char *funcname, uint16_t linenum)
{
	drop_func_t dropfunc;

	if (pkt->pkt_length == 0) {
		return;
	}
	/* fast path: no droptap consumers registered */
	if (__probable(droptap_total_tap_count == 0)) {
		goto nodroptap;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		dropfunc = droptap_output_packet;
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		dropfunc = droptap_input_packet;
	} else {
		goto nodroptap;
	}

	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);

nodroptap:
	pp_free_packet_single(pkt);
}
2546 
/*
 * Allocates one raw buffer from the pool's default or large buffer
 * cache and fills *oi with its object info.  Returns the buffer
 * address, or 0 on failure.  On DEVELOPMENT/DEBUG kernels, non-blocking
 * allocations are subject to MTBF fault injection.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	/*
	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
	 * other (safe) pointer type, because IOSkywalkFamily depends on this
	 * being mach_vm_address_t
	 */
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		if (baddr != 0) {
			/* inject a failure: give the buffer right back */
			skmem_cache_free(skm,
			    __unsafe_forge_single(struct skmem_obj *, baddr));
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* look up and sanity-check the object info for the new buffer */
	skmem_cache_get_obj_info(skm,
	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2592 
2593 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2594 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2595     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2596 {
2597 	struct skmem_obj_info oib;
2598 
2599 	VERIFY(pp != NULL && baddr != NULL);
2600 	VERIFY((seg != NULL) == (idx != NULL));
2601 
2602 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2603 		return ENOTSUP;
2604 	}
2605 
2606 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2607 	if (__improbable(*baddr == 0)) {
2608 		return ENOMEM;
2609 	}
2610 
2611 	if (seg != NULL) {
2612 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2613 		*seg = SKMEM_OBJ_SEG(&oib);
2614 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2615 	}
2616 	return 0;
2617 }
2618 
2619 void
pp_free_buffer(const kern_pbufpool_t pp,mach_vm_address_t addr)2620 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2621 {
2622 	ASSERT(pp != NULL && addr != 0);
2623 	skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
2624 		    struct skmem_obj *, addr));
2625 }
2626 
/*
 * pp_alloc_buflet_common: batch-allocate up to 'num' external buflets from
 * the pool's default or large buflet cache, storing their addresses into
 * 'array'.  Returns the number actually allocated (may be fewer than 'num',
 * or 0, on cache exhaustion).
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp,
    uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
    bool large)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *__single list;
	uint64_t *array_cp;  /* -fbounds-safety */

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* a "large" request is only valid if the pool has a large buf size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);

	/* batch alloc returns a singly-linked list of raw cache objects */
	if (large) {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
	} else {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
	}

	/* walk the list: unlink, initialize, and record each buflet */
	array_cp = array;
	while (list != NULL) {
		struct skmem_obj *listn;

		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;

#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
		/* Checking to ensure the object address is tagged */
		ASSERT((vm_offset_t)kbft !=
		    vm_memtag_canonicalize_address((vm_offset_t)kbft));
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

		KBUF_EXT_INIT(kbft, pp);
		*array_cp = (uint64_t)kbft;
		++array_cp;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	/* list length must agree with the cache's reported count */
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2675 
2676 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,bool large)2677 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2678     bool large)
2679 {
2680 	uint64_t bft;
2681 
2682 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2683 		return ENOMEM;
2684 	}
2685 	*kbft = __unsafe_forge_single(kern_buflet_t, bft);
2686 	return 0;
2687 }
2688 
2689 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * __counted_by (* size)array,uint32_t * size,uint32_t skmflag,bool large)2690 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2691     uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2692     bool large)
2693 {
2694 	uint32_t i, n;
2695 	int err;
2696 
2697 	ASSERT(array != NULL && size > 0);
2698 
2699 	n = *size;
2700 	i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2701 	*size = i;
2702 
2703 	if (__probable(i == n)) {
2704 		err = 0;
2705 	} else if (i != 0) {
2706 		err = EAGAIN;
2707 	} else {
2708 		err = ENOMEM;
2709 	}
2710 
2711 	return err;
2712 }
2713 
/*
 * pp_free_buflet_common: release a buflet back to its pool.
 *
 * Two cases:
 *  - BUFLET_FLAG_EXTERNAL: an externally-allocated buflet object; freed to
 *    the default/large buflet cache.  If the underlying buffer is still
 *    shared (bc_usecnt > 1), use the nocache path so the buffer itself is
 *    not recycled out from under the other reference holders.
 *  - otherwise: an embedded buflet; run the buffer destructor and free the
 *    backing buffer object only when the last use count drops to zero.
 *
 * The buflet must already be unlinked (no next-buflet index/address).
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be linked on a user packet pool list */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		if (kbft->buf_ctl->bc_usecnt > 1) {
			/* buffer still shared: bypass the object cache */
			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		} else {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		}
	} else if (__probable(kbft->buf_addr != 0)) {
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* detach the buffer; usecnt receives the remaining refs */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		if (__probable(usecnt == 0)) {
			/* last reference: return the buffer to its cache */
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2753 
2754 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2755 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2756 {
2757 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2758 	ASSERT(pp != NULL && kbft != NULL);
2759 	pp_free_buflet_common(pp, kbft);
2760 }
2761 
2762 void
pp_reap_caches(boolean_t purge)2763 pp_reap_caches(boolean_t purge)
2764 {
2765 	skmem_cache_reap_now(pp_opt_cache, purge);
2766 	skmem_cache_reap_now(pp_flow_cache, purge);
2767 	skmem_cache_reap_now(pp_compl_cache, purge);
2768 }
2769