xref: /xnu-12377.1.9/bsd/skywalk/packet/pbufpool.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33 #include <kern/uipc_domain.h>
34 
35 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
36 static void pp_free(struct kern_pbufpool *);
37 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
38     uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
39     const void *, uint32_t);
40 static void pp_free_packet_array(struct kern_pbufpool *,
41     uint64_t *__counted_by(num)array, uint32_t num);
42 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
43     struct skmem_obj_info *, void *, uint32_t);
44 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
45     struct skmem_obj_info *, void *, uint32_t);
46 static void pp_metadata_dtor(void *, void *);
47 static int pp_metadata_construct(struct __kern_quantum *,
48     struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
49     uint16_t, bool, struct skmem_obj **);
50 static void pp_metadata_destruct(struct __kern_quantum *,
51     struct kern_pbufpool *, bool);
52 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
53     struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
54 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
55     struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
56     struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
57 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
58 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
60 static void pp_destroy_upp_locked(struct kern_pbufpool *);
61 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
62 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
63 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
64 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
65     struct skmem_obj_info *oi, uint32_t skmflag, bool large);
66 static inline uint32_t
67 pp_alloc_buflet_common(struct kern_pbufpool *pp,
68     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
69     bool large);
70 
71 #define KERN_PBUFPOOL_U_HASH_SIZE       64      /* hash table size */
72 
73 #define KERN_BUF_MIN_STRIDING_SIZE      (32 * 1024)
74 static uint32_t kern_buf_min_striding_size = KERN_BUF_MIN_STRIDING_SIZE;
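/*
 * The striding threshold is tunable at boot via the
 * "sk_pp_min_striding_size" boot-arg; see PE_parse_boot_argn() in
 * pp_init() below.
 */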
75 
76 /*
77  * Since the inputs are small (indices to the metadata region), we can use
78  * Knuth's multiplicative hash method which is fast and good enough.  Here
79  * we multiply the input by 2^32 times the inverse golden ratio.  See "The Art of
80  * Computer Programming", section 6.4.
81  */
82 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m)                      \
83 	(((_i) * 2654435761U) & (_m))
84 #define KERN_PBUFPOOL_U_HASH(_pp, _i)                           \
85 	(&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
86 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
87 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i)                           \
88 	(&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
89 	KERN_PBUFPOOL_U_HASH_SIZE - 1)])
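/*
 * Worked example: metadata index 100 maps to bucket
 * ((100 * 2654435761) mod 2^32) & 63 = 3450571044 & 63 = 36, i.e.
 * KERN_PBUFPOOL_U_HASH(pp, 100) yields &pp->pp_u_hash_table[36].
 */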
90 
91 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
92 
93 #define SKMEM_TAG_PBUFPOOL_HASH  "com.apple.skywalk.pbufpool.hash"
94 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
95 
96 #define SKMEM_TAG_PBUFPOOL_BFT_HASH  "com.apple.skywalk.pbufpool.bft.hash"
97 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
98 
99 
100 struct kern_pbufpool_u_htbl {
101 	struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
102 };
103 
104 #define PP_U_HTBL_SIZE  sizeof(struct kern_pbufpool_u_htbl)
105 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
106 
107 static struct skmem_cache *pp_opt_cache;        /* cache for __packet_opt */
108 static struct skmem_cache *pp_flow_cache;       /* cache for __flow */
109 static struct skmem_cache *pp_compl_cache;      /* cache for __packet_compl */
110 
111 static int __pp_inited = 0;
112 
113 int
114 pp_init(void)
115 {
116 	static_assert(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
117 	static_assert(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
118 	static_assert(KPKT_SC_BK == MBUF_SC_BK);
119 	static_assert(KPKT_SC_BE == MBUF_SC_BE);
120 	static_assert(KPKT_SC_RD == MBUF_SC_RD);
121 	static_assert(KPKT_SC_OAM == MBUF_SC_OAM);
122 	static_assert(KPKT_SC_AV == MBUF_SC_AV);
123 	static_assert(KPKT_SC_RV == MBUF_SC_RV);
124 	static_assert(KPKT_SC_VI == MBUF_SC_VI);
125 	static_assert(KPKT_SC_SIG == MBUF_SC_SIG);
126 	static_assert(KPKT_SC_VO == MBUF_SC_VO);
127 	static_assert(KPKT_SC_CTL == MBUF_SC_CTL);
128 
129 	static_assert(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
130 	static_assert(KPKT_SC_BK == PKT_SC_BK);
131 	static_assert(KPKT_SC_BE == PKT_SC_BE);
132 	static_assert(KPKT_SC_RD == PKT_SC_RD);
133 	static_assert(KPKT_SC_OAM == PKT_SC_OAM);
134 	static_assert(KPKT_SC_AV == PKT_SC_AV);
135 	static_assert(KPKT_SC_RV == PKT_SC_RV);
136 	static_assert(KPKT_SC_VI == PKT_SC_VI);
137 	static_assert(KPKT_SC_SIG == PKT_SC_SIG);
138 	static_assert(KPKT_SC_VO == PKT_SC_VO);
139 	static_assert(KPKT_SC_CTL == PKT_SC_CTL);
140 	static_assert(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
141 
142 	static_assert(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
143 	static_assert(KPKT_TC_BE == MBUF_TC_BE);
144 	static_assert(KPKT_TC_BK == MBUF_TC_BK);
145 	static_assert(KPKT_TC_VI == MBUF_TC_VI);
146 	static_assert(KPKT_TC_VO == MBUF_TC_VO);
147 	static_assert(KPKT_TC_MAX == MBUF_TC_MAX);
148 
149 	static_assert(KPKT_TC_BE == PKT_TC_BE);
150 	static_assert(KPKT_TC_BK == PKT_TC_BK);
151 	static_assert(KPKT_TC_VI == PKT_TC_VI);
152 	static_assert(KPKT_TC_VO == PKT_TC_VO);
153 
154 	static_assert(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
155 	static_assert(PKT_SCVAL_BK == SCVAL_BK);
156 	static_assert(PKT_SCVAL_BE == SCVAL_BE);
157 	static_assert(PKT_SCVAL_RD == SCVAL_RD);
158 	static_assert(PKT_SCVAL_OAM == SCVAL_OAM);
159 	static_assert(PKT_SCVAL_AV == SCVAL_AV);
160 	static_assert(PKT_SCVAL_RV == SCVAL_RV);
161 	static_assert(PKT_SCVAL_VI == SCVAL_VI);
162 	static_assert(PKT_SCVAL_VO == SCVAL_VO);
163 	static_assert(PKT_SCVAL_CTL == SCVAL_CTL);
164 
165 	/*
166 	 * Assert that the value of common packet flags between mbuf and
167 	 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
168 	 */
169 	static_assert(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
170 	static_assert(PKT_F_REALTIME == PKTF_SO_REALTIME);
171 	static_assert(PKT_F_REXMT == PKTF_TCP_REXMT);
172 	static_assert(PKT_F_LAST_PKT == PKTF_LAST_PKT);
173 	static_assert(PKT_F_FLOW_ID == PKTF_FLOW_ID);
174 	static_assert(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
175 	static_assert(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
176 	static_assert(PKT_F_TS_VALID == PKTF_TS_VALID);
177 	static_assert(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
178 	static_assert(PKT_F_START_SEQ == PKTF_START_SEQ);
179 	static_assert(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
180 	static_assert(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
181 	static_assert(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV | PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW | PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
182 	/*
183 	 * Assert packet flags shared with userland.
184 	 */
185 	static_assert(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC | PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S | PKT_F_ULPN));
186 
187 	static_assert(offsetof(struct __kern_quantum, qum_len) == offsetof(struct __kern_packet, pkt_length));
188 
189 	/*
190 	 * Due to the use of tagged pointers, we need the size of
191 	 * the metadata preamble structure to be a multiple of 16.
192 	 * See SK_PTR_TAG() definition for details.
193 	 */
194 	static_assert(sizeof(struct __metadata_preamble) != 0 && (sizeof(struct __metadata_preamble) % 16) == 0);
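	/*
	 * Illustrative sketch only (see SK_PTR_TAG() for the actual
	 * definition): with object sizes that are multiples of 16,
	 * metadata addresses stay 16-byte aligned, leaving the low 4
	 * bits of the pointer free to carry a tag, e.g.:
	 *
	 *	tagged = (addr & ~(uint64_t)0xf) | (tag & 0xf);
	 */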
195 
196 	static_assert(NX_PBUF_FRAGS_MIN == 1 && NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
197 
198 	/*
199 	 * Batch alloc/free requires linking the objects together;
200 	 * make sure that the fields are at the same offset since
201 	 * we cast the object to struct skmem_obj.
202 	 */
203 	static_assert(offsetof(struct __metadata_preamble, _mdp_next) == offsetof(struct skmem_obj, mo_next));
204 	static_assert(offsetof(struct __buflet, __buflet_next) == offsetof(struct skmem_obj, mo_next));
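	/*
	 * With the offsets asserted equal above, a freelist of either
	 * metadata preambles or buflets can be walked generically as a
	 * chain of struct skmem_obj, e.g.:
	 *
	 *	for (obj = blist; obj != NULL; obj = obj->mo_next)
	 *		...;
	 */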
205 
206 	SK_LOCK_ASSERT_HELD();
207 	ASSERT(!__pp_inited);
208 
209 	pp_opt_cache = skmem_cache_create("pkt.opt",
210 	    sizeof(struct __packet_opt), sizeof(uint64_t),
211 	    NULL, NULL, NULL, NULL, NULL, 0);
212 	pp_flow_cache = skmem_cache_create("pkt.flow",
213 	    sizeof(struct __flow), 16,  /* 16-bytes aligned */
214 	    NULL, NULL, NULL, NULL, NULL, 0);
215 	pp_compl_cache = skmem_cache_create("pkt.compl",
216 	    sizeof(struct __packet_compl), sizeof(uint64_t),
217 	    NULL, NULL, NULL, NULL, NULL, 0);
218 
219 	PE_parse_boot_argn("sk_pp_min_striding_size", &kern_buf_min_striding_size,
220 	    sizeof(kern_buf_min_striding_size));
221 
222 	return 0;
223 }
224 
225 void
226 pp_fini(void)
227 {
228 	SK_LOCK_ASSERT_HELD();
229 
230 	if (__pp_inited) {
231 		if (pp_compl_cache != NULL) {
232 			skmem_cache_destroy(pp_compl_cache);
233 			pp_compl_cache = NULL;
234 		}
235 		if (pp_flow_cache != NULL) {
236 			skmem_cache_destroy(pp_flow_cache);
237 			pp_flow_cache = NULL;
238 		}
239 		if (pp_opt_cache != NULL) {
240 			skmem_cache_destroy(pp_opt_cache);
241 			pp_opt_cache = NULL;
242 		}
243 
244 		__pp_inited = 0;
245 	}
246 }
247 
248 static struct kern_pbufpool *
249 pp_alloc(zalloc_flags_t how)
250 {
251 	struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
252 
253 	if (pp) {
254 		lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
255 	}
256 	return pp;
257 }
258 
259 static void
260 pp_free(struct kern_pbufpool *pp)
261 {
262 	PP_LOCK_ASSERT_HELD(pp);
263 
264 	pp_destroy(pp);
265 	PP_UNLOCK(pp);
266 
267 	SK_DF(SK_VERB_MEM, "pp %p FREE", SK_KVA(pp));
268 	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
269 	zfree(pp_zone, pp);
270 }
271 
272 void
273 pp_retain_locked(struct kern_pbufpool *pp)
274 {
275 	PP_LOCK_ASSERT_HELD(pp);
276 
277 	pp->pp_refcnt++;
278 	ASSERT(pp->pp_refcnt != 0);
279 }
280 
281 void
282 pp_retain(struct kern_pbufpool *pp)
283 {
284 	PP_LOCK(pp);
285 	pp_retain_locked(pp);
286 	PP_UNLOCK(pp);
287 }
288 
289 boolean_t
290 pp_release_locked(struct kern_pbufpool *pp)
291 {
292 	uint32_t oldref = pp->pp_refcnt;
293 
294 	PP_LOCK_ASSERT_HELD(pp);
295 
296 	ASSERT(pp->pp_refcnt != 0);
297 	if (--pp->pp_refcnt == 0) {
298 		pp_free(pp);
299 	}
300 
301 	return oldref == 1;
302 }
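/*
 * Note: a TRUE return means the last reference was just dropped, in
 * which case pp_free() above has already destroyed pp_lock and freed
 * the pool; this is why pp_release() and pp_close() unlock only when
 * this returns FALSE.
 */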
303 
304 boolean_t
305 pp_release(struct kern_pbufpool *pp)
306 {
307 	boolean_t lastref;
308 
309 	PP_LOCK(pp);
310 	if (!(lastref = pp_release_locked(pp))) {
311 		PP_UNLOCK(pp);
312 	}
313 
314 	return lastref;
315 }
316 
317 void
318 pp_close(struct kern_pbufpool *pp)
319 {
320 	PP_LOCK(pp);
321 	ASSERT(pp->pp_refcnt > 0);
322 	ASSERT(!(pp->pp_flags & PPF_CLOSED));
323 	pp->pp_flags |= PPF_CLOSED;
324 	if (!pp_release_locked(pp)) {
325 		PP_UNLOCK(pp);
326 	}
327 }
328 
329 /*
330  * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
331  * as the size of srp_array. This is the same as marking it __counted_by(SKMEM_REGIONS).
332  */
333 void
334 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
335     nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
336     uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
337     uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
338 {
339 	struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
340 	    *lbuf_srp;
341 	uint32_t md_size = 0;
342 	bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
343 	bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
344 	bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
345 	bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
346 	bool md_magazine_enable = ((flags &
347 	    PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
348 
349 	ASSERT(max_frags != 0);
350 
351 	md_size = NX_METADATA_PACKET_SZ(max_frags);
352 
353 	switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
354 	case PP_REGION_CONFIG_BUF_IODIR_IN:
355 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
356 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
357 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
358 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
359 		break;
360 	case PP_REGION_CONFIG_BUF_IODIR_OUT:
361 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
362 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
363 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
364 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
365 		break;
366 	case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
367 	default:
368 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
369 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
370 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
371 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
372 		break;
373 	}
374 
375 	/* add preamble size to metadata obj size */
376 	md_size += METADATA_PREAMBLE_SZ;
377 	ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
378 
379 	/* configure kernel metadata region */
380 	kmd_srp->srp_md_type = md_type;
381 	kmd_srp->srp_md_subtype = md_subtype;
382 	kmd_srp->srp_r_obj_cnt = md_cnt;
383 	kmd_srp->srp_r_obj_size = md_size;
384 	kmd_srp->srp_max_frags = max_frags;
385 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
386 	if (md_persistent) {
387 		kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
388 	}
389 	ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
390 	if (md_magazine_enable) {
391 		kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
392 	}
393 	skmem_region_params_config(kmd_srp);
394 
395 	/* Sanity check for memtag */
396 	ASSERT(kmd_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
397 
398 	/* configure user metadata region */
399 	srp = &srp_array[SKMEM_REGION_UMD];
400 	if (!kernel_only) {
401 		srp->srp_md_type = kmd_srp->srp_md_type;
402 		srp->srp_md_subtype = kmd_srp->srp_md_subtype;
403 		srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
404 		srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
405 		srp->srp_max_frags = kmd_srp->srp_max_frags;
406 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
407 		if (md_persistent) {
408 			srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
409 		}
410 		/*
411 		 * UMD is a mirrored region and object allocation operations
412 		 * are performed on the KMD objects.
413 		 */
414 		ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
415 		skmem_region_params_config(srp);
416 		ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
417 	} else {
418 		ASSERT(srp->srp_r_obj_cnt == 0);
419 		ASSERT(srp->srp_r_obj_size == 0);
420 	}
421 
422 	/* configure buffer region */
423 	buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
424 	buf_srp->srp_r_obj_size = buf_size;
425 	buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
426 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
427 	if (buf_persistent) {
428 		buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
429 	}
430 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
431 	if (buf_srp->srp_r_obj_size >= kern_buf_min_striding_size) {
432 		/*
433 		 * A buffer size larger than 32K indicates striding is in use, which
434 		 * means a buffer could be detached from a buflet. In this case, magzine
435 		 * layer should be enabled.
436 		 */
437 		buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
438 	}
439 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
440 	if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
441 		buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
442 	}
443 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
444 	if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
445 		buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
446 	}
447 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
448 	if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
449 		buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
450 	}
451 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
452 	if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
453 		buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
454 	}
455 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
456 	if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
457 		buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
458 	}
459 	ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
460 	if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
461 		buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
462 	}
463 	if (buf_seg_size != 0) {
464 		buf_srp->srp_r_seg_size = buf_seg_size;
465 	}
466 	skmem_region_params_config(buf_srp);
467 
468 	/* configure large buffer region */
469 	if (large_buf_size != 0) {
470 		lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
471 		lbuf_srp->srp_r_obj_size = large_buf_size;
472 		lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
473 		lbuf_srp->srp_cflags = buf_srp->srp_cflags;
474 		skmem_region_params_config(lbuf_srp);
475 	}
476 
477 	/* configure kernel buflet region */
478 	if (config_buflet) {
479 		/*
480 		 * Ideally we want the number of buflets to be
481 		 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
482 		 * so that there are enough buflets when multi-buflet and
483 		 * shared buffer objects are used together.
484 		 * Currently multi-buflet is used only by the user pool,
485 		 * which doesn't support shared buffer objects; hence, to
486 		 * reduce the number of objects, we restrict the number of
487 		 * buflets to the number of buffers.
488 		 */
489 		kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
490 		    lbuf_srp->srp_c_obj_cnt;
491 		kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
492 		    sizeof(struct __user_buflet));
493 		kbft_srp->srp_cflags = kmd_srp->srp_cflags;
494 		skmem_region_params_config(kbft_srp);
495 		ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
496 		    lbuf_srp->srp_c_obj_cnt);
497 		/* Sanity check for memtag */
498 		ASSERT(kbft_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
499 	} else {
500 		ASSERT(kbft_srp->srp_r_obj_cnt == 0);
501 		ASSERT(kbft_srp->srp_r_obj_size == 0);
502 	}
503 
504 	/* configure user buflet region */
505 	srp = &srp_array[SKMEM_REGION_UBFT];
506 	if (config_buflet && !kernel_only) {
507 		srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
508 		srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
509 		srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
510 		skmem_region_params_config(srp);
511 		ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
512 	} else {
513 		ASSERT(srp->srp_r_obj_cnt == 0);
514 		ASSERT(srp->srp_r_obj_size == 0);
515 	}
516 
517 	/* make sure each metadata object can be paired with a buffer */
518 	ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
519 }
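/*
 * Hypothetical caller sketch (illustration only; the argument values
 * are assumptions, not taken from an actual caller): a kernel-only,
 * RX-direction pool with on-demand buflets and no large buffers could
 * be configured as
 *
 *	pp_regions_params_adjust(srp, NEXUS_META_TYPE_PACKET,
 *	    md_subtype, md_cnt, max_frags, buf_size,
 *	    0, buf_cnt, 0,
 *	    PP_REGION_CONFIG_KERNEL_ONLY | PP_REGION_CONFIG_BUFLET |
 *	    PP_REGION_CONFIG_BUF_IODIR_IN);
 *
 * This selects the SKMEM_REGION_RX{KMD,BUF_DEF,BUF_LARGE,KBFT} slots
 * of srp_array; large_buf_size == 0 skips the large-buffer region and
 * buf_seg_size == 0 keeps the default segment size.
 */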
520 
521 SK_NO_INLINE_ATTRIBUTE
522 static int
523 pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
524     obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
525     bool raw, struct skmem_obj **blist)
526 {
527 	struct __kern_buflet *kbuf;
528 	mach_vm_address_t baddr = 0;
529 	uint16_t *pbufs_cnt, *pbufs_max;
530 	uint16_t i;
531 
532 	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));
533 
534 	/* construct {user,kernel} metadata */
535 	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
536 	struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
537 	struct __packet_opt *__single opt;
538 	struct __flow *__single flow;
539 	struct __packet_compl *__single compl;
540 	uint64_t pflags;
541 
542 	if (raw) {
543 		opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
544 		flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
545 		compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
546 		pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
547 		    PKT_F_TX_COMPL_ALLOC);
548 	} else {
549 		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
550 		    kpkt->pkt_com_opt != NULL);
551 		opt = kpkt->pkt_com_opt;
552 		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
553 		    kpkt->pkt_flow != NULL);
554 		flow = kpkt->pkt_flow;
555 		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
556 		    kpkt->pkt_tx_compl != NULL);
557 		compl = kpkt->pkt_tx_compl;
558 		pflags = kpkt->pkt_pflags;
559 	}
560 	/* will be adjusted below as part of allocating buffer(s) */
561 	static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
562 	static_assert(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
563 	pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
564 	pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);
565 
566 	/* kernel (and user) packet */
567 	KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
568 	    upkt, pp, 0, pp->pp_max_frags, 0);
569 
570 	kbuf = kqum->qum_buf;
571 	for (i = 0; i < bufcnt; i++) {
572 		struct skmem_obj_info oib;
573 
574 		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
575 			ASSERT(i == 0);
576 			ASSERT(*blist == NULL);
577 			/*
578 			 * the quantum has a native buflet, so we only need to
579 			 * allocate a buffer and attach it to the buflet.
580 			 */
581 			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
582 			    false);
583 			if (__improbable(baddr == 0)) {
584 				goto fail;
585 			}
586 			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
587 			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
588 			baddr = 0;
589 		} else {
590 			/*
591 			 * we use pre-constructed buflets with attached buffers.
592 			 */
593 			struct __kern_buflet *pkbuf = kbuf;
594 			struct skmem_obj *blistn;
595 
596 			ASSERT(pkbuf != NULL);
597 			kbuf = (kern_buflet_t)*blist;
598 			if (__improbable(kbuf == NULL)) {
599 				SK_DF(SK_VERB_MEM, "failed to get buflet,"
600 				    " pp %p", SK_KVA(pp));
601 				goto fail;
602 			}
603 
604 
605 			blistn = (*blist)->mo_next;
606 			(*blist)->mo_next = NULL;
607 
608 			KBUF_EXT_INIT(kbuf, pp);
609 			KBUF_LINK(pkbuf, kbuf);
610 			*blist = blistn;
611 		}
612 
613 		/* adjust buffer count accordingly */
614 		if (__probable(pbufs_cnt != NULL)) {
615 			*pbufs_cnt += 1;
616 			ASSERT(*pbufs_cnt <= *pbufs_max);
617 		}
618 	}
619 
620 	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
621 	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
622 	SK_DF(SK_VERB_MEM, "pp %p pkt %p bufcnt %d buf %p",
623 	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
624 	return 0;
625 
626 fail:
627 	ASSERT(bufcnt != 0 && baddr == 0);
628 	pp_metadata_destruct(kqum, pp, raw);
629 	return ENOMEM;
630 }
631 
632 static int
633 pp_metadata_ctor_common(struct skmem_obj_info *oi0,
634     struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
635     bool no_buflet)
636 {
637 	struct skmem_obj_info _oi, _oim;
638 	struct skmem_obj_info *oi, *oim;
639 	struct __kern_quantum *kqum;
640 	struct __user_quantum *uqum;
641 	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
642 	struct skmem_obj *__single blist = NULL;
643 	int error;
644 
645 #if (DEVELOPMENT || DEBUG)
646 	uint64_t mtbf = skmem_region_get_mtbf();
647 	/*
648 	 * MTBF is applicable only for non-blocking allocations here.
649 	 */
650 	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
651 	    (skmflag & SKMEM_NOSLEEP))) {
652 		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
653 		net_update_uptime();
654 		return ENOMEM;
655 	}
656 #endif /* (DEVELOPMENT || DEBUG) */
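	/*
	 * (MTBF above is a mean-time-between-failures fault-injection
	 * knob from skmem_region_get_mtbf(): on DEVELOPMENT/DEBUG
	 * kernels it periodically fails non-blocking allocations so
	 * that callers' ENOMEM paths get exercised.)
	 */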
657 
658 	/*
659 	 * Note that oi0 and oim0 may be stored inside the object itself;
660 	 * if so, copy them to local variables before constructing.  We
661 	 * don't test against PPF_BATCH, as the allocator may allocate
662 	 * storage space differently depending on the number of objects.
663 	 */
664 	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
665 	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
666 	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
667 		oi = &_oi;
668 		*oi = *oi0;
669 		if (__probable(oim0 != NULL)) {
670 			oim = &_oim;
671 			*oim = *oim0;
672 		} else {
673 			oim = NULL;
674 		}
675 	} else {
676 		oi = oi0;
677 		oim = oim0;
678 	}
679 
680 	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
681 	    METADATA_PREAMBLE_SZ);
682 
683 	if (__probable(!PP_KERNEL_ONLY(pp))) {
684 		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
685 		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
686 		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
687 		    METADATA_PREAMBLE_SZ);
688 	} else {
689 		ASSERT(oim == NULL);
690 		uqum = NULL;
691 	}
692 
693 	if (oim != NULL) {
694 		/* initialize user metadata redzone */
695 		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
696 		mdp->mdp_redzone =
697 		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
698 		    __ch_umd_redzone_cookie;
699 	}
700 
701 	/* allocate (constructed) buflet(s) with buffer(s) attached */
702 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
703 		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
704 		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
705 	}
706 
707 	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
708 	    skmflag, bufcnt, TRUE, &blist);
709 	if (__improbable(blist != NULL)) {
710 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
711 		blist = NULL;
712 	}
713 	return error;
714 }
715 
716 static int
717 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
718     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
719 {
720 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
721 }
722 
723 static int
724 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
725     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
726 {
727 	return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
728 }
729 
730 __attribute__((always_inline))
731 static void
732 pp_metadata_destruct_common(struct __kern_quantum *kqum,
733     struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
734     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
735     struct skmem_obj **blist_nocache_large)
736 {
737 	struct __kern_buflet *kbuf, *nbuf;
738 	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
739 	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
740 	struct skmem_obj **pp_blist_def = &p_blist_def;
741 	struct skmem_obj **pp_blist_large = &p_blist_large;
742 	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
743 	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
744 	uint16_t bufcnt, i = 0;
745 	bool first_buflet_empty;
746 
747 	ASSERT(blist_def != NULL);
748 	ASSERT(blist_large != NULL);
749 
750 	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
751 
752 	ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
753 	ASSERT(kpkt->pkt_qum.qum_pp == pp);
754 	ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
755 	ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
756 	ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
757 	ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
758 	ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
759 	ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
760 	static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
761 	bufcnt = kpkt->pkt_bufs_cnt;
762 	kbuf = &kqum->qum_buf[0];
763 	/*
764 	 * special handling for empty first buflet.
765 	 */
766 	first_buflet_empty = (kbuf->buf_addr == 0);
767 	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
768 
769 	/*
770 	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
771 	 * unsafe, so we forge it here.
772 	 */
773 	nbuf = __unsafe_forge_single(struct __kern_buflet *,
774 	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
775 	BUF_NBFT_ADDR(kbuf, 0);
776 	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
777 	if (!first_buflet_empty) {
778 		pp_free_buflet_common(pp, kbuf);
779 		++i;
780 	}
781 
782 	while (nbuf != NULL) {
783 		ASSERT(nbuf->buf_ctl != NULL);
784 		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
785 			/*
786 			 * A bc_usecnt larger than 1 means the buffer has been cloned
787 			 * and is still in use by other buflets. In this case, when we
788 			 * free this buflet we must explicitly ask that it not be cached
789 			 * again in the magazine layer, to prevent immediate reuse of
790 			 * the buffer and data corruption.
791 			 */
792 			if (nbuf->buf_ctl->bc_usecnt > 1) {
793 				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
794 				pp_blist_nocache_large =
795 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
796 			} else {
797 				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
798 				pp_blist_large =
799 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
800 			}
801 		} else {
802 			if (nbuf->buf_ctl->bc_usecnt > 1) {
803 				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
804 				pp_blist_nocache_def =
805 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
806 			} else {
807 				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
808 				pp_blist_def =
809 				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
810 			}
811 		}
812 		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
813 		nbuf = __unsafe_forge_single(struct __kern_buflet *,
814 		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
815 		++i;
816 	}
817 
818 	ASSERT(i == bufcnt);
819 
820 	if (p_blist_def != NULL) {
821 		*pp_blist_def = *blist_def;
822 		*blist_def = p_blist_def;
823 	}
824 	if (p_blist_large != NULL) {
825 		*pp_blist_large = *blist_large;
826 		*blist_large = p_blist_large;
827 	}
828 	if (p_blist_nocache_def != NULL) {
829 		*pp_blist_nocache_def = *blist_nocache_def;
830 		*blist_nocache_def = p_blist_nocache_def;
831 	}
832 	if (p_blist_nocache_large != NULL) {
833 		*pp_blist_nocache_large = *blist_nocache_large;
834 		*blist_nocache_large = p_blist_nocache_large;
835 	}
836 
837 	/* if we're about to return this object to the slab, clean it up */
838 	if (raw) {
839 		ASSERT(kpkt->pkt_com_opt != NULL ||
840 		    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
841 		if (kpkt->pkt_com_opt != NULL) {
842 			ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
843 			skmem_cache_free(pp_opt_cache,
844 			    kpkt->pkt_com_opt);
845 			kpkt->pkt_com_opt = NULL;
846 		}
847 		ASSERT(kpkt->pkt_flow != NULL ||
848 		    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
849 		if (kpkt->pkt_flow != NULL) {
850 			ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
851 			skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
852 			kpkt->pkt_flow = NULL;
853 		}
854 		ASSERT(kpkt->pkt_tx_compl != NULL ||
855 		    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
856 		if (kpkt->pkt_tx_compl != NULL) {
857 			ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
858 			skmem_cache_free(pp_compl_cache,
859 			    kpkt->pkt_tx_compl);
860 			kpkt->pkt_tx_compl = NULL;
861 		}
862 		kpkt->pkt_pflags = 0;
863 	}
864 }
865 
866 __attribute__((always_inline))
867 static void
868 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
869     struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
870 {
871 	if (blist_def != NULL) {
872 		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
873 	}
874 	if (blist_large != NULL) {
875 		skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
876 	}
877 	if (blist_nocache_def != NULL) {
878 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
879 	}
880 	if (blist_nocache_large != NULL) {
881 		skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
882 	}
883 }
884 
885 __attribute__((always_inline))
886 static void
887 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
888     bool raw)
889 {
890 	struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
891 	struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
892 
893 	pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
894 	    &blist_large, &blist_nocache_large);
895 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
896 }
897 
898 static void
899 pp_metadata_dtor(void *addr, void *arg)
900 {
901 	pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
902 	    METADATA_PREAMBLE_SZ), arg, TRUE);
903 }
904 
905 static void
906 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
907 {
908 	struct kern_pbufpool *__single pp = arg;
909 
910 	if (pp->pp_pbuf_seg_ctor != NULL) {
911 		pp->pp_pbuf_seg_ctor(pp, sg, md);
912 	}
913 }
914 
915 static void
916 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
917 {
918 	struct kern_pbufpool *__single pp = arg;
919 
920 	if (pp->pp_pbuf_seg_dtor != NULL) {
921 		pp->pp_pbuf_seg_dtor(pp, sg, md);
922 	}
923 }
924 
925 static int
926 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
927     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
928 {
929 
930 	struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
931 	struct __kern_buflet *kbft;
932 	struct __user_buflet *ubft;
933 	struct skmem_obj_info oib;
934 	mach_vm_address_t baddr;
935 	obj_idx_t oi_idx_reg;
936 
937 	baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
938 	if (__improbable(baddr == 0)) {
939 		return ENOMEM;
940 	}
941 	/*
942 	 * Note that oi0 and oim0 may be stored inside the object itself;
943 	 * so copy what is required to local variables before constructing.
944 	 */
945 	oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
946 	kbft = SKMEM_OBJ_ADDR(oi0);
947 
948 	if (__probable(!PP_KERNEL_ONLY(pp))) {
949 		ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
950 		ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
951 		ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
952 		ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
953 		ubft = SKMEM_OBJ_ADDR(oim0);
954 	} else {
955 		ASSERT(oim0 == NULL);
956 		ubft = NULL;
957 	}
958 	KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
959 	    SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
960 	return 0;
961 }
962 
963 static int
964 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
965     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
966 {
967 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
968 }
969 
970 static int
971 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
972     struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
973 {
974 	return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
975 }
976 
977 static void
978 pp_buflet_metadata_dtor(void *addr, void *arg)
979 {
980 	struct __kern_buflet *__single kbft = addr;
981 	void *objaddr = kbft->buf_objaddr;
982 	struct kern_pbufpool *__single pp = arg;
983 	uint32_t usecnt = 0;
984 	bool large = BUFLET_HAS_LARGE_BUF(kbft);
985 
986 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
987 	/*
988 	 * don't assert (buf_nbft_addr == 0) here, as a constructed
989 	 * buflet may have this field set to non-zero. This is because
990 	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
991 	 * for chaining the buflets.
992 	 * To ensure that the freed buflet was not part of a chain, we
993 	 * assert (buf_nbft_idx == OBJ_IDX_NONE).
994 	 */
995 	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
996 	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
997 	    NULL);
998 	ASSERT(kbft->buf_addr != 0);
999 	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
1000 	ASSERT(kbft->buf_ctl != NULL);
1001 
1002 	KBUF_DTOR(kbft, usecnt);
1003 	SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u", SK_KVA(pp),
1004 	    SK_KVA(objaddr), usecnt);
1005 	if (__probable(usecnt == 0)) {
1006 		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
1007 		    PP_BUF_CACHE_DEF(pp), objaddr);
1008 	}
1009 }
1010 
1011 /*
1012  * -fbounds-safety: all callers of pp_create use srp_array with a known size:
1013  * SKMEM_REGIONS. This is the same as marking it __counted_by(SKMEM_REGIONS).
1014  */
1015 struct kern_pbufpool *
1016 pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
1017     pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
1018     const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
1019     pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
1020 {
1021 	struct kern_pbufpool *pp = NULL;
1022 	uint32_t md_size, def_buf_obj_size;
1023 	uint32_t def_buf_size, large_buf_size;
1024 	nexus_meta_type_t md_type;
1025 	nexus_meta_subtype_t md_subtype;
1026 	uint32_t md_cflags;
1027 	uint16_t max_frags;
1028 	uint32_t buf_def_cflags;
1029 	char cname[64];
1030 	const char *__null_terminated cache_name = NULL;
1031 	struct skmem_region_params *kmd_srp;
1032 	struct skmem_region_params *buf_srp;
1033 	struct skmem_region_params *kbft_srp;
1034 	struct skmem_region_params *umd_srp = NULL;
1035 	struct skmem_region_params *ubft_srp = NULL;
1036 	struct skmem_region_params *lbuf_srp = NULL;
1037 
1038 	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
1039 	ASSERT((buf_seg_ctor == NULL) ==
1040 	    (buf_seg_dtor == NULL));
1041 
1042 	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
1043 	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
1044 	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));
1045 
1046 	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
1047 		kmd_srp = &srp_array[SKMEM_REGION_KMD];
1048 		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
1049 		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
1050 		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
1051 	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
1052 		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
1053 		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
1054 		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
1055 		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
1056 	} else {
1057 		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
1058 		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
1059 		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
1060 		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
1061 		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
1062 	}
1063 
1064 	VERIFY(kmd_srp->srp_c_obj_size != 0);
1065 	VERIFY(buf_srp->srp_c_obj_cnt != 0);
1066 	VERIFY(buf_srp->srp_c_obj_size != 0);
1067 
1068 	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
1069 		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
1070 		VERIFY(kbft_srp->srp_c_obj_size != 0);
1071 	} else {
1072 		kbft_srp = NULL;
1073 	}
1074 
1075 	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
1076 		umd_srp = &srp_array[SKMEM_REGION_UMD];
1077 		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
1078 		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
1079 		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
1080 		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
1081 		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
1082 		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
1083 		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
1084 		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
1085 		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
1086 		if (kbft_srp != NULL) {
1087 			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
1088 			ASSERT(ubft_srp->srp_c_obj_size ==
1089 			    kbft_srp->srp_c_obj_size);
1090 			ASSERT(ubft_srp->srp_c_obj_cnt ==
1091 			    kbft_srp->srp_c_obj_cnt);
1092 			ASSERT(ubft_srp->srp_c_seg_size ==
1093 			    kbft_srp->srp_c_seg_size);
1094 			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
1095 		}
1096 	}
1097 
1098 	md_size = kmd_srp->srp_r_obj_size;
1099 	md_type = kmd_srp->srp_md_type;
1100 	md_subtype = kmd_srp->srp_md_subtype;
1101 	max_frags = kmd_srp->srp_max_frags;
1102 	def_buf_obj_size = buf_srp->srp_c_obj_size;
1103 	def_buf_size = def_buf_obj_size;
1104 	large_buf_size = lbuf_srp->srp_c_obj_size;
1105 
1106 #if (DEBUG || DEVELOPMENT)
1107 	ASSERT(def_buf_obj_size != 0);
1108 	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
1109 	    md_type <= NEXUS_META_TYPE_MAX);
1110 	ASSERT(max_frags >= 1);
1111 	ASSERT(md_type == NEXUS_META_TYPE_PACKET);
1112 	ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
1113 	    NX_METADATA_PACKET_SZ(max_frags)));
1114 	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
1115 	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
1116 #endif /* DEBUG || DEVELOPMENT */
1117 
1118 	pp = pp_alloc(Z_WAITOK);
1119 
1120 	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
1121 	    "skywalk.pp.%s", name);
1122 
1123 	pp->pp_ctx = __DECONST(void *, ctx);
1124 	pp->pp_ctx_retain = ctx_retain;
1125 	pp->pp_ctx_release = ctx_release;
1126 	if (pp->pp_ctx != NULL) {
1127 		pp->pp_ctx_retain(pp->pp_ctx);
1128 	}
1129 
1130 	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
1131 	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
1132 	PP_BUF_SIZE_DEF(pp) = def_buf_size;
1133 	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
1134 	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
1135 	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
1136 	pp->pp_md_type = md_type;
1137 	pp->pp_md_subtype = md_subtype;
1138 	pp->pp_max_frags = max_frags;
1139 	if (ppcreatef & PPCREATEF_EXTERNAL) {
1140 		pp->pp_flags |= PPF_EXTERNAL;
1141 	}
1142 	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
1143 		pp->pp_flags |= PPF_TRUNCATED_BUF;
1144 	}
1145 	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
1146 		pp->pp_flags |= PPF_KERNEL;
1147 	}
1148 	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
1149 		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
1150 	}
1151 	if (ppcreatef & PPCREATEF_DYNAMIC) {
1152 		pp->pp_flags |= PPF_DYNAMIC;
1153 	}
1154 	if (lbuf_srp->srp_c_obj_cnt > 0) {
1155 		ASSERT(lbuf_srp->srp_c_obj_size != 0);
1156 		pp->pp_flags |= PPF_LARGE_BUF;
1157 	}
1158 
1159 	pp_retain(pp);
1160 
1161 	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
1162 	    SKMEM_CR_NOMAGAZINES : 0);
1163 	md_cflags |= SKMEM_CR_BATCH;
1164 	pp->pp_flags |= PPF_BATCH;
1165 
1166 	if (pp->pp_flags & PPF_DYNAMIC) {
1167 		md_cflags |= SKMEM_CR_DYNAMIC;
1168 	}
1169 
1170 	if (umd_srp != NULL && (pp->pp_umd_region =
1171 	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
1172 		SK_ERR("\"%s\" (%p) failed to create %s region",
1173 		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
1174 		goto failed;
1175 	}
1176 
1177 	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
1178 	    NULL)) == NULL) {
1179 		SK_ERR("\"%s\" (%p) failed to create %s region",
1180 		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
1181 		goto failed;
1182 	}
1183 
1184 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1185 		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
1186 		if (!PP_KERNEL_ONLY(pp)) {
1187 			VERIFY((ubft_srp != NULL) &&
1188 			    (ubft_srp->srp_c_obj_cnt > 0));
1189 		}
1190 	}
1191 	/*
1192 	 * The magazines-layer and persistency attributes of the metadata
1193 	 * regions {KMD,KBFT,UBFT} must match.
1194 	 */
1195 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1196 		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
1197 		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
1198 		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
1199 		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
1200 	}
1201 
1202 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
1203 		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
1204 		    NULL, NULL, NULL)) == NULL) {
1205 			SK_ERR("\"%s\" (%p) failed to create %s region",
1206 			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
1207 			goto failed;
1208 		}
1209 	}
1210 
1211 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1212 		if ((pp->pp_kbft_region = skmem_region_create(name,
1213 		    kbft_srp, NULL, NULL, NULL)) == NULL) {
1214 			SK_ERR("\"%s\" (%p) failed to create %s region",
1215 			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
1216 			goto failed;
1217 		}
1218 	}
1219 
1220 	if (!PP_KERNEL_ONLY(pp)) {
1221 		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
1222 	}
1223 	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
1224 		ASSERT(pp->pp_kbft_region != NULL);
1225 		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
1226 	}
1227 
1228 	/*
1229 	 * Create the metadata cache; magazines layer is determined by caller.
1230 	 */
1231 	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
1232 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1233 		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
1234 		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
1235 		    pp->pp_kmd_region, md_cflags);
1236 	} else {
1237 		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
1238 		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
1239 		    pp->pp_kmd_region, md_cflags);
1240 	}
1241 
1242 	if (pp->pp_kmd_cache == NULL) {
1243 		SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1244 		    pp->pp_name, SK_KVA(pp), cname);
1245 		goto failed;
1246 	}
1247 
1248 	/*
1249 	 * Create the buflet metadata cache
1250 	 */
1251 	if (pp->pp_kbft_region != NULL) {
1252 		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
1253 		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
1254 		    kbft_srp->srp_c_obj_size, 0,
1255 		    pp_buflet_default_buffer_metadata_ctor,
1256 		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
1257 		    md_cflags);
1258 
1259 		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
1260 			SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1261 			    pp->pp_name, SK_KVA(pp), cname);
1262 			goto failed;
1263 		}
1264 
1265 		if (PP_HAS_LARGE_BUF(pp)) {
1266 			/* set the aggressive memory-reclaim flag on kbft_large for now */
1267 			md_cflags |= SKMEM_CR_RECLAIM;
1268 			cache_name = tsnprintf(cname, sizeof(cname),
1269 			    "kbft_large.%s", name);
1270 			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
1271 			    kbft_srp->srp_c_obj_size, 0,
1272 			    pp_buflet_large_buffer_metadata_ctor,
1273 			    pp_buflet_metadata_dtor,
1274 			    NULL, pp, pp->pp_kbft_region, md_cflags);
1275 
1276 			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
1277 				SK_ERR("\"%s\" (%p) failed to "
1278 				    "create \"%s\" cache", pp->pp_name,
1279 				    SK_KVA(pp), cname);
1280 				goto failed;
1281 			}
1282 		}
1283 	}
1284 
1285 	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
1286 	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
1287 		SK_ERR("\"%s\" (%p) failed to create %s region",
1288 		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
1289 		goto failed;
1290 	}
1291 
1292 	if (PP_HAS_LARGE_BUF(pp)) {
1293 		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
1294 		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
1295 		if (PP_BUF_REGION_LARGE(pp) == NULL) {
1296 			SK_ERR("\"%s\" (%p) failed to create %s region",
1297 			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
1298 			goto failed;
1299 		}
1300 	}
1301 
1302 	/*
1303 	 * Create the buffer object cache without the magazines layer.
1304 	 * We rely on caching the constructed metadata object instead.
1305 	 */
1306 	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
1307 	buf_def_cflags = (buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ? SKMEM_CR_NOMAGAZINES : 0;
1308 	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
1309 	    def_buf_obj_size,
1310 	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
1311 	    buf_def_cflags)) == NULL) {
1312 		SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1313 		    pp->pp_name, SK_KVA(pp), cname);
1314 		goto failed;
1315 	}
1316 
1317 	if (PP_BUF_REGION_LARGE(pp) != NULL) {
1318 		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
1319 		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
1320 		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
1321 		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
1322 			SK_ERR("\"%s\" (%p) failed to create \"%s\" cache",
1323 			    pp->pp_name, SK_KVA(pp), cname);
1324 			goto failed;
1325 		}
1326 	}
1327 
1328 	return pp;
1329 
1330 failed:
1331 	if (pp != NULL) {
1332 		if (pp->pp_ctx != NULL) {
1333 			pp->pp_ctx_release(pp->pp_ctx);
1334 			pp->pp_ctx = NULL;
1335 		}
1336 		pp_close(pp);
1337 	}
1338 
1339 	return NULL;
1340 }
1341 
1342 void
1343 pp_destroy(struct kern_pbufpool *pp)
1344 {
1345 	PP_LOCK_ASSERT_HELD(pp);
1346 
1347 	/* may be called for built-in pp with outstanding reference */
1348 	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);
1349 
1350 	pp_destroy_upp_locked(pp);
1351 
1352 	pp_destroy_upp_bft_locked(pp);
1353 
1354 	if (pp->pp_kmd_cache != NULL) {
1355 		skmem_cache_destroy(pp->pp_kmd_cache);
1356 		pp->pp_kmd_cache = NULL;
1357 	}
1358 
1359 	if (pp->pp_umd_region != NULL) {
1360 		skmem_region_release(pp->pp_umd_region);
1361 		pp->pp_umd_region = NULL;
1362 	}
1363 
1364 	if (pp->pp_kmd_region != NULL) {
1365 		skmem_region_release(pp->pp_kmd_region);
1366 		pp->pp_kmd_region = NULL;
1367 	}
1368 
1369 	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
1370 		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
1371 		PP_KBFT_CACHE_DEF(pp) = NULL;
1372 	}
1373 
1374 	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
1375 		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
1376 		PP_KBFT_CACHE_LARGE(pp) = NULL;
1377 	}
1378 
1379 	if (pp->pp_ubft_region != NULL) {
1380 		skmem_region_release(pp->pp_ubft_region);
1381 		pp->pp_ubft_region = NULL;
1382 	}
1383 
1384 	if (pp->pp_kbft_region != NULL) {
1385 		skmem_region_release(pp->pp_kbft_region);
1386 		pp->pp_kbft_region = NULL;
1387 	}
1388 
1389 	/*
1390 	 * The order is important here, since pp_metadata_dtor()
1391 	 * called by freeing on the pp_kmd_cache will in turn
1392 	 * free the attached buffer.  Therefore destroy the
1393 	 * buffer cache last.
1394 	 */
1395 	if (PP_BUF_CACHE_DEF(pp) != NULL) {
1396 		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
1397 		PP_BUF_CACHE_DEF(pp) = NULL;
1398 	}
1399 	if (PP_BUF_REGION_DEF(pp) != NULL) {
1400 		skmem_region_release(PP_BUF_REGION_DEF(pp));
1401 		PP_BUF_REGION_DEF(pp) = NULL;
1402 	}
1403 	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
1404 		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
1405 		PP_BUF_CACHE_LARGE(pp) = NULL;
1406 	}
1407 	if (PP_BUF_REGION_LARGE(pp) != NULL) {
1408 		skmem_region_release(PP_BUF_REGION_LARGE(pp));
1409 		PP_BUF_REGION_LARGE(pp) = NULL;
1410 	}
1411 
1412 	if (pp->pp_ctx != NULL) {
1413 		pp->pp_ctx_release(pp->pp_ctx);
1414 		pp->pp_ctx = NULL;
1415 	}
1416 }
1417 
1418 static int
1419 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1420 {
1421 	int i, err = 0;
1422 
1423 	if (pp->pp_u_hash_table != NULL) {
1424 		goto done;
1425 	}
1426 
1427 	/* allocated-address hash table */
1428 	/*
1429 	 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1430 	 * if we see any performance hit, we can check if this caused it.
1431 	 */
1432 	if (can_block) {
1433 		pp->pp_u_hash_table = sk_alloc_type_array(
1434 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1435 			Z_WAITOK, skmem_tag_pbufpool_hash);
1436 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1437 	} else {
1438 		pp->pp_u_hash_table = sk_alloc_type_array(
1439 			struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1440 			Z_NOWAIT, skmem_tag_pbufpool_hash);
1441 		pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1442 	}
1443 	if (pp->pp_u_hash_table == NULL) {
1444 		SK_ERR("failed to zalloc packet buffer pool upp hash table");
1445 		err = ENOMEM;
1446 		goto done;
1447 	}
1448 
1449 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1450 		SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1451 	}
1452 done:
1453 	return err;
1454 }
1455 
1456 static void
1457 pp_destroy_upp_locked(struct kern_pbufpool *pp)
1458 {
1459 	PP_LOCK_ASSERT_HELD(pp);
1460 	if (pp->pp_u_hash_table != NULL) {
1461 		/* purge anything that's left */
1462 		pp_purge_upp_locked(pp, -1);
1463 
1464 #if (DEBUG || DEVELOPMENT)
1465 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1466 			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
1467 		}
1468 #endif /* DEBUG || DEVELOPMENT */
1469 
1470 		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
1471 		    pp->pp_u_hash_table_size,
1472 		    pp->pp_u_hash_table);
1473 	}
1474 	ASSERT(pp->pp_u_bufinuse == 0);
1475 }
1476 
1477 int
1478 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1479 {
1480 	int err = 0;
1481 
1482 	PP_LOCK(pp);
1483 	err = pp_init_upp_locked(pp, can_block);
1484 	if (err) {
1485 		SK_ERR("packet UPP init failed (%d)", err);
1486 		goto done;
1487 	}
1488 	err = pp_init_upp_bft_locked(pp, can_block);
1489 	if (err) {
1490 		SK_ERR("buflet UPP init failed (%d)", err);
1491 		pp_destroy_upp_locked(pp);
1492 		goto done;
1493 	}
1494 	pp_retain_locked(pp);
1495 done:
1496 	PP_UNLOCK(pp);
1497 	return err;
1498 }
1499 
1500 __attribute__((always_inline))
1501 static void
1502 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1503     struct __kern_buflet *kbft, pid_t pid)
1504 {
1505 	struct kern_pbufpool_u_bft_bkt *bkt;
1506 	struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1507 
1508 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1509 	ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1510 	kbe->kbe_buf_pid = pid;
1511 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1512 	SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1513 	pp->pp_u_bftinuse++;
1514 }
1515 
1516 __attribute__((always_inline))
1517 static void
1518 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1519     struct __kern_buflet *kbft, pid_t pid)
1520 {
1521 	while (kbft != NULL) {
1522 		pp_insert_upp_bft_locked(pp, kbft, pid);
1523 		kbft = __unsafe_forge_single(struct __kern_buflet *,
1524 		    __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1525 	}
1526 }
1527 
1528 /* Also inserts the attached chain of buflets */
1529 static inline void
1530 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1531     pid_t pid)
1532 {
1533 	struct kern_pbufpool_u_bkt *bkt;
1534 	struct __kern_buflet *kbft;
1535 
1536 	ASSERT(kqum->qum_pid == (pid_t)-1);
1537 	kqum->qum_pid = pid;
1538 
1539 	bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1540 	SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1541 	pp->pp_u_bufinuse++;
1542 
1543 	kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
1544 	if (kbft != NULL) {
1545 		ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1546 		ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1547 		pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1548 	}
1549 }
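
/*
 * Bookkeeping note: the quantum is tracked in the "upp" (user packet
 * pool) hash table keyed on its metadata index and tagged with the
 * owning pid; any externally-attached buflet chain is tracked the same
 * way in the buflet hash table, so pp_purge_upp() can later reclaim
 * everything still charged to a pid.
 */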
1550 
1551 void
1552 pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1553     pid_t pid)
1554 {
1555 	pp_insert_upp_common(pp, kqum, pid);
1556 }
1557 
1558 void
1559 pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
1560 {
1561 	PP_LOCK(pp);
1562 	pp_insert_upp_common(pp, kqum, pid);
1563 	PP_UNLOCK(pp);
1564 }
1565 
1566 void
1567 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1568     uint64_t *__counted_by(num)array, uint32_t num)
1569 {
1570 	uint32_t i = 0;
1571 
1572 	ASSERT(array != NULL && num > 0);
1573 	PP_LOCK(pp);
1574 	while (i < num) {
1575 		struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1576 
1577 		ASSERT(kqum != NULL);
1578 		pp_insert_upp_common(pp, kqum, pid);
1579 		++i;
1580 	}
1581 	PP_UNLOCK(pp);
1582 }
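
/*
 * Illustrative caller sketch (hypothetical; `pp', `pid' and the batch
 * size are assumptions): allocate a tagged batch and charge it to a
 * user process.
 *
 *	uint64_t phs[8];
 *	uint32_t n = 8;
 *	if (pp_alloc_packet_batch(pp, 1, phs, &n, TRUE, NULL, NULL,
 *	    SKMEM_NOSLEEP) == 0) {
 *		pp_insert_upp_batch(pp, pid, phs, n);
 *	}
 */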
1583 
1584 __attribute__((always_inline))
1585 static struct __kern_buflet *
1586 pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
1587 {
1588 	struct __kern_buflet_ext *kbft, *tbft;
1589 	struct kern_pbufpool_u_bft_bkt *bkt;
1590 
1591 	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
1592 	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
1593 		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
1594 			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
1595 			    kbe_buf_upp_link);
1596 			kbft->kbe_buf_pid = (pid_t)-1;
1597 			kbft->kbe_buf_upp_link.sle_next = NULL;
1598 			ASSERT(pp->pp_u_bftinuse != 0);
1599 			pp->pp_u_bftinuse--;
1600 			break;
1601 		}
1602 	}
1603 	return (kern_buflet_t)kbft;
1604 }
1605 
1606 struct __kern_buflet *
1607 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1608 {
1609 	struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1610 
1611 	*err = __probable(kbft != NULL) ? 0 : EINVAL;
1612 	return kbft;
1613 }
1614 
1615 __attribute__((always_inline))
1616 static int
1617 pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
1618     struct __kern_quantum *kqum)
1619 {
1620 	uint32_t max_frags = pp->pp_max_frags;
1621 	struct __kern_buflet *kbft;
1622 	uint16_t nbfts, upkt_nbfts;
1623 	obj_idx_t bft_idx;
1624 
1625 	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
1626 	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
1627 	kbft = &kqum->qum_buf[0];
1628 	if (bft_idx == OBJ_IDX_NONE) {
1629 		return 0;
1630 	}
1631 
1632 	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
1633 	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
1634 	struct __user_packet *upkt = __DECONST(struct __user_packet *,
1635 	    kpkt->pkt_qum.qum_user);
1636 
1637 	upkt_nbfts = upkt->pkt_bufs_cnt;
1638 	if (__improbable(upkt_nbfts > max_frags)) {
1639 		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
1640 		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
1641 		BUF_NBFT_ADDR(kbft, 0);
1642 		return ERANGE;
1643 	}
1644 
1645 	nbfts = (kbft->buf_addr != 0) ? 1 : 0;
1646 
1647 	do {
1648 		struct __kern_buflet *pbft = kbft;
1649 		struct __kern_buflet_ext *kbe;
1650 
1651 		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
1652 		if (__improbable(kbft == NULL)) {
1653 			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
1654 			BUF_NBFT_ADDR(pbft, 0);
1655 			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
1656 			    SK_KVA(pbft));
1657 			return ERANGE;
1658 		}
1659 		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1660 		BUF_NBFT_IDX(pbft, bft_idx);
1661 		BUF_NBFT_ADDR(pbft, kbft);
1662 		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
1663 		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
1664 		++nbfts;
1665 	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));
1666 
1667 	ASSERT(kbft != NULL);
1668 	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
1669 	BUF_NBFT_ADDR(kbft, 0);
1670 	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;
1671 
1672 	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
1673 		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
1674 		return ERANGE;
1675 	}
1676 	return 0;
1677 }
1678 
1679 struct __kern_quantum *
1680 pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1681 {
1682 	struct __kern_quantum *kqum, *tqum;
1683 	struct kern_pbufpool_u_bkt *bkt;
1684 
1685 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1686 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1687 		if (METADATA_IDX(kqum) == md_idx) {
1688 			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
1689 			    qum_upp_link);
1690 			kqum->qum_pid = (pid_t)-1;
1691 			ASSERT(pp->pp_u_bufinuse != 0);
1692 			pp->pp_u_bufinuse--;
1693 			break;
1694 		}
1695 	}
1696 	if (__probable(kqum != NULL)) {
1697 		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
1698 	} else {
1699 		*err = ERANGE;
1700 	}
1701 	return kqum;
1702 }
1703 
1704 struct __kern_quantum *
1705 pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1706 {
1707 	struct __kern_quantum *kqum;
1708 
1709 	PP_LOCK(pp);
1710 	kqum = pp_remove_upp_locked(pp, md_idx, err);
1711 	PP_UNLOCK(pp);
1712 	return kqum;
1713 }
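
/*
 * Illustrative sketch (hypothetical caller): reclaim the kernel quantum
 * backing a metadata index returned by user space.
 *
 *	int err;
 *	struct __kern_quantum *kqum = pp_remove_upp(pp, md_idx, &err);
 *	if (kqum == NULL || err != 0) {
 *		// never allocated to user space, or bad buflet chain
 *	}
 */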
1714 
1715 struct __kern_quantum *
1716 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1717 {
1718 	struct __kern_quantum *kqum, *tqum;
1719 	struct kern_pbufpool_u_bkt *bkt;
1720 
1721 	PP_LOCK(pp);
1722 	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1723 	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1724 		if (METADATA_IDX(kqum) == md_idx) {
1725 			break;
1726 		}
1727 	}
1728 	PP_UNLOCK(pp);
1729 
1730 	return kqum;
1731 }
1732 
1733 __attribute__((always_inline))
1734 static void
1735 pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
1736 {
1737 	struct __kern_quantum *kqum, *tqum;
1738 	struct kern_pbufpool_u_bkt *bkt;
1739 	int i;
1740 
1741 	PP_LOCK_ASSERT_HELD(pp);
1742 
1743 	/*
1744 	 * TODO: Build a list of packets and batch-free them.
1745 	 */
1746 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1747 		bkt = &pp->pp_u_hash_table[i];
1748 		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1749 			ASSERT(kqum->qum_pid != (pid_t)-1);
1750 			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
1751 				continue;
1752 			}
1753 			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
1754 			    qum_upp_link);
1755 			pp_remove_upp_bft_chain_locked(pp, kqum);
1756 			kqum->qum_pid = (pid_t)-1;
1757 			kqum->qum_qflags &= ~QUM_F_FINALIZED;
1758 			kqum->qum_ksd = NULL;
1759 			pp_free_packet(__DECONST(struct kern_pbufpool *,
1760 			    kqum->qum_pp), (uint64_t)kqum);
1761 			ASSERT(pp->pp_u_bufinuse != 0);
1762 			pp->pp_u_bufinuse--;
1763 		}
1764 	}
1765 }
1766 
1767 __attribute__((always_inline))
1768 static void
1769 pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
1770 {
1771 	struct __kern_buflet_ext *kbft, *tbft;
1772 	struct kern_pbufpool_u_bft_bkt *bkt;
1773 	int i;
1774 
1775 	PP_LOCK_ASSERT_HELD(pp);
1776 
1777 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1778 		bkt = &pp->pp_u_bft_hash_table[i];
1779 		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
1780 		    tbft) {
1781 			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
1782 			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
1783 				continue;
1784 			}
1785 			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
1786 			    kbe_buf_upp_link);
1787 			kbft->kbe_buf_pid = (pid_t)-1;
1788 			kbft->kbe_buf_upp_link.sle_next = NULL;
1789 			pp_free_buflet(pp, (kern_buflet_t)kbft);
1790 			ASSERT(pp->pp_u_bftinuse != 0);
1791 			pp->pp_u_bftinuse--;
1792 		}
1793 	}
1794 }
1795 
1796 void
1797 pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
1798 {
1799 	PP_LOCK(pp);
1800 	pp_purge_upp_locked(pp, pid);
1801 	pp_purge_upp_bft_locked(pp, pid);
1802 	PP_UNLOCK(pp);
1803 }
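
/*
 * Illustrative sketch: a caller reclaims whatever is still charged to
 * a given process, or passes -1 to drain all outstanding user-pool
 * objects (as pp_destroy_upp_locked() does on teardown).
 *
 *	pp_purge_upp(pp, dead_pid);	// one process (hypothetical pid)
 *	pp_purge_upp(pp, (pid_t)-1);	// everything
 */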
1804 
1805 static int
1806 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1807 {
1808 	int i, err = 0;
1809 
1810 	PP_LOCK_ASSERT_HELD(pp);
1811 	if (pp->pp_u_bft_hash_table != NULL) {
1812 		return 0;
1813 	}
1814 
1815 	/* allocated-address hash table */
1816 	/*
1817 	 * -fbounds-safety: We switched from zalloc to sk_alloc (aka kalloc);
1818 	 * a performance regression here would make this switch a candidate cause.
1819 	 */
1820 	if (can_block) {
1821 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1822 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1823 			Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1824 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1825 	} else {
1826 		pp->pp_u_bft_hash_table = sk_alloc_type_array(
1827 			struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1828 			Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1829 		pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1830 	}
1831 	if (pp->pp_u_bft_hash_table == NULL) {
1832 		SK_ERR("failed to allocate packet buffer pool upp buflet hash table");
1833 		err = ENOMEM;
1834 		goto done;
1835 	}
1836 
1837 	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1838 		SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1839 	}
1840 
1841 done:
1842 	return err;
1843 }
1844 
1845 static void
1846 pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
1847 {
1848 	PP_LOCK_ASSERT_HELD(pp);
1849 	if (pp->pp_u_bft_hash_table != NULL) {
1850 		/* purge anything that's left */
1851 		pp_purge_upp_bft_locked(pp, -1);
1852 
1853 #if (DEBUG || DEVELOPMENT)
1854 		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1855 			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
1856 		}
1857 #endif /* DEBUG || DEVELOPMENT */
1858 
1859 		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
1860 		    pp->pp_u_bft_hash_table_size,
1861 		    pp->pp_u_bft_hash_table);
1862 	}
1863 	ASSERT(pp->pp_u_bftinuse == 0);
1864 }
1865 
1866 void
1867 pp_insert_upp_bft(struct kern_pbufpool *pp,
1868     struct __kern_buflet *kbft, pid_t pid)
1869 {
1870 	PP_LOCK(pp);
1871 	pp_insert_upp_bft_locked(pp, kbft, pid);
1872 	PP_UNLOCK(pp);
1873 }
1874 
1875 boolean_t
1876 pp_isempty_upp(struct kern_pbufpool *pp)
1877 {
1878 	boolean_t isempty;
1879 
1880 	PP_LOCK(pp);
1881 	isempty = (pp->pp_u_bufinuse == 0);
1882 	PP_UNLOCK(pp);
1883 
1884 	return isempty;
1885 }
1886 
1887 __attribute__((always_inline))
1888 static inline struct __kern_quantum *
1889 pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
1890     uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
1891 {
1892 	struct __kern_quantum *kqum;
1893 	struct __user_quantum *uqum;
1894 
1895 	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
1896 	ASSERT(kqum->qum_pp == pp);
1897 	if (__probable(!PP_KERNEL_ONLY(pp))) {
1898 		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
1899 		uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1900 		ASSERT(uqum != NULL);
1901 	} else {
1902 		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
1903 		ASSERT(kqum->qum_user == NULL);
1904 		uqum = NULL;
1905 	}
1906 
1907 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
1908 	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
1909 	    skmflag, bufcnt, FALSE, blist) != 0) {
1910 		return NULL;
1911 	}
1912 
1913 	/* (re)construct {user,kernel} metadata */
1914 	struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
1915 	struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
1916 	uint16_t i;
1917 
1918 	/* sanitize flags */
1919 	kpkt->pkt_pflags &= PKT_F_INIT_MASK;
1920 
1921 	ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
1922 	    kpkt->pkt_com_opt != NULL);
1923 	ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
1924 	    kpkt->pkt_flow != NULL);
1925 	ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
1926 	    kpkt->pkt_tx_compl != NULL);
1927 
1928 	/*
1929 	 * XXX: For now we always set PKT_F_FLOW_DATA;
1930 	 * this is a no-op but done for consistency
1931 	 * with the other PKT_F_*_DATA flags.
1932 	 */
1933 	kpkt->pkt_pflags |= PKT_F_FLOW_DATA;
1934 
1935 	/* initialize kernel packet */
1936 	KPKT_INIT(kpkt, QUM_F_INTERNALIZED);
1937 
1938 	ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
1939 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
1940 		ASSERT(kbuf->buf_ctl == NULL);
1941 		ASSERT(kbuf->buf_addr == 0);
1942 		/*
1943 		 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
1944 		 * which is unsafe, so we just forge it here.
1945 		 */
1946 		kbuf = __unsafe_forge_single(struct __kern_buflet *,
1947 		    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
1948 	}
1949 	/* initialize kernel buflet */
1950 	for (i = 0; i < bufcnt; i++) {
1951 		ASSERT(kbuf != NULL);
1952 		KBUF_INIT(kbuf);
1953 		kbuf = __unsafe_forge_single(struct __kern_buflet *,
1954 		    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
1955 	}
1956 	ASSERT((kbuf == NULL) || (bufcnt == 0));
1957 
1958 	return kqum;
1959 }
1960 
1961 /*
1962  * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
1963  * packet descriptor cache with no buffer attached and a buflet cache with
1964  * cpu layer caching enabled. While operating in this mode, we can call
1965  * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
1966  * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
1967  * descriptor with no attached buffer from the metadata cache.
1968  * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
1969  * from their respective caches and constructs the packet on behalf of the
1970  * caller.
1971  */
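/*
 * For example (illustrative sketch; assumes a pool created with
 * PPF_BUFFER_ON_DEMAND and pp_max_frags >= 2):
 *
 *	uint64_t ph  = pp_alloc_packet(pp, 0, SKMEM_NOSLEEP); // bare descriptor
 *	uint64_t ph2 = pp_alloc_packet(pp, 2, SKMEM_NOSLEEP); // two buflets attached
 */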
1972 __attribute__((always_inline))
1973 static inline uint32_t
1974 pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
1975     uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
1976     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
1977 {
1978 	struct __metadata_preamble *mdp;
1979 	struct __kern_quantum *kqum = NULL;
1980 	uint32_t allocp, need = num;
1981 	struct skmem_obj *__single plist, *__single blist = NULL;
1982 	uint64_t *array_cp;  /* -fbounds-safety */
1983 
1984 	ASSERT(bufcnt <= pp->pp_max_frags);
1985 	ASSERT(array != NULL && num > 0);
1986 	ASSERT(PP_BATCH_CAPABLE(pp));
1987 
1988 	/* allocate (constructed) packet(s) with buffer(s) attached */
1989 	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
1990 	    pp->pp_kmd_cache->skm_objsize, num, skmflag);
1991 
1992 	/* allocate (constructed) buflet(s) with buffer(s) attached */
1993 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
1994 		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
1995 		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
1996 	}
1997 
1998 	array_cp = array;
1999 	while (plist != NULL) {
2000 		struct skmem_obj *plistn;
2001 
2002 		plistn = plist->mo_next;
2003 		plist->mo_next = NULL;
2004 
2005 		mdp = (struct __metadata_preamble *)(void *)plist;
2006 		kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
2007 		if (kqum == NULL) {
2008 			if (blist != NULL) {
2009 				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
2010 				    blist);
2011 				blist = NULL;
2012 			}
2013 			plist->mo_next = plistn;
2014 			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
2015 			plist = NULL;
2016 			break;
2017 		}
2018 
2019 
2020 		if (tagged) {
2021 			*array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
2022 			    METADATA_SUBTYPE(kqum));
2023 		} else {
2024 			*array_cp = (uint64_t)kqum;
2025 		}
2026 
2027 		if (cb != NULL) {
2028 			(cb)(*array_cp, (num - need), ctx);
2029 		}
2030 
2031 		++array_cp;
2032 		plist = plistn;
2033 
2034 		ASSERT(need > 0);
2035 		--need;
2036 	}
2037 	ASSERT(blist == NULL);
2038 	ASSERT((num - need) == allocp || kqum == NULL);
2039 
2040 	return num - need;
2041 }
2042 
2043 uint64_t
2044 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2045 {
2046 	uint64_t kpkt = 0;
2047 
2048 	(void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2049 	    NULL, NULL, skmflag);
2050 
2051 	return kpkt;
2052 }
2053 
2054 int
2055 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2056     uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2057     alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2058 {
2059 	uint32_t i, n;
2060 	int err;
2061 
2062 	ASSERT(array != NULL && size != NULL);
2063 
2064 	n = *size;
2065 	/*
2066 	 * -fbounds-safety: The original "*size = 0" here was removed: array
2067 	 * is __counted_by(*size), so zeroing *size while array still holds
2068 	 * entries traps (brk 0x5519).  *size is set to i below anyway.
2069 	 */
2070 
2071 	i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2072 	    cb, ctx, skmflag);
2073 	/*
2074 	 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2075 	 * extra careful when *size is updated, like below. Here, we know i will
2076 	 * be less than or equal to the original *size value, so updating *size
2077 	 * is okay.
2078 	 */
2079 	*size = i;
2080 
2081 	if (__probable(i == n)) {
2082 		err = 0;
2083 	} else if (i != 0) {
2084 		err = EAGAIN;
2085 	} else {
2086 		err = ENOMEM;
2087 	}
2088 
2089 	return err;
2090 }
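
/*
 * The contract, in short: on entry *size is the requested count; on
 * return it is the number actually allocated.  A sketch (hypothetical
 * caller):
 *
 *	uint32_t got = 32;
 *	uint64_t phs[32];
 *	int err = pp_alloc_packet_batch(pp, bufcnt, phs, &got, TRUE,
 *	    NULL, NULL, SKMEM_NOSLEEP);
 *	// err == EAGAIN: only `got' of the 32 requested were allocated
 */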
2091 
2092 int
2093 pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
2094     struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
2095     uint32_t skmflag)
2096 {
2097 	struct __metadata_preamble *mdp;
2098 	struct __kern_packet *kpkt = NULL;
2099 	uint32_t allocp, need = num;
2100 	struct skmem_obj *__single plist, *__single blist = NULL;
2101 	int err;
2102 
2103 	ASSERT(pktq != NULL && num > 0);
2104 	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
2105 	ASSERT(bufcnt <= pp->pp_max_frags);
2106 	ASSERT(PP_BATCH_CAPABLE(pp));
2107 
2108 	/* allocate (constructed) packet(s) with buffer(s) attached */
2109 	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
2110 	    pp->pp_kmd_cache->skm_objsize, num, skmflag);
2111 
2112 	/* allocate (constructed) buflet(s) with buffer(s) attached */
2113 	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
2114 		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
2115 		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
2116 	}
2117 
2118 	while (plist != NULL) {
2119 		struct skmem_obj *plistn;
2120 
2121 		plistn = plist->mo_next;
2122 		plist->mo_next = NULL;
2123 
2124 		mdp = (struct __metadata_preamble *)(void *)plist;
2125 		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
2126 		    bufcnt, skmflag, &blist);
2127 		if (kpkt == NULL) {
2128 			if (blist != NULL) {
2129 				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
2130 				    blist);
2131 				blist = NULL;
2132 			}
2133 			plist->mo_next = plistn;
2134 			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
2135 			plist = NULL;
2136 			break;
2137 		}
2138 
2139 
2140 		KPKTQ_ENQUEUE(pktq, kpkt);
2141 
2142 		if (cb != NULL) {
2143 			(cb)((uint64_t)kpkt, (num - need), ctx);
2144 		}
2145 
2146 		plist = plistn;
2147 
2148 		ASSERT(need > 0);
2149 		--need;
2150 	}
2151 	ASSERT(blist == NULL);
2152 	ASSERT((num - need) == allocp || kpkt == NULL);
2153 
2154 	if (__probable(need == 0)) {
2155 		err = 0;
2156 	} else if (need == num) {
2157 		err = ENOMEM;
2158 	} else {
2159 		err = EAGAIN;
2160 	}
2161 
2162 	return err;
2163 }
2164 
2165 uint64_t
2166 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2167     uint32_t skmflag)
2168 {
2169 	uint32_t bufcnt = pp->pp_max_frags;
2170 	uint64_t kpkt = 0;
2171 
2172 	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2173 		bufcnt =
2174 		    SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2175 		ASSERT(bufcnt <= UINT16_MAX);
2176 	}
2177 
2178 	(void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2179 	    NULL, NULL, skmflag);
2180 
2181 	return kpkt;
2182 }
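
/*
 * Worked example (illustrative): with PP_BUF_SIZE_DEF(pp) == 2048 and
 * size == 5000, SK_ROUNDUP(5000, 2048) / 2048 == 3, so the packet is
 * requested with three default-sized buflets.
 */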
2183 
2184 __attribute__((always_inline))
2185 static inline struct __metadata_preamble *
2186 pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
2187     struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
2188     struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
2189     struct skmem_obj **blist_nocache_large)
2190 {
2191 	struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);
2192 	ASSERT(SK_PTR_TAG(kqum) == 0);
2193 	struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);
2194 
2195 	if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
2196 		__packet_perform_tx_completion_callbacks(
2197 			SK_PKT2PH(kpkt), NULL);
2198 	}
2199 	if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
2200 		ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
2201 		ASSERT(kpkt->pkt_mbuf != NULL);
2202 		ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
2203 		if (mp != NULL) {
2204 			ASSERT(*mp == NULL);
2205 			*mp = kpkt->pkt_mbuf;
2206 		} else {
2207 			m_freem(kpkt->pkt_mbuf);
2208 		}
2209 		KPKT_CLEAR_MBUF_DATA(kpkt);
2210 	} else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
2211 		ASSERT(kpkt->pkt_pkt != NULL);
2212 		ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
2213 		if (kpp != NULL) {
2214 			ASSERT(*kpp == NULL);
2215 			*kpp = kpkt->pkt_pkt;
2216 		} else {
2217 			/* can only recurse once */
2218 			ASSERT((kpkt->pkt_pkt->pkt_pflags &
2219 			    PKT_F_PKT_DATA) == 0);
2220 			pp_free_packet_single(kpkt->pkt_pkt);
2221 		}
2222 		KPKT_CLEAR_PKT_DATA(kpkt);
2223 	}
2224 	kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
2225 	ASSERT(kpkt->pkt_nextpkt == NULL);
2226 	ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
2227 	ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
2228 	ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
2229 
2230 	if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
2231 		pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
2232 		    blist_large, blist_nocache_large);
2233 	}
2234 	return mdp;
2235 }
2236 
2237 void
2238 pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
2239 {
2240 	struct __metadata_preamble *mdp;
2241 	struct skmem_obj *__single obj_mdp;
2242 	struct skmem_obj *__single top = NULL;
2243 	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
2244 	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
2245 	struct skmem_obj **list = &top;
2246 	struct mbuf *__single mtop = NULL;
2247 	struct mbuf **mp = &mtop;
2248 	struct __kern_packet *__single kptop = NULL;
2249 	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
2250 	struct kern_pbufpool *pp;
2251 	int c = 0;
2252 
2253 	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
2254 	ASSERT(pp != NULL);
2255 	ASSERT(PP_BATCH_CAPABLE(pp));
2256 
2257 	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
2258 		next = pkt->pkt_nextpkt;
2259 		pkt->pkt_nextpkt = NULL;
2260 
2261 		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
2262 		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
2263 		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);
2264 
2265 		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
2266 		*list = obj_mdp;
2267 		list = &(*list)->mo_next;
2268 		c++;
2269 
2270 		if (*mp != NULL) {
2271 			mp = &(*mp)->m_nextpkt;
2272 			ASSERT(*mp == NULL);
2273 		}
2274 		if (*kpp != NULL) {
2275 			kpp = &(*kpp)->pkt_nextpkt;
2276 			ASSERT(*kpp == NULL);
2277 		}
2278 	}
2279 
2280 	ASSERT(top != NULL);
2281 	skmem_cache_batch_free(pp->pp_kmd_cache, top);
2282 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
2283 	if (mtop != NULL) {
2284 		DTRACE_SKYWALK(free__attached__mbuf);
2285 		if (__probable(mtop->m_nextpkt != NULL)) {
2286 			m_freem_list(mtop);
2287 		} else {
2288 			m_freem(mtop);
2289 		}
2290 	}
2291 	if (kptop != NULL) {
2292 		int cnt = 0;
2293 		pp_free_packet_chain(kptop, &cnt);
2294 		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
2295 	}
2296 	if (npkt != NULL) {
2297 		*npkt = c;
2298 	}
2299 }
2300 
2301 void
2302 pp_free_pktq(struct pktq *pktq)
2303 {
2304 	if (__improbable(KPKTQ_EMPTY(pktq))) {
2305 		return;
2306 	}
2307 	struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2308 	pp_free_packet_chain(pkt, NULL);
2309 	KPKTQ_DISPOSE(pktq);
2310 }
2311 
2312 void
2313 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2314     drop_reason_t reason, const char *funcname, uint16_t linenum)
2315 {
2316 	drop_func_t dropfunc;
2317 	struct __kern_packet *kpkt;
2318 
2319 	if (KPKTQ_EMPTY(pktq)) {
2320 		return;
2321 	}
2322 	if (__probable(droptap_total_tap_count == 0)) {
2323 		goto nodroptap;
2324 	}
2325 
2326 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2327 		dropfunc = droptap_output_packet;
2328 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2329 		dropfunc = droptap_input_packet;
2330 	} else {
2331 		goto nodroptap;
2332 	}
2333 
2334 	KPKTQ_FOREACH(kpkt, pktq) {
2335 		dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2336 		    kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2337 	}
2338 
2339 nodroptap:
2340 	pp_free_pktq(pktq);
2341 }
2342 
2343 __attribute__((always_inline))
2344 static inline void
2345 pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
2346 {
2347 	struct __metadata_preamble *mdp;
2348 	struct skmem_obj *__single obj_mdp = NULL;
2349 	struct skmem_obj *__single top = NULL;
2350 	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
2351 	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
2352 	struct skmem_obj **list = &top;
2353 	struct mbuf *__single mtop = NULL;
2354 	struct mbuf **mp = &mtop;
2355 	struct __kern_packet *__single kptop = NULL;
2356 	struct __kern_packet **kpp = &kptop;
2357 	uint32_t i;
2358 
2359 	ASSERT(pp != NULL);
2360 	ASSERT(array != NULL && num > 0);
2361 	ASSERT(PP_BATCH_CAPABLE(pp));
2362 
2363 	for (i = 0; i < num; i++) {
2364 		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
2365 		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
2366 		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);
2367 
2368 		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
2369 		*list = obj_mdp;
2370 		list = &(*list)->mo_next;
2371 		array[i] = 0;
2372 
2373 		if (*mp != NULL) {
2374 			mp = &(*mp)->m_nextpkt;
2375 			ASSERT(*mp == NULL);
2376 		}
2377 		if (*kpp != NULL) {
2378 			kpp = &(*kpp)->pkt_nextpkt;
2379 			ASSERT(*kpp == NULL);
2380 		}
2381 	}
2382 
2383 	ASSERT(top != NULL);
2384 	skmem_cache_batch_free(pp->pp_kmd_cache, top);
2385 	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
2386 	if (mtop != NULL) {
2387 		DTRACE_SKYWALK(free__attached__mbuf);
2388 		if (__probable(mtop->m_nextpkt != NULL)) {
2389 			m_freem_list(mtop);
2390 		} else {
2391 			m_freem(mtop);
2392 		}
2393 	}
2394 	if (kptop != NULL) {
2395 		int cnt = 0;
2396 		pp_free_packet_chain(kptop, &cnt);
2397 		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
2398 	}
2399 }
2400 
2401 void
2402 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2403 {
2404 	pp_free_packet_array(pp, &kqum, 1);
2405 }
2406 
2407 void
2408 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
2409 {
2410 	pp_free_packet_array(pp, array, size);
2411 }
2412 
2413 void
2414 pp_free_packet_single(struct __kern_packet *pkt)
2415 {
2416 	ASSERT(pkt->pkt_nextpkt == NULL);
2417 	pp_free_packet(__DECONST(struct kern_pbufpool *,
2418 	    pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2419 }
2420 
2421 void
2422 pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
2423     drop_reason_t reason, const char *funcname, uint16_t linenum)
2424 {
2425 	drop_func_t dropfunc;
2426 
2427 	if (pkt->pkt_length == 0) {
2428 		return;
2429 	}
2430 	if (__probable(droptap_total_tap_count == 0)) {
2431 		goto nodroptap;
2432 	}
2433 
2434 	if (flags & DROPTAP_FLAG_DIR_OUT) {
2435 		dropfunc = droptap_output_packet;
2436 	} else if (flags & DROPTAP_FLAG_DIR_IN) {
2437 		dropfunc = droptap_input_packet;
2438 	} else {
2439 		goto nodroptap;
2440 	}
2441 
2442 	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
2443 	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2444 
2445 nodroptap:
2446 	pp_free_packet_single(pkt);
2447 }
2448 
2449 static mach_vm_address_t
2450 pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
2451     uint32_t skmflag, bool large)
2452 {
2453 	/*
2454 	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
2455 	 * other (safe) pointer type, because IOSkywalkFamily depends on this
2456 	 * being mach_vm_address_t
2457 	 */
2458 	mach_vm_address_t baddr;
2459 	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
2460 	    PP_BUF_CACHE_DEF(pp);
2461 
2462 	ASSERT(skm != NULL);
2463 	/* allocate a cached buffer */
2464 	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);
2465 
2466 #if (DEVELOPMENT || DEBUG)
2467 	uint64_t mtbf = skmem_region_get_mtbf();
2468 	/*
2469 	 * MTBF is applicable only for non-blocking allocations here.
2470 	 */
2471 	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
2472 	    (skmflag & SKMEM_NOSLEEP))) {
2473 		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
2474 		net_update_uptime();
2475 		if (baddr != 0) {
2476 			skmem_cache_free(skm,
2477 			    __unsafe_forge_single(struct skmem_obj *, baddr));
2478 			baddr = 0;
2479 		}
2480 	}
2481 #endif /* (DEVELOPMENT || DEBUG) */
2482 
2483 	if (__improbable(baddr == 0)) {
2484 		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp %p",
2485 		    SK_KVA(pp));
2486 		return 0;
2487 	}
2488 	skmem_cache_get_obj_info(skm,
2489 	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
2490 	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
2491 	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
2492 	return baddr;
2493 }
2494 
2495 errno_t
2496 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2497     kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2498 {
2499 	struct skmem_obj_info oib;
2500 
2501 	VERIFY(pp != NULL && baddr != NULL);
2502 	VERIFY((seg != NULL) == (idx != NULL));
2503 
2504 	if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2505 		return ENOTSUP;
2506 	}
2507 
2508 	*baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2509 	if (__improbable(*baddr == 0)) {
2510 		return ENOMEM;
2511 	}
2512 
2513 	if (seg != NULL) {
2514 		ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2515 		*seg = SKMEM_OBJ_SEG(&oib);
2516 		*idx = SKMEM_OBJ_IDX_SEG(&oib);
2517 	}
2518 	return 0;
2519 }
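
/*
 * Illustrative sketch (hypothetical caller; assumes a pool with
 * buffer-on-demand enabled): allocate a raw buffer, optionally learning
 * its backing segment and per-segment index.
 *
 *	mach_vm_address_t baddr;
 *	kern_segment_t seg;
 *	kern_obj_idx_seg_t idx;
 *	if (pp_alloc_buffer(pp, &baddr, &seg, &idx, SKMEM_NOSLEEP) == 0) {
 *		...
 *		pp_free_buffer(pp, baddr);
 *	}
 */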
2520 
2521 void
2522 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2523 {
2524 	ASSERT(pp != NULL && addr != 0);
2525 	skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
2526 		    struct skmem_obj *, addr));
2527 }
2528 
2529 __attribute__((always_inline))
2530 static inline uint32_t
2531 pp_alloc_buflet_common(struct kern_pbufpool *pp,
2532     uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
2533     bool large)
2534 {
2535 	struct __kern_buflet *kbft = NULL;
2536 	uint32_t allocd, need = num;
2537 	struct skmem_obj *__single list;
2538 	uint64_t *array_cp;  /* -fbounds-safety */
2539 
2540 	ASSERT(array != NULL && num > 0);
2541 	ASSERT(PP_BATCH_CAPABLE(pp));
2542 	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
2543 	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);
2544 
2545 	if (large) {
2546 		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
2547 		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
2548 	} else {
2549 		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
2550 		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
2551 	}
2552 
2553 	array_cp = array;
2554 	while (list != NULL) {
2555 		struct skmem_obj *listn;
2556 
2557 		listn = list->mo_next;
2558 		list->mo_next = NULL;
2559 		kbft = (kern_buflet_t)(void *)list;
2560 
2561 
2562 		KBUF_EXT_INIT(kbft, pp);
2563 		*array_cp = (uint64_t)kbft;
2564 		++array_cp;
2565 		list = listn;
2566 		ASSERT(need > 0);
2567 		--need;
2568 	}
2569 	ASSERT((num - need) == allocd || kbft == NULL);
2570 	return num - need;
2571 }
2572 
2573 errno_t
2574 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2575     bool large)
2576 {
2577 	uint64_t bft;
2578 
2579 	if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2580 		return ENOMEM;
2581 	}
2582 	*kbft = __unsafe_forge_single(kern_buflet_t, bft);
2583 	return 0;
2584 }
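
/*
 * Illustrative sketch (hypothetical caller): a single round-trip of an
 * unattached buflet; buflets from this path carry BUFLET_FLAG_EXTERNAL.
 *
 *	kern_buflet_t bft;
 *	if (pp_alloc_buflet(pp, &bft, SKMEM_NOSLEEP, false) == 0) {
 *		...
 *		pp_free_buflet(pp, bft);
 *	}
 */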
2585 
2586 errno_t
2587 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2588     uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2589     bool large)
2590 {
2591 	uint32_t i, n;
2592 	int err;
2593 
2594 	ASSERT(array != NULL && size != NULL);
2595 
2596 	n = *size;
2597 	i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2598 	*size = i;
2599 
2600 	if (__probable(i == n)) {
2601 		err = 0;
2602 	} else if (i != 0) {
2603 		err = EAGAIN;
2604 	} else {
2605 		err = ENOMEM;
2606 	}
2607 
2608 	return err;
2609 }
2610 
2611 __attribute__((always_inline))
2612 static void
2613 pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
2614 {
2615 	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
2616 	ASSERT(kbft->buf_nbft_addr == 0);
2617 
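	/*
	 * Three cases below: an external buflet returns to its buflet
	 * cache (via the nocache path while the underlying buffer is
	 * still shared); an embedded buflet drops its buffer reference
	 * and frees the buffer only once the use count hits zero; a
	 * buflet with no buffer attached is a no-op.
	 */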
2618 	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
2619 		ASSERT(kbft->buf_addr != 0);
2620 		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
2621 		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
2622 		ASSERT(kbft->buf_ctl != NULL);
2623 		ASSERT(((struct __kern_buflet_ext *)kbft)->
2624 		    kbe_buf_upp_link.sle_next == NULL);
2625 		if (kbft->buf_ctl->bc_usecnt > 1) {
2626 			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
2627 			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
2628 			    (void *)kbft);
2629 		} else {
2630 			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
2631 			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
2632 			    (void *)kbft);
2633 		}
2634 	} else if (__probable(kbft->buf_addr != 0)) {
2635 		void *objaddr = kbft->buf_objaddr;
2636 		uint32_t usecnt = 0;
2637 
2638 		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
2639 		ASSERT(kbft->buf_ctl != NULL);
2640 		KBUF_DTOR(kbft, usecnt);
2641 		SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u",
2642 		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
2643 		if (__probable(usecnt == 0)) {
2644 			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
2645 			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
2646 			    objaddr);
2647 		}
2648 	}
2649 }
2650 
2651 void
2652 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2653 {
2654 	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2655 	ASSERT(pp != NULL && kbft != NULL);
2656 	pp_free_buflet_common(pp, kbft);
2657 }
2658 
2659 void
2660 pp_reap_caches(boolean_t purge)
2661 {
2662 	skmem_cache_reap_now(pp_opt_cache, purge);
2663 	skmem_cache_reap_now(pp_flow_cache, purge);
2664 	skmem_cache_reap_now(pp_compl_cache, purge);
2665 }
2666