1 /*
2 * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/packet/pbufpool_var.h>
31 #include <sys/sdt.h>
32 #include <net/droptap.h>
33
34 static struct kern_pbufpool *pp_alloc(zalloc_flags_t);
35 static void pp_free(struct kern_pbufpool *);
36 static uint32_t pp_alloc_packet_common(struct kern_pbufpool *, uint16_t,
37 uint64_t *__counted_by(num), uint32_t num, boolean_t, alloc_cb_func_t,
38 const void *, uint32_t);
39 static void pp_free_packet_array(struct kern_pbufpool *,
40 uint64_t *__counted_by(num)array, uint32_t num);
41 static int pp_metadata_ctor_no_buflet(struct skmem_obj_info *,
42 struct skmem_obj_info *, void *, uint32_t);
43 static int pp_metadata_ctor_max_buflet(struct skmem_obj_info *,
44 struct skmem_obj_info *, void *, uint32_t);
45 static void pp_metadata_dtor(void *, void *);
46 static int pp_metadata_construct(struct __kern_quantum *,
47 struct __user_quantum *, obj_idx_t, struct kern_pbufpool *, uint32_t,
48 uint16_t, bool, struct skmem_obj **);
49 static void pp_metadata_destruct(struct __kern_quantum *,
50 struct kern_pbufpool *, bool);
51 static struct __kern_quantum *pp_metadata_init(struct __metadata_preamble *,
52 struct kern_pbufpool *, uint16_t, uint32_t, struct skmem_obj **);
53 static struct __metadata_preamble *pp_metadata_fini(struct __kern_quantum *,
54 struct kern_pbufpool *, struct mbuf **, struct __kern_packet **,
55 struct skmem_obj **, struct skmem_obj **, struct skmem_obj **, struct skmem_obj **);
56 static void pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid);
57 static void pp_buf_seg_ctor(struct sksegment *, IOSKMemoryBufferRef, void *);
58 static void pp_buf_seg_dtor(struct sksegment *, IOSKMemoryBufferRef, void *);
59 static void pp_destroy_upp_locked(struct kern_pbufpool *);
60 static void pp_destroy_upp_bft_locked(struct kern_pbufpool *);
61 static int pp_init_upp_bft_locked(struct kern_pbufpool *, boolean_t);
62 static void pp_free_buflet_common(const kern_pbufpool_t, kern_buflet_t);
63 static mach_vm_address_t pp_alloc_buffer_common(const kern_pbufpool_t pp,
64 struct skmem_obj_info *oi, uint32_t skmflag, bool large);
65 static inline uint32_t
66 pp_alloc_buflet_common(struct kern_pbufpool *pp,
67 uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
68 bool large);
69
70 #define KERN_PBUFPOOL_U_HASH_SIZE 64 /* hash table size */
71
72 #define KERN_BUF_MIN_STRIDING_SIZE 32 * 1024
73 static uint32_t kern_buf_min_striding_size = KERN_BUF_MIN_STRIDING_SIZE;
74
75 /*
76 * Since the inputs are small (indices to the metadata region), we can use
77 * Knuth's multiplicative hash method which is fast and good enough. Here
78 * we multiply the input by the golden ratio of 2^32. See "The Art of
79 * Computer Programming", section 6.4.
80 */
81 #define KERN_PBUFPOOL_U_HASH_INDEX(_i, _m) \
82 (((_i) * 2654435761U) & (_m))
83 #define KERN_PBUFPOOL_U_HASH(_pp, _i) \
84 (&(_pp)->pp_u_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
85 KERN_PBUFPOOL_U_HASH_SIZE - 1)])
86 #define KERN_PBUFPOOL_U_BFT_HASH(_pp, _i) \
87 (&(_pp)->pp_u_bft_hash_table[KERN_PBUFPOOL_U_HASH_INDEX(_i, \
88 KERN_PBUFPOOL_U_HASH_SIZE - 1)])
89
90 static SKMEM_TYPE_DEFINE(pp_zone, struct kern_pbufpool);
91
92 #define SKMEM_TAG_PBUFPOOL_HASH "com.apple.skywalk.pbufpool.hash"
93 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_hash, SKMEM_TAG_PBUFPOOL_HASH);
94
95 #define SKMEM_TAG_PBUFPOOL_BFT_HASH "com.apple.skywalk.pbufpool.bft.hash"
96 static SKMEM_TAG_DEFINE(skmem_tag_pbufpool_bft_hash, SKMEM_TAG_PBUFPOOL_BFT_HASH);
97
98
99 struct kern_pbufpool_u_htbl {
100 struct kern_pbufpool_u_bkt upp_hash[KERN_PBUFPOOL_U_HASH_SIZE];
101 };
102
103 #define PP_U_HTBL_SIZE sizeof(struct kern_pbufpool_u_htbl)
104 static SKMEM_TYPE_DEFINE(pp_u_htbl_zone, struct kern_pbufpool_u_htbl);
105
106 static struct skmem_cache *pp_opt_cache; /* cache for __packet_opt */
107 static struct skmem_cache *pp_flow_cache; /* cache for __flow */
108 static struct skmem_cache *pp_compl_cache; /* cache for __packet_compl */
109
110 static int __pp_inited = 0;
111
112 int
pp_init(void)113 pp_init(void)
114 {
115 _CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC);
116 _CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS);
117 _CASSERT(KPKT_SC_BK == MBUF_SC_BK);
118 _CASSERT(KPKT_SC_BE == MBUF_SC_BE);
119 _CASSERT(KPKT_SC_RD == MBUF_SC_RD);
120 _CASSERT(KPKT_SC_OAM == MBUF_SC_OAM);
121 _CASSERT(KPKT_SC_AV == MBUF_SC_AV);
122 _CASSERT(KPKT_SC_RV == MBUF_SC_RV);
123 _CASSERT(KPKT_SC_VI == MBUF_SC_VI);
124 _CASSERT(KPKT_SC_SIG == MBUF_SC_SIG);
125 _CASSERT(KPKT_SC_VO == MBUF_SC_VO);
126 _CASSERT(KPKT_SC_CTL == MBUF_SC_CTL);
127
128 _CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS);
129 _CASSERT(KPKT_SC_BK == PKT_SC_BK);
130 _CASSERT(KPKT_SC_BE == PKT_SC_BE);
131 _CASSERT(KPKT_SC_RD == PKT_SC_RD);
132 _CASSERT(KPKT_SC_OAM == PKT_SC_OAM);
133 _CASSERT(KPKT_SC_AV == PKT_SC_AV);
134 _CASSERT(KPKT_SC_RV == PKT_SC_RV);
135 _CASSERT(KPKT_SC_VI == PKT_SC_VI);
136 _CASSERT(KPKT_SC_SIG == PKT_SC_SIG);
137 _CASSERT(KPKT_SC_VO == PKT_SC_VO);
138 _CASSERT(KPKT_SC_CTL == PKT_SC_CTL);
139 _CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES);
140
141 _CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC);
142 _CASSERT(KPKT_TC_BE == MBUF_TC_BE);
143 _CASSERT(KPKT_TC_BK == MBUF_TC_BK);
144 _CASSERT(KPKT_TC_VI == MBUF_TC_VI);
145 _CASSERT(KPKT_TC_VO == MBUF_TC_VO);
146 _CASSERT(KPKT_TC_MAX == MBUF_TC_MAX);
147
148 _CASSERT(KPKT_TC_BE == PKT_TC_BE);
149 _CASSERT(KPKT_TC_BK == PKT_TC_BK);
150 _CASSERT(KPKT_TC_VI == PKT_TC_VI);
151 _CASSERT(KPKT_TC_VO == PKT_TC_VO);
152
153 _CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS);
154 _CASSERT(PKT_SCVAL_BK == SCVAL_BK);
155 _CASSERT(PKT_SCVAL_BE == SCVAL_BE);
156 _CASSERT(PKT_SCVAL_RD == SCVAL_RD);
157 _CASSERT(PKT_SCVAL_OAM == SCVAL_OAM);
158 _CASSERT(PKT_SCVAL_AV == SCVAL_AV);
159 _CASSERT(PKT_SCVAL_RV == SCVAL_RV);
160 _CASSERT(PKT_SCVAL_VI == SCVAL_VI);
161 _CASSERT(PKT_SCVAL_VO == SCVAL_VO);
162 _CASSERT(PKT_SCVAL_CTL == SCVAL_CTL);
163
164 /*
165 * Assert that the value of common packet flags between mbuf and
166 * skywalk packets match, and that they are in PKT_F_COMMON_MASK.
167 */
168 _CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND);
169 _CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME);
170 _CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT);
171 _CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT);
172 _CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID);
173 _CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV);
174 _CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ);
175 _CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID);
176 _CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW);
177 _CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ);
178 _CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE);
179 _CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT);
180 _CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
181 PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |
182 PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |
183 PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT));
184 /*
185 * Assert packet flags shared with userland.
186 */
187 _CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME |
188 PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |
189 PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S));
190
191 _CASSERT(offsetof(struct __kern_quantum, qum_len) ==
192 offsetof(struct __kern_packet, pkt_length));
193
194 /*
195 * Due to the use of tagged pointer, we need the size of
196 * the metadata preamble structure to be multiples of 16.
197 * See SK_PTR_TAG() definition for details.
198 */
199 _CASSERT(sizeof(struct __metadata_preamble) != 0 &&
200 (sizeof(struct __metadata_preamble) % 16) == 0);
201
202 _CASSERT(NX_PBUF_FRAGS_MIN == 1 &&
203 NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT);
204
205 /*
206 * Batch alloc/free requires linking the objects together;
207 * make sure that the fields are at the same offset since
208 * we cast the object to struct skmem_obj.
209 */
210 _CASSERT(offsetof(struct __metadata_preamble, _mdp_next) ==
211 offsetof(struct skmem_obj, mo_next));
212 _CASSERT(offsetof(struct __buflet, __buflet_next) ==
213 offsetof(struct skmem_obj, mo_next));
214
215 SK_LOCK_ASSERT_HELD();
216 ASSERT(!__pp_inited);
217
218 pp_opt_cache = skmem_cache_create("pkt.opt",
219 sizeof(struct __packet_opt), sizeof(uint64_t),
220 NULL, NULL, NULL, NULL, NULL, 0);
221 pp_flow_cache = skmem_cache_create("pkt.flow",
222 sizeof(struct __flow), 16, /* 16-bytes aligned */
223 NULL, NULL, NULL, NULL, NULL, 0);
224 pp_compl_cache = skmem_cache_create("pkt.compl",
225 sizeof(struct __packet_compl), sizeof(uint64_t),
226 NULL, NULL, NULL, NULL, NULL, 0);
227
228 PE_parse_boot_argn("sk_pp_min_striding_size", &kern_buf_min_striding_size,
229 sizeof(kern_buf_min_striding_size));
230
231 return 0;
232 }
233
234 void
pp_fini(void)235 pp_fini(void)
236 {
237 SK_LOCK_ASSERT_HELD();
238
239 if (__pp_inited) {
240 if (pp_compl_cache != NULL) {
241 skmem_cache_destroy(pp_compl_cache);
242 pp_compl_cache = NULL;
243 }
244 if (pp_flow_cache != NULL) {
245 skmem_cache_destroy(pp_flow_cache);
246 pp_flow_cache = NULL;
247 }
248 if (pp_opt_cache != NULL) {
249 skmem_cache_destroy(pp_opt_cache);
250 pp_opt_cache = NULL;
251 }
252
253 __pp_inited = 0;
254 }
255 }
256
257 static struct kern_pbufpool *
pp_alloc(zalloc_flags_t how)258 pp_alloc(zalloc_flags_t how)
259 {
260 struct kern_pbufpool *pp = zalloc_flags(pp_zone, how | Z_ZERO);
261
262 if (pp) {
263 lck_mtx_init(&pp->pp_lock, &skmem_lock_grp, &skmem_lock_attr);
264 }
265 return pp;
266 }
267
/*
 * Final teardown of a pool, invoked when the last reference is dropped
 * (see pp_release_locked()).  Destroys the pool's resources, then the
 * lock itself, and returns the object to its zone.
 *
 * Called with pp_lock held; the lock is released (and then destroyed)
 * here, so the caller must not touch pp afterwards.
 */
static void
pp_free(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	pp_destroy(pp);
	/* drop the lock before it is destroyed below */
	PP_UNLOCK(pp);

	SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp));
	lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp);
	zfree(pp_zone, pp);
}
280
281 void
pp_retain_locked(struct kern_pbufpool * pp)282 pp_retain_locked(struct kern_pbufpool *pp)
283 {
284 PP_LOCK_ASSERT_HELD(pp);
285
286 pp->pp_refcnt++;
287 ASSERT(pp->pp_refcnt != 0);
288 }
289
/*
 * Take a reference on the pool; unlocked variant of pp_retain_locked().
 * Caller must not hold pp_lock.
 */
void
pp_retain(struct kern_pbufpool *pp)
{
	PP_LOCK(pp);
	pp_retain_locked(pp);
	PP_UNLOCK(pp);
}
297
298 boolean_t
pp_release_locked(struct kern_pbufpool * pp)299 pp_release_locked(struct kern_pbufpool *pp)
300 {
301 uint32_t oldref = pp->pp_refcnt;
302
303 PP_LOCK_ASSERT_HELD(pp);
304
305 ASSERT(pp->pp_refcnt != 0);
306 if (--pp->pp_refcnt == 0) {
307 pp_free(pp);
308 }
309
310 return oldref == 1;
311 }
312
313 boolean_t
pp_release(struct kern_pbufpool * pp)314 pp_release(struct kern_pbufpool *pp)
315 {
316 boolean_t lastref;
317
318 PP_LOCK(pp);
319 if (!(lastref = pp_release_locked(pp))) {
320 PP_UNLOCK(pp);
321 }
322
323 return lastref;
324 }
325
326 void
pp_close(struct kern_pbufpool * pp)327 pp_close(struct kern_pbufpool *pp)
328 {
329 PP_LOCK(pp);
330 ASSERT(pp->pp_refcnt > 0);
331 ASSERT(!(pp->pp_flags & PPF_CLOSED));
332 pp->pp_flags |= PPF_CLOSED;
333 if (!pp_release_locked(pp)) {
334 PP_UNLOCK(pp);
335 }
336 }
337
338 /*
339 * -fbounds-safety: All callers of pp_regions_params_adjust use SKMEM_REGIONS
340 * size for the srp_array. This is same as marking it __counted_by(SKMEM_REGIONS)
341 */
342 void
pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],nexus_meta_type_t md_type,nexus_meta_subtype_t md_subtype,uint32_t md_cnt,uint16_t max_frags,uint32_t buf_size,uint32_t large_buf_size,uint32_t buf_cnt,uint32_t buf_seg_size,uint32_t flags)343 pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS],
344 nexus_meta_type_t md_type, nexus_meta_subtype_t md_subtype, uint32_t md_cnt,
345 uint16_t max_frags, uint32_t buf_size, uint32_t large_buf_size,
346 uint32_t buf_cnt, uint32_t buf_seg_size, uint32_t flags)
347 {
348 struct skmem_region_params *srp, *kmd_srp, *buf_srp, *kbft_srp,
349 *lbuf_srp;
350 uint32_t md_size = 0;
351 bool kernel_only = ((flags & PP_REGION_CONFIG_KERNEL_ONLY) != 0);
352 bool md_persistent = ((flags & PP_REGION_CONFIG_MD_PERSISTENT) != 0);
353 bool buf_persistent = ((flags & PP_REGION_CONFIG_BUF_PERSISTENT) != 0);
354 bool config_buflet = ((flags & PP_REGION_CONFIG_BUFLET) != 0);
355 bool md_magazine_enable = ((flags &
356 PP_REGION_CONFIG_MD_MAGAZINE_ENABLE) != 0);
357
358 ASSERT(max_frags != 0);
359
360 switch (md_type) {
361 case NEXUS_META_TYPE_QUANTUM:
362 md_size = NX_METADATA_QUANTUM_SZ;
363 break;
364 case NEXUS_META_TYPE_PACKET:
365 md_size = NX_METADATA_PACKET_SZ(max_frags);
366 break;
367 default:
368 VERIFY(0);
369 /* NOTREACHED */
370 __builtin_unreachable();
371 }
372
373 switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) {
374 case PP_REGION_CONFIG_BUF_IODIR_IN:
375 kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
376 buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
377 lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
378 kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
379 break;
380 case PP_REGION_CONFIG_BUF_IODIR_OUT:
381 kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
382 buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
383 lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
384 kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
385 break;
386 case PP_REGION_CONFIG_BUF_IODIR_BIDIR:
387 default:
388 kmd_srp = &srp_array[SKMEM_REGION_KMD];
389 buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
390 lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
391 kbft_srp = &srp_array[SKMEM_REGION_KBFT];
392 break;
393 }
394
395 /* add preamble size to metadata obj size */
396 md_size += METADATA_PREAMBLE_SZ;
397 ASSERT(md_size >= NX_METADATA_OBJ_MIN_SZ);
398
399 /* configure kernel metadata region */
400 kmd_srp->srp_md_type = md_type;
401 kmd_srp->srp_md_subtype = md_subtype;
402 kmd_srp->srp_r_obj_cnt = md_cnt;
403 kmd_srp->srp_r_obj_size = md_size;
404 kmd_srp->srp_max_frags = max_frags;
405 ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
406 if (md_persistent) {
407 kmd_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
408 }
409 ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
410 if (md_magazine_enable) {
411 kmd_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
412 }
413 skmem_region_params_config(kmd_srp);
414
415 /* Sanity check for memtag */
416 ASSERT(kmd_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
417
418 /* configure user metadata region */
419 srp = &srp_array[SKMEM_REGION_UMD];
420 if (!kernel_only) {
421 srp->srp_md_type = kmd_srp->srp_md_type;
422 srp->srp_md_subtype = kmd_srp->srp_md_subtype;
423 srp->srp_r_obj_cnt = kmd_srp->srp_c_obj_cnt;
424 srp->srp_r_obj_size = kmd_srp->srp_c_obj_size;
425 srp->srp_max_frags = kmd_srp->srp_max_frags;
426 ASSERT((srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
427 if (md_persistent) {
428 srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
429 }
430 /*
431 * UMD is a mirrored region and object allocation operations
432 * are performed on the KMD objects.
433 */
434 ASSERT((srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
435 skmem_region_params_config(srp);
436 ASSERT(srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
437 } else {
438 ASSERT(srp->srp_r_obj_cnt == 0);
439 ASSERT(srp->srp_r_obj_size == 0);
440 }
441
442 /* configure buffer region */
443 buf_srp->srp_r_obj_cnt = MAX(buf_cnt, kmd_srp->srp_c_obj_cnt);
444 buf_srp->srp_r_obj_size = buf_size;
445 buf_srp->srp_cflags &= ~SKMEM_REGION_CR_MONOLITHIC;
446 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) == 0);
447 if (buf_persistent) {
448 buf_srp->srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
449 }
450 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) != 0);
451 if (buf_srp->srp_r_obj_size >= kern_buf_min_striding_size) {
452 /*
453 * A buffer size larger than 32K indicates striding is in use, which
454 * means a buffer could be detached from a buflet. In this case, magzine
455 * layer should be enabled.
456 */
457 buf_srp->srp_cflags &= ~SKMEM_REGION_CR_NOMAGAZINES;
458 }
459 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_UREADONLY) == 0);
460 if ((flags & PP_REGION_CONFIG_BUF_UREADONLY) != 0) {
461 buf_srp->srp_cflags |= SKMEM_REGION_CR_UREADONLY;
462 }
463 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_KREADONLY) == 0);
464 if ((flags & PP_REGION_CONFIG_BUF_KREADONLY) != 0) {
465 buf_srp->srp_cflags |= SKMEM_REGION_CR_KREADONLY;
466 }
467 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) == 0);
468 if ((flags & PP_REGION_CONFIG_BUF_MONOLITHIC) != 0) {
469 buf_srp->srp_cflags |= SKMEM_REGION_CR_MONOLITHIC;
470 }
471 ASSERT((srp->srp_cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) == 0);
472 if ((flags & PP_REGION_CONFIG_BUF_SEGPHYSCONTIG) != 0) {
473 buf_srp->srp_cflags |= SKMEM_REGION_CR_SEGPHYSCONTIG;
474 }
475 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_NOCACHE) == 0);
476 if ((flags & PP_REGION_CONFIG_BUF_NOCACHE) != 0) {
477 buf_srp->srp_cflags |= SKMEM_REGION_CR_NOCACHE;
478 }
479 ASSERT((buf_srp->srp_cflags & SKMEM_REGION_CR_THREADSAFE) == 0);
480 if ((flags & PP_REGION_CONFIG_BUF_THREADSAFE) != 0) {
481 buf_srp->srp_cflags |= SKMEM_REGION_CR_THREADSAFE;
482 }
483 if (buf_seg_size != 0) {
484 buf_srp->srp_r_seg_size = buf_seg_size;
485 }
486 skmem_region_params_config(buf_srp);
487
488 /* configure large buffer region */
489 if (large_buf_size != 0) {
490 lbuf_srp->srp_r_obj_cnt = buf_srp->srp_r_obj_cnt;
491 lbuf_srp->srp_r_obj_size = large_buf_size;
492 lbuf_srp->srp_r_seg_size = buf_srp->srp_r_seg_size;
493 lbuf_srp->srp_cflags = buf_srp->srp_cflags;
494 skmem_region_params_config(lbuf_srp);
495 }
496
497 /* configure kernel buflet region */
498 if (config_buflet) {
499 ASSERT(md_type == NEXUS_META_TYPE_PACKET);
500 /*
501 * Ideally we want the number of buflets to be
502 * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)",
503 * so that we have enough buflets when multi-buflet and
504 * shared buffer object is used.
505 * Currently multi-buflet is being used only by user pool
506 * which doesn't support shared buffer object, hence to reduce
507 * the number of objects we are restricting the number of
508 * buflets to the number of buffers.
509 */
510 kbft_srp->srp_r_obj_cnt = buf_srp->srp_c_obj_cnt +
511 lbuf_srp->srp_c_obj_cnt;
512 kbft_srp->srp_r_obj_size = MAX(sizeof(struct __kern_buflet_ext),
513 sizeof(struct __user_buflet));
514 kbft_srp->srp_cflags = kmd_srp->srp_cflags;
515 skmem_region_params_config(kbft_srp);
516 ASSERT(kbft_srp->srp_c_obj_cnt >= buf_srp->srp_c_obj_cnt +
517 lbuf_srp->srp_c_obj_cnt);
518 /* Sanity check for memtag */
519 ASSERT(kbft_srp->srp_c_seg_size == SKMEM_MD_SEG_SIZE);
520 } else {
521 ASSERT(kbft_srp->srp_r_obj_cnt == 0);
522 ASSERT(kbft_srp->srp_r_obj_size == 0);
523 }
524
525 /* configure user buflet region */
526 srp = &srp_array[SKMEM_REGION_UBFT];
527 if (config_buflet && !kernel_only) {
528 srp->srp_r_obj_cnt = kbft_srp->srp_c_obj_cnt;
529 srp->srp_r_obj_size = kbft_srp->srp_c_obj_size;
530 srp->srp_cflags = srp_array[SKMEM_REGION_UMD].srp_cflags;
531 skmem_region_params_config(srp);
532 ASSERT(srp->srp_c_obj_cnt == kbft_srp->srp_c_obj_cnt);
533 } else {
534 ASSERT(srp->srp_r_obj_cnt == 0);
535 ASSERT(srp->srp_r_obj_size == 0);
536 }
537
538 /* make sure each metadata can be paired with a buffer */
539 ASSERT(kmd_srp->srp_c_obj_cnt <= buf_srp->srp_c_obj_cnt);
540 }
541
/*
 * Construct a kernel (and, for non-kernel-only pools, mirrored user)
 * metadata object and attach `bufcnt' buffers to it.
 *
 * For packet metadata, the __packet_opt/__flow/__packet_compl attachments
 * are freshly allocated when `raw' is true; otherwise the previously
 * attached ones are reused (the object came back from the cache already
 * constructed).  Buffers either come from a direct buffer allocation
 * (pools without buffer-on-demand, single native buflet) or are taken
 * from the caller-supplied pre-constructed buflet list `*blist', which
 * is advanced as buflets are consumed.
 *
 * Returns 0 on success; ENOMEM after destructing any partial state.
 */
SK_NO_INLINE_ATTRIBUTE
static int
pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum,
    obj_idx_t midx, struct kern_pbufpool *pp, uint32_t skmflag, uint16_t bufcnt,
    bool raw, struct skmem_obj **blist)
{
	struct __kern_buflet *kbuf;
	mach_vm_address_t baddr = 0;
	uint16_t *pbufs_cnt, *pbufs_max;
	uint16_t i;

	/* multiple buflets require buffer-on-demand support */
	ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp));

	/* construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum);
		struct __packet_opt *__single opt;
		struct __flow *__single flow;
		struct __packet_compl *__single compl;
		uint64_t pflags;

		if (raw) {
			/* fresh object: allocate the packet attachments */
			opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP);
			flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP);
			compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP);
			pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC |
			    PKT_F_TX_COMPL_ALLOC);
		} else {
			/* reconstructed object: reuse existing attachments */
			ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
			    kpkt->pkt_com_opt != NULL);
			opt = kpkt->pkt_com_opt;
			ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
			    kpkt->pkt_flow != NULL);
			flow = kpkt->pkt_flow;
			ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
			    kpkt->pkt_tx_compl != NULL);
			compl = kpkt->pkt_tx_compl;
			pflags = kpkt->pkt_pflags;
		}
		/* will be adjusted below as part of allocating buffer(s) */
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t));
		pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt);
		pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max);

		/* kernel (and user) packet */
		KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx,
		    upkt, pp, 0, pp->pp_max_frags, 0);
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		VERIFY(bufcnt == 1);
		/* TODO: point these to quantum's once they're defined */
		pbufs_cnt = pbufs_max = NULL;
		/* kernel quantum */
		KQUM_CTOR(kqum, midx, uqum, pp, 0);
		break;
	}

	/* attach bufcnt buffers, chaining buflets off the native one */
	kbuf = kqum->qum_buf;
	for (i = 0; i < bufcnt; i++) {
		struct skmem_obj_info oib;

		if (!PP_HAS_BUFFER_ON_DEMAND(pp)) {
			ASSERT(i == 0);
			ASSERT(*blist == NULL);
			/*
			 * quantum has a native buflet, so we only need a
			 * buffer to be allocated and attached to the buflet.
			 */
			baddr = pp_alloc_buffer_common(pp, &oib, skmflag,
			    false);
			if (__improbable(baddr == 0)) {
				goto fail;
			}
			KBUF_CTOR(kbuf, baddr, SKMEM_OBJ_IDX_REG(&oib),
			    SKMEM_OBJ_BUFCTL(&oib), pp, false);
			baddr = 0;
		} else {
			/*
			 * we use pre-constructed buflets with attached buffers.
			 */
			struct __kern_buflet *pkbuf = kbuf;
			struct skmem_obj *blistn;

			ASSERT(pkbuf != NULL);
			kbuf = (kern_buflet_t)*blist;
			if (__improbable(kbuf == NULL)) {
				SK_DF(SK_VERB_MEM, "failed to get buflet,"
				    " pp 0x%llx", SK_KVA(pp));
				goto fail;
			}


			/* unlink the head buflet from the supplied list */
			blistn = (*blist)->mo_next;
			(*blist)->mo_next = NULL;

			KBUF_EXT_INIT(kbuf, pp);
			/* chain it after the previous buflet */
			KBUF_LINK(pkbuf, kbuf);
			*blist = blistn;
		}

		/* adjust buffer count accordingly */
		if (__probable(pbufs_cnt != NULL)) {
			*pbufs_cnt += 1;
			ASSERT(*pbufs_cnt <= *pbufs_max);
		}
	}

	ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY));
	ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
	SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx",
	    SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr));
	return 0;

fail:
	/* undo whatever was constructed above before bailing out */
	ASSERT(bufcnt != 0 && baddr == 0);
	pp_metadata_destruct(kqum, pp, raw);
	return ENOMEM;
}
665
/*
 * Common skmem constructor for metadata objects.  Locates the kernel
 * (and mirrored user) quantum within the raw region objects described
 * by oi0/oim0, optionally batch-allocates pre-constructed buflets, and
 * hands everything to pp_metadata_construct().
 *
 * `no_buflet' selects a construction with zero buffers attached.
 * Returns 0 on success or ENOMEM.
 */
static int
pp_metadata_ctor_common(struct skmem_obj_info *oi0,
    struct skmem_obj_info *oim0, struct kern_pbufpool *pp, uint32_t skmflag,
    bool no_buflet)
{
	struct skmem_obj_info _oi, _oim;
	struct skmem_obj_info *oi, *oim;
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;
	uint16_t bufcnt = (no_buflet ? 0 : pp->pp_max_frags);
	struct skmem_obj *__single blist = NULL;
	int error;

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		return ENOMEM;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * Note that oi0 and oim0 may be stored inside the object itself;
	 * if so, copy them to local variables before constructing.  We
	 * don't use PPF_BATCH to test as the allocator may be allocating
	 * storage space differently depending on the number of objects.
	 */
	if (__probable((uintptr_t)oi0 >= (uintptr_t)SKMEM_OBJ_ADDR(oi0) &&
	    ((uintptr_t)oi0 + sizeof(*oi0)) <=
	    ((uintptr_t)SKMEM_OBJ_ADDR(oi0) + SKMEM_OBJ_SIZE(oi0)))) {
		oi = &_oi;
		*oi = *oi0;
		if (__probable(oim0 != NULL)) {
			oim = &_oim;
			*oim = *oim0;
		} else {
			oim = NULL;
		}
	} else {
		oi = oi0;
		oim = oim0;
	}

	/* kernel quantum lives right after the metadata preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)SKMEM_OBJ_ADDR(oi) +
	    METADATA_PREAMBLE_SZ);

	if (__probable(!PP_KERNEL_ONLY(pp))) {
		/* mirrored user object must exist and match in size */
		ASSERT(oim != NULL && SKMEM_OBJ_ADDR(oim) != NULL);
		ASSERT(SKMEM_OBJ_SIZE(oi) == SKMEM_OBJ_SIZE(oim));
		uqum = SK_PTR_ADDR_UQUM((uintptr_t)SKMEM_OBJ_ADDR(oim) +
		    METADATA_PREAMBLE_SZ);
	} else {
		ASSERT(oim == NULL);
		uqum = NULL;
	}

	if (oim != NULL) {
		/* initialize user metadata redzone */
		struct __metadata_preamble *mdp = SKMEM_OBJ_ADDR(oim);
		mdp->mdp_redzone =
		    (SKMEM_OBJ_ROFF(oim) + METADATA_PREAMBLE_SZ) ^
		    __ch_umd_redzone_cookie;
	}

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, bufcnt, skmflag);
	}

	error = pp_metadata_construct(kqum, uqum, SKMEM_OBJ_IDX_REG(oi), pp,
	    skmflag, bufcnt, TRUE, &blist);
	/* return any buflets left over by a partial construction */
	if (__improbable(blist != NULL)) {
		skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist);
		blist = NULL;
	}
	return error;
}
749
750 static int
pp_metadata_ctor_no_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)751 pp_metadata_ctor_no_buflet(struct skmem_obj_info *oi0,
752 struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
753 {
754 return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
755 }
756
757 static int
pp_metadata_ctor_max_buflet(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)758 pp_metadata_ctor_max_buflet(struct skmem_obj_info *oi0,
759 struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
760 {
761 return pp_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
762 }
763
/*
 * Common destructor for metadata objects: detach every buflet from the
 * quantum/packet and thread the detached buflets onto the caller-supplied
 * lists, segregated four ways — {default, large} x {cacheable, nocache}.
 * Buflets whose underlying buffer is still shared (bc_usecnt > 1) go on
 * the nocache lists so they bypass the magazine layer (see comment below).
 *
 * When `raw' is true the object is being returned to the slab, so the
 * packet attachments (opt/flow/compl) are freed as well.
 *
 * Each pp_blist_* pointer tracks the tail `mo_next' slot of its local
 * list so appends are O(1); the local lists are prepended to the caller's
 * lists at the end.
 */
__attribute__((always_inline))
static void
pp_metadata_destruct_common(struct __kern_quantum *kqum,
    struct kern_pbufpool *pp, bool raw, struct skmem_obj **blist_def,
    struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
    struct skmem_obj **blist_nocache_large)
{
	struct __kern_buflet *kbuf, *nbuf;
	struct skmem_obj *__single p_blist_def = NULL, *__single p_blist_large = NULL;
	struct skmem_obj *__single p_blist_nocache_def = NULL, *__single p_blist_nocache_large = NULL;
	struct skmem_obj **pp_blist_def = &p_blist_def;
	struct skmem_obj **pp_blist_large = &p_blist_large;
	struct skmem_obj **pp_blist_nocache_def = &p_blist_nocache_def;
	struct skmem_obj **pp_blist_nocache_large = &p_blist_nocache_large;
	uint16_t bufcnt, i = 0;
	bool first_buflet_empty;

	ASSERT(blist_def != NULL);
	ASSERT(blist_large != NULL);

	/* validate the object and determine how many buflets to detach */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

		ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kpkt->pkt_qum.qum_pp == pp);
		ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE);
		ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
		ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max);
		ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags);
		_CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t));
		bufcnt = kpkt->pkt_bufs_cnt;
		kbuf = &kqum->qum_buf[0];
		/*
		 * special handling for empty first buflet.
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0;
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp));
		ASSERT(kqum->qum_pp == pp);
		ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type);
		ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype);
		ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE);
		ASSERT(kqum->qum_ksd == NULL);
		kbuf = &kqum->qum_buf[0];
		/*
		 * XXX: Special handling for quantum as we don't currently
		 * define bufs_{cnt,max} there.  Given that we support at
		 * most only 1 buflet for now, check if buf_addr is non-NULL.
		 * See related code in pp_metadata_construct().
		 */
		first_buflet_empty = (kbuf->buf_addr == 0);
		bufcnt = first_buflet_empty ? 0 : 1;
		break;
	}

	/*
	 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is
	 * unsafe, so we forge it here.
	 */
	nbuf = __unsafe_forge_single(struct __kern_buflet *,
	    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
	BUF_NBFT_ADDR(kbuf, 0);
	BUF_NBFT_IDX(kbuf, OBJ_IDX_NONE);
	/* the native (first) buflet is freed directly, not list-threaded */
	if (!first_buflet_empty) {
		pp_free_buflet_common(pp, kbuf);
		++i;
	}

	/* walk the chained (external) buflets and segregate them */
	while (nbuf != NULL) {
		ASSERT(nbuf->buf_ctl != NULL);
		if (BUFLET_HAS_LARGE_BUF(nbuf)) {
			/*
			 * bc_usecnt larger than 1 means the buffer has been
			 * cloned and is still being used by other bflts. In
			 * this case, when we free this bflt we need to
			 * explicitly ask for it to not be cached again into
			 * the magazine layer to prevent immediate reuse of
			 * the buffer and data corruption.
			 */
			if (nbuf->buf_ctl->bc_usecnt > 1) {
				*pp_blist_nocache_large = (struct skmem_obj *)(void *)nbuf;
				pp_blist_nocache_large =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			} else {
				*pp_blist_large = (struct skmem_obj *)(void *)nbuf;
				pp_blist_large =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			}
		} else {
			if (nbuf->buf_ctl->bc_usecnt > 1) {
				*pp_blist_nocache_def = (struct skmem_obj *)(void *)nbuf;
				pp_blist_nocache_def =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			} else {
				*pp_blist_def = (struct skmem_obj *)(void *)nbuf;
				pp_blist_def =
				    &((struct skmem_obj *)(void *)nbuf)->mo_next;
			}
		}
		BUF_NBFT_IDX(nbuf, OBJ_IDX_NONE);
		nbuf = __unsafe_forge_single(struct __kern_buflet *,
		    __DECONST(struct __kern_buflet *, nbuf->buf_nbft_addr));
		++i;
	}

	/* every attached buflet must have been accounted for */
	ASSERT(i == bufcnt);

	/* prepend each non-empty local list to the caller's list */
	if (p_blist_def != NULL) {
		*pp_blist_def = *blist_def;
		*blist_def = p_blist_def;
	}
	if (p_blist_large != NULL) {
		*pp_blist_large = *blist_large;
		*blist_large = p_blist_large;
	}
	if (p_blist_nocache_def != NULL) {
		*pp_blist_nocache_def = *blist_nocache_def;
		*blist_nocache_def = p_blist_nocache_def;
	}
	if (p_blist_nocache_large != NULL) {
		*pp_blist_nocache_large = *blist_nocache_large;
		*blist_nocache_large = p_blist_nocache_large;
	}

	/* if we're about to return this object to the slab, clean it up */
	if (raw) {
		switch (pp->pp_md_type) {
		case NEXUS_META_TYPE_PACKET: {
			struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);

			ASSERT(kpkt->pkt_com_opt != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC));
			if (kpkt->pkt_com_opt != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC);
				skmem_cache_free(pp_opt_cache,
				    kpkt->pkt_com_opt);
				kpkt->pkt_com_opt = NULL;
			}
			ASSERT(kpkt->pkt_flow != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC));
			if (kpkt->pkt_flow != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC);
				skmem_cache_free(pp_flow_cache, kpkt->pkt_flow);
				kpkt->pkt_flow = NULL;
			}
			ASSERT(kpkt->pkt_tx_compl != NULL ||
			    !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC));
			if (kpkt->pkt_tx_compl != NULL) {
				ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC);
				skmem_cache_free(pp_compl_cache,
				    kpkt->pkt_tx_compl);
				kpkt->pkt_tx_compl = NULL;
			}
			kpkt->pkt_pflags = 0;
			break;
		}
		default:
			ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM);
			/* nothing to do for quantum (yet) */
			break;
		}
	}
}
933
934 __attribute__((always_inline))
935 static void
pp_free_kbft_list(struct kern_pbufpool * pp,struct skmem_obj * blist_def,struct skmem_obj * blist_nocache_def,struct skmem_obj * blist_large,struct skmem_obj * blist_nocache_large)936 pp_free_kbft_list(struct kern_pbufpool *pp, struct skmem_obj *blist_def, struct skmem_obj *blist_nocache_def,
937 struct skmem_obj *blist_large, struct skmem_obj *blist_nocache_large)
938 {
939 if (blist_def != NULL) {
940 skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp), blist_def);
941 }
942 if (blist_large != NULL) {
943 skmem_cache_batch_free(PP_KBFT_CACHE_LARGE(pp), blist_large);
944 }
945 if (blist_nocache_def != NULL) {
946 skmem_cache_batch_free_nocache(PP_KBFT_CACHE_DEF(pp), blist_nocache_def);
947 }
948 if (blist_nocache_large != NULL) {
949 skmem_cache_batch_free_nocache(PP_KBFT_CACHE_LARGE(pp), blist_nocache_large);
950 }
951 }
952
953 __attribute__((always_inline))
954 static void
pp_metadata_destruct(struct __kern_quantum * kqum,struct kern_pbufpool * pp,bool raw)955 pp_metadata_destruct(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
956 bool raw)
957 {
958 struct skmem_obj *__single blist_def = NULL, *__single blist_large = NULL;
959 struct skmem_obj *__single blist_nocache_def = NULL, *__single blist_nocache_large = NULL;
960
961 pp_metadata_destruct_common(kqum, pp, raw, &blist_def, &blist_nocache_def,
962 &blist_large, &blist_nocache_large);
963 pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
964 }
965
966 static void
pp_metadata_dtor(void * addr,void * arg)967 pp_metadata_dtor(void *addr, void *arg)
968 {
969 pp_metadata_destruct(SK_PTR_ADDR_KQUM((uintptr_t)addr +
970 METADATA_PREAMBLE_SZ), arg, TRUE);
971 }
972
973 static void
pp_buf_seg_ctor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)974 pp_buf_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
975 {
976 struct kern_pbufpool *__single pp = arg;
977
978 if (pp->pp_pbuf_seg_ctor != NULL) {
979 pp->pp_pbuf_seg_ctor(pp, sg, md);
980 }
981 }
982
983 static void
pp_buf_seg_dtor(struct sksegment * sg,IOSKMemoryBufferRef md,void * arg)984 pp_buf_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
985 {
986 struct kern_pbufpool *__single pp = arg;
987
988 if (pp->pp_pbuf_seg_dtor != NULL) {
989 pp->pp_pbuf_seg_dtor(pp, sg, md);
990 }
991 }
992
993 static int
pp_buflet_metadata_ctor_common(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag,bool large)994 pp_buflet_metadata_ctor_common(struct skmem_obj_info *oi0,
995 struct skmem_obj_info *oim0, void *arg, uint32_t skmflag, bool large)
996 {
997 #pragma unused (skmflag)
998 struct kern_pbufpool *pp = (struct kern_pbufpool *)arg;
999 struct __kern_buflet *kbft;
1000 struct __user_buflet *ubft;
1001 struct skmem_obj_info oib;
1002 mach_vm_address_t baddr;
1003 obj_idx_t oi_idx_reg;
1004
1005 baddr = pp_alloc_buffer_common(pp, &oib, skmflag, large);
1006 if (__improbable(baddr == 0)) {
1007 return ENOMEM;
1008 }
1009 /*
1010 * Note that oi0 and oim0 may be stored inside the object itself;
1011 * so copy what is required to local variables before constructing.
1012 */
1013 oi_idx_reg = SKMEM_OBJ_IDX_REG(oi0);
1014 kbft = SKMEM_OBJ_ADDR(oi0);
1015
1016 if (__probable(!PP_KERNEL_ONLY(pp))) {
1017 ASSERT(oim0 != NULL && SKMEM_OBJ_ADDR(oim0) != NULL);
1018 ASSERT(SKMEM_OBJ_SIZE(oi0) == SKMEM_OBJ_SIZE(oim0));
1019 ASSERT(oi_idx_reg == SKMEM_OBJ_IDX_REG(oim0));
1020 ASSERT(SKMEM_OBJ_IDX_SEG(oi0) == SKMEM_OBJ_IDX_SEG(oim0));
1021 ubft = SKMEM_OBJ_ADDR(oim0);
1022 } else {
1023 ASSERT(oim0 == NULL);
1024 ubft = NULL;
1025 }
1026 KBUF_EXT_CTOR(kbft, ubft, baddr, SKMEM_OBJ_IDX_REG(&oib),
1027 SKMEM_OBJ_BUFCTL(&oib), oi_idx_reg, pp, large);
1028 return 0;
1029 }
1030
1031 static int
pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1032 pp_buflet_default_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1033 struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1034 {
1035 return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, false);
1036 }
1037
1038 static int
pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info * oi0,struct skmem_obj_info * oim0,void * arg,uint32_t skmflag)1039 pp_buflet_large_buffer_metadata_ctor(struct skmem_obj_info *oi0,
1040 struct skmem_obj_info *oim0, void *arg, uint32_t skmflag)
1041 {
1042 return pp_buflet_metadata_ctor_common(oi0, oim0, arg, skmflag, true);
1043 }
1044
/*
 * skmem destructor hook for on-demand buflet metadata.  Detaches the
 * backing buffer from the buflet and, if this was the last reference to
 * the buffer (usecnt drops to 0), frees the buffer back to its cache.
 */
static void
pp_buflet_metadata_dtor(void *addr, void *arg)
{
	struct __kern_buflet *__single kbft = addr;
	void *objaddr = kbft->buf_objaddr;
	struct kern_pbufpool *__single pp = arg;
	uint32_t usecnt = 0;
	/* capture before KBUF_DTOR tears the buflet down */
	bool large = BUFLET_HAS_LARGE_BUF(kbft);

	ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
	/*
	 * don't assert for (buf_nbft_addr == 0) here as constructed
	 * buflet may have this field as non-zero. This is because
	 * buf_nbft_addr (__buflet_next) is used by skmem batch alloc
	 * for chaining the buflets.
	 * To ensure that the freed buflet was not part of a chain we
	 * assert for (buf_nbft_idx == OBJ_IDX_NONE).
	 */
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(((struct __kern_buflet_ext *)kbft)->kbe_buf_upp_link.sle_next ==
	    NULL);
	ASSERT(kbft->buf_addr != 0);
	ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
	ASSERT(kbft->buf_ctl != NULL);

	/* detach buffer; usecnt receives the remaining reference count */
	KBUF_DTOR(kbft, usecnt);
	SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp),
	    SK_KVA(objaddr), usecnt);
	if (__probable(usecnt == 0)) {
		/* last reference: return the buffer to its owning cache */
		skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) :
		    PP_BUF_CACHE_DEF(pp), objaddr);
	}
}
1078
1079 /*
1080 * -fbounds-safety: all callers of pp_create use srp_array with a known size:
1081 * SKMEM_REGIONS. This is same as marking it __counted_by(SKMEM_REGIONS)
1082 */
/*
 * Create a packet buffer pool from the given region parameter array.
 * Sets up the metadata/buffer/buflet regions and their skmem caches,
 * mirroring kernel regions into user regions unless the pool is
 * kernel-only.  Returns a retained pool on success, NULL on failure
 * (all partially created state is torn down via pp_close()).
 */
struct kern_pbufpool *
pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS],
    pbuf_seg_ctor_fn_t buf_seg_ctor, pbuf_seg_dtor_fn_t buf_seg_dtor,
    const void *ctx, pbuf_ctx_retain_fn_t ctx_retain,
    pbuf_ctx_release_fn_t ctx_release, uint32_t ppcreatef)
{
	struct kern_pbufpool *pp = NULL;
	uint32_t md_size, def_buf_obj_size;
	uint32_t def_buf_size, large_buf_size;
	nexus_meta_type_t md_type;
	nexus_meta_subtype_t md_subtype;
	uint32_t md_cflags;
	uint16_t max_frags;
	uint32_t buf_def_cflags;
	char cname[64];
	const char *__null_terminated cache_name = NULL;
	struct skmem_region_params *kmd_srp;
	struct skmem_region_params *buf_srp;
	struct skmem_region_params *kbft_srp;
	struct skmem_region_params *umd_srp = NULL;
	struct skmem_region_params *ubft_srp = NULL;
	struct skmem_region_params *lbuf_srp = NULL;

	/* buf_seg_{ctor,dtor} pair must be either NULL or non-NULL */
	ASSERT(!(!(buf_seg_ctor == NULL && buf_seg_dtor == NULL) &&
	    ((buf_seg_ctor == NULL) ^ (buf_seg_dtor == NULL))));

	/* ctx{,_retain,_release} must be either ALL NULL or ALL non-NULL */
	ASSERT((ctx == NULL && ctx_retain == NULL && ctx_release == NULL) ||
	    (ctx != NULL && ctx_retain != NULL && ctx_release != NULL));

	/*
	 * Pick the region parameter set: the combined (KMD) set when
	 * populated, else the RX-only set, else the TX-only set (which
	 * must then be populated).
	 */
	if (srp_array[SKMEM_REGION_KMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_KMD];
		buf_srp = &srp_array[SKMEM_REGION_BUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_BUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_KBFT];
	} else if (srp_array[SKMEM_REGION_RXKMD].srp_c_obj_cnt != 0) {
		kmd_srp = &srp_array[SKMEM_REGION_RXKMD];
		buf_srp = &srp_array[SKMEM_REGION_RXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_RXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_RXKBFT];
	} else {
		VERIFY(srp_array[SKMEM_REGION_TXKMD].srp_c_obj_cnt != 0);
		kmd_srp = &srp_array[SKMEM_REGION_TXKMD];
		buf_srp = &srp_array[SKMEM_REGION_TXBUF_DEF];
		lbuf_srp = &srp_array[SKMEM_REGION_TXBUF_LARGE];
		kbft_srp = &srp_array[SKMEM_REGION_TXKBFT];
	}

	/* metadata and default-buffer regions must be populated */
	VERIFY(kmd_srp->srp_c_obj_size != 0);
	VERIFY(buf_srp->srp_c_obj_cnt != 0);
	VERIFY(buf_srp->srp_c_obj_size != 0);

	/* buflet region is only meaningful for on-demand buffer pools */
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		VERIFY(kbft_srp->srp_c_obj_cnt != 0);
		VERIFY(kbft_srp->srp_c_obj_size != 0);
	} else {
		kbft_srp = NULL;
	}

	/*
	 * User-visible pools carry user metadata/buflet regions that must
	 * exactly mirror their kernel counterparts.
	 */
	if ((ppcreatef & PPCREATEF_KERNEL_ONLY) == 0) {
		umd_srp = &srp_array[SKMEM_REGION_UMD];
		ASSERT(umd_srp->srp_c_obj_size == kmd_srp->srp_c_obj_size);
		ASSERT(umd_srp->srp_c_obj_cnt == kmd_srp->srp_c_obj_cnt);
		ASSERT(umd_srp->srp_c_seg_size == kmd_srp->srp_c_seg_size);
		ASSERT(umd_srp->srp_seg_cnt == kmd_srp->srp_seg_cnt);
		ASSERT(umd_srp->srp_md_type == kmd_srp->srp_md_type);
		ASSERT(umd_srp->srp_md_subtype == kmd_srp->srp_md_subtype);
		ASSERT(umd_srp->srp_max_frags == kmd_srp->srp_max_frags);
		ASSERT((umd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
		if (kbft_srp != NULL) {
			ubft_srp = &srp_array[SKMEM_REGION_UBFT];
			ASSERT(ubft_srp->srp_c_obj_size ==
			    kbft_srp->srp_c_obj_size);
			ASSERT(ubft_srp->srp_c_obj_cnt ==
			    kbft_srp->srp_c_obj_cnt);
			ASSERT(ubft_srp->srp_c_seg_size ==
			    kbft_srp->srp_c_seg_size);
			ASSERT(ubft_srp->srp_seg_cnt == kbft_srp->srp_seg_cnt);
		}
	}

	/* snapshot the region parameters we need for the pool itself */
	md_size = kmd_srp->srp_r_obj_size;
	md_type = kmd_srp->srp_md_type;
	md_subtype = kmd_srp->srp_md_subtype;
	max_frags = kmd_srp->srp_max_frags;
	def_buf_obj_size = buf_srp->srp_c_obj_size;
	def_buf_size = def_buf_obj_size;
	large_buf_size = lbuf_srp->srp_c_obj_size;

#if (DEBUG || DEVELOPMENT)
	ASSERT(def_buf_obj_size != 0);
	ASSERT(md_type > NEXUS_META_TYPE_INVALID &&
	    md_type <= NEXUS_META_TYPE_MAX);
	if (md_type == NEXUS_META_TYPE_QUANTUM) {
		/* a quantum carries exactly one buflet */
		ASSERT(max_frags == 1);
		ASSERT(md_size >=
		    (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ));
	} else {
		ASSERT(max_frags >= 1);
		ASSERT(md_type == NEXUS_META_TYPE_PACKET);
		ASSERT(md_size >= (METADATA_PREAMBLE_SZ +
		    NX_METADATA_PACKET_SZ(max_frags)));
	}
	ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID &&
	    md_subtype <= NEXUS_META_SUBTYPE_MAX);
#endif /* DEBUG || DEVELOPMENT */

	pp = pp_alloc(Z_WAITOK);

	(void) snprintf((char *)pp->pp_name, sizeof(pp->pp_name),
	    "skywalk.pp.%s", name);

	/* take a context reference on behalf of the pool, if provided */
	pp->pp_ctx = __DECONST(void *, ctx);
	pp->pp_ctx_retain = ctx_retain;
	pp->pp_ctx_release = ctx_release;
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_retain(pp->pp_ctx);
	}

	pp->pp_pbuf_seg_ctor = buf_seg_ctor;
	pp->pp_pbuf_seg_dtor = buf_seg_dtor;
	PP_BUF_SIZE_DEF(pp) = def_buf_size;
	PP_BUF_OBJ_SIZE_DEF(pp) = def_buf_obj_size;
	PP_BUF_SIZE_LARGE(pp) = large_buf_size;
	PP_BUF_OBJ_SIZE_LARGE(pp) = lbuf_srp->srp_c_obj_size;
	pp->pp_md_type = md_type;
	pp->pp_md_subtype = md_subtype;
	pp->pp_max_frags = max_frags;
	/* translate PPCREATEF_* creation flags into PPF_* pool flags */
	if (ppcreatef & PPCREATEF_EXTERNAL) {
		pp->pp_flags |= PPF_EXTERNAL;
	}
	if (ppcreatef & PPCREATEF_TRUNCATED_BUF) {
		pp->pp_flags |= PPF_TRUNCATED_BUF;
	}
	if (ppcreatef & PPCREATEF_KERNEL_ONLY) {
		pp->pp_flags |= PPF_KERNEL;
	}
	if (ppcreatef & PPCREATEF_ONDEMAND_BUF) {
		pp->pp_flags |= PPF_BUFFER_ON_DEMAND;
	}
	if (ppcreatef & PPCREATEF_DYNAMIC) {
		pp->pp_flags |= PPF_DYNAMIC;
	}
	if (lbuf_srp->srp_c_obj_cnt > 0) {
		ASSERT(lbuf_srp->srp_c_obj_size != 0);
		pp->pp_flags |= PPF_LARGE_BUF;
	}

	pp_retain(pp);

	/* derive metadata cache creation flags from the region flags */
	md_cflags = ((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ?
	    SKMEM_CR_NOMAGAZINES : 0);
	md_cflags |= SKMEM_CR_BATCH;
	pp->pp_flags |= PPF_BATCH;

	if (pp->pp_flags & PPF_DYNAMIC) {
		md_cflags |= SKMEM_CR_DYNAMIC;
	}

	/* create the regions: UMD (user md), KMD (kernel md), ... */
	if (umd_srp != NULL && (pp->pp_umd_region =
	    skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), umd_srp->srp_name);
		goto failed;
	}

	if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL,
	    NULL)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), kmd_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		VERIFY((kbft_srp != NULL) && (kbft_srp->srp_c_obj_cnt > 0));
		if (!PP_KERNEL_ONLY(pp)) {
			VERIFY((ubft_srp != NULL) &&
			    (ubft_srp->srp_c_obj_cnt > 0));
		}
	}
	/*
	 * Metadata regions {KMD,KBFT,UBFT} magazines layer and persistency
	 * attribute must match.
	 */
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES));
		ASSERT((kmd_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT) ==
		    (kbft_srp->srp_cflags & SKMEM_REGION_CR_PERSISTENT));
	}

	/* ... UBFT (user buflet) and KBFT (kernel buflet), if on-demand */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) {
		if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp,
		    NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), ubft_srp->srp_name);
			goto failed;
		}
	}

	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		if ((pp->pp_kbft_region = skmem_region_create(name,
		    kbft_srp, NULL, NULL, NULL)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), kbft_srp->srp_name);
			goto failed;
		}
	}

	/* pair up kernel regions with their user-visible mirrors */
	if (!PP_KERNEL_ONLY(pp)) {
		skmem_region_mirror(pp->pp_kmd_region, pp->pp_umd_region);
	}
	if (!PP_KERNEL_ONLY(pp) && pp->pp_ubft_region != NULL) {
		ASSERT(pp->pp_kbft_region != NULL);
		skmem_region_mirror(pp->pp_kbft_region, pp->pp_ubft_region);
	}

	/*
	 * Create the metadata cache; magazines layer is determined by caller.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "kmd.%s", name);
	if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
		/* buflets are attached on demand, not at construction */
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_no_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	} else {
		pp->pp_kmd_cache = skmem_cache_create(cache_name, md_size, 0,
		    pp_metadata_ctor_max_buflet, pp_metadata_dtor, NULL, pp,
		    pp->pp_kmd_region, md_cflags);
	}

	if (pp->pp_kmd_cache == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	/*
	 * Create the buflet metadata cache
	 */
	if (pp->pp_kbft_region != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "kbft_def.%s", name);
		PP_KBFT_CACHE_DEF(pp) = skmem_cache_create(cache_name,
		    kbft_srp->srp_c_obj_size, 0,
		    pp_buflet_default_buffer_metadata_ctor,
		    pp_buflet_metadata_dtor, NULL, pp, pp->pp_kbft_region,
		    md_cflags);

		if (PP_KBFT_CACHE_DEF(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}

		if (PP_HAS_LARGE_BUF(pp)) {
			/* Aggressive memory reclaim flag set to kbft_large for now */
			md_cflags |= SKMEM_CR_RECLAIM;
			cache_name = tsnprintf(cname, sizeof(cname),
			    "kbft_large.%s", name);
			PP_KBFT_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
			    kbft_srp->srp_c_obj_size, 0,
			    pp_buflet_large_buffer_metadata_ctor,
			    pp_buflet_metadata_dtor,
			    NULL, pp, pp->pp_kbft_region, md_cflags);

			if (PP_KBFT_CACHE_LARGE(pp) == NULL) {
				SK_ERR("\"%s\" (0x%llx) failed to "
				    "create \"%s\" cache", pp->pp_name,
				    SK_KVA(pp), cname);
				goto failed;
			}
		}
	}

	/* buffer regions, with the client's segment ctor/dtor hooks */
	if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name,
	    buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create %s region",
		    pp->pp_name, SK_KVA(pp), buf_srp->srp_name);
		goto failed;
	}

	if (PP_HAS_LARGE_BUF(pp)) {
		PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp,
		    pp_buf_seg_ctor, pp_buf_seg_dtor, pp);
		if (PP_BUF_REGION_LARGE(pp) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create %s region",
			    pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name);
			goto failed;
		}
	}

	/*
	 * Create the buffer object cache without the magazines layer.
	 * We rely on caching the constructed metadata object instead.
	 */
	cache_name = tsnprintf(cname, sizeof(cname), "buf_def.%s", name);
	buf_def_cflags = buf_srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES ? SKMEM_CR_NOMAGAZINES : 0;
	if ((PP_BUF_CACHE_DEF(pp) = skmem_cache_create(cache_name,
	    def_buf_obj_size,
	    0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp),
	    buf_def_cflags)) == NULL) {
		SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
		    pp->pp_name, SK_KVA(pp), cname);
		goto failed;
	}

	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		cache_name = tsnprintf(cname, sizeof(cname), "buf_large.%s", name);
		if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name,
		    lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp,
		    PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) {
			SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache",
			    pp->pp_name, SK_KVA(pp), cname);
			goto failed;
		}
	}

	return pp;

failed:
	/* drop the context reference before tearing the pool down */
	if (pp != NULL) {
		if (pp->pp_ctx != NULL) {
			pp->pp_ctx_release(pp->pp_ctx);
			pp->pp_ctx = NULL;
		}
		pp_close(pp);
	}

	return NULL;
}
1415
/*
 * Tear down a packet buffer pool: purge user-mapped state, then destroy
 * caches and release regions.  Caller must hold the pool lock.  The
 * destruction order below is significant (see comment before the buffer
 * caches).
 */
void
pp_destroy(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);

	/* may be called for built-in pp with outstanding reference */
	ASSERT(!(pp->pp_flags & PPF_EXTERNAL) || pp->pp_refcnt == 0);

	/* purge and free the user packet / buflet hash tables first */
	pp_destroy_upp_locked(pp);

	pp_destroy_upp_bft_locked(pp);

	if (pp->pp_kmd_cache != NULL) {
		skmem_cache_destroy(pp->pp_kmd_cache);
		pp->pp_kmd_cache = NULL;
	}

	if (pp->pp_umd_region != NULL) {
		skmem_region_release(pp->pp_umd_region);
		pp->pp_umd_region = NULL;
	}

	if (pp->pp_kmd_region != NULL) {
		skmem_region_release(pp->pp_kmd_region);
		pp->pp_kmd_region = NULL;
	}

	if (PP_KBFT_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_DEF(pp));
		PP_KBFT_CACHE_DEF(pp) = NULL;
	}

	if (PP_KBFT_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_KBFT_CACHE_LARGE(pp));
		PP_KBFT_CACHE_LARGE(pp) = NULL;
	}

	if (pp->pp_ubft_region != NULL) {
		skmem_region_release(pp->pp_ubft_region);
		pp->pp_ubft_region = NULL;
	}

	if (pp->pp_kbft_region != NULL) {
		skmem_region_release(pp->pp_kbft_region);
		pp->pp_kbft_region = NULL;
	}

	/*
	 * The order is important here, since pp_metadata_dtor()
	 * called by freeing on the pp_kmd_cache will in turn
	 * free the attached buffer. Therefore destroy the
	 * buffer cache last.
	 */
	if (PP_BUF_CACHE_DEF(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_DEF(pp));
		PP_BUF_CACHE_DEF(pp) = NULL;
	}
	if (PP_BUF_REGION_DEF(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_DEF(pp));
		PP_BUF_REGION_DEF(pp) = NULL;
	}
	if (PP_BUF_CACHE_LARGE(pp) != NULL) {
		skmem_cache_destroy(PP_BUF_CACHE_LARGE(pp));
		PP_BUF_CACHE_LARGE(pp) = NULL;
	}
	if (PP_BUF_REGION_LARGE(pp) != NULL) {
		skmem_region_release(PP_BUF_REGION_LARGE(pp));
		PP_BUF_REGION_LARGE(pp) = NULL;
	}

	/* finally, drop the pool's context reference */
	if (pp->pp_ctx != NULL) {
		pp->pp_ctx_release(pp->pp_ctx);
		pp->pp_ctx = NULL;
	}
}
1491
1492 static int
pp_init_upp_locked(struct kern_pbufpool * pp,boolean_t can_block)1493 pp_init_upp_locked(struct kern_pbufpool *pp, boolean_t can_block)
1494 {
1495 int i, err = 0;
1496
1497 if (pp->pp_u_hash_table != NULL) {
1498 goto done;
1499 }
1500
1501 /* allocated-address hash table */
1502 /*
1503 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1504 * if we see any performance hit, we can check if this caused it.
1505 */
1506 if (can_block) {
1507 pp->pp_u_hash_table = sk_alloc_type_array(
1508 struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1509 Z_WAITOK, skmem_tag_pbufpool_hash);
1510 pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1511 } else {
1512 pp->pp_u_hash_table = sk_alloc_type_array(
1513 struct kern_pbufpool_u_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1514 Z_NOWAIT, skmem_tag_pbufpool_hash);
1515 pp->pp_u_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1516 }
1517 if (pp->pp_u_hash_table == NULL) {
1518 SK_ERR("failed to zalloc packet buffer pool upp hash table");
1519 err = ENOMEM;
1520 goto done;
1521 }
1522
1523 for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1524 SLIST_INIT(&pp->pp_u_hash_table[i].upp_head);
1525 }
1526 done:
1527 return err;
1528 }
1529
/*
 * Tear down the user packet pool hash table, purging any metadata that
 * is still tracked in it.  Caller must hold the pool lock.
 */
static void
pp_destroy_upp_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		/* after a full purge, every bucket must be empty */
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bkt,
		    pp->pp_u_hash_table_size,
		    pp->pp_u_hash_table);
	}
	ASSERT(pp->pp_u_bufinuse == 0);
}
1550
1551 int
pp_init_upp(struct kern_pbufpool * pp,boolean_t can_block)1552 pp_init_upp(struct kern_pbufpool *pp, boolean_t can_block)
1553 {
1554 int err = 0;
1555
1556 PP_LOCK(pp);
1557 err = pp_init_upp_locked(pp, can_block);
1558 if (err) {
1559 SK_ERR("packet UPP init failed (%d)", err);
1560 goto done;
1561 }
1562 err = pp_init_upp_bft_locked(pp, can_block);
1563 if (err) {
1564 SK_ERR("buflet UPP init failed (%d)", err);
1565 pp_destroy_upp_locked(pp);
1566 goto done;
1567 }
1568 pp_retain_locked(pp);
1569 done:
1570 PP_UNLOCK(pp);
1571 return err;
1572 }
1573
1574 __attribute__((always_inline))
1575 static void
pp_insert_upp_bft_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1576 pp_insert_upp_bft_locked(struct kern_pbufpool *pp,
1577 struct __kern_buflet *kbft, pid_t pid)
1578 {
1579 struct kern_pbufpool_u_bft_bkt *bkt;
1580 struct __kern_buflet_ext *kbe = (struct __kern_buflet_ext *)kbft;
1581
1582 ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
1583 ASSERT(kbe->kbe_buf_pid == (pid_t)-1);
1584 kbe->kbe_buf_pid = pid;
1585 bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, kbft->buf_bft_idx_reg);
1586 SLIST_INSERT_HEAD(&bkt->upp_head, kbe, kbe_buf_upp_link);
1587 pp->pp_u_bftinuse++;
1588 }
1589
1590 __attribute__((always_inline))
1591 static void
pp_insert_upp_bft_chain_locked(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1592 pp_insert_upp_bft_chain_locked(struct kern_pbufpool *pp,
1593 struct __kern_buflet *kbft, pid_t pid)
1594 {
1595 while (kbft != NULL) {
1596 pp_insert_upp_bft_locked(pp, kbft, pid);
1597 kbft = __unsafe_forge_single(struct __kern_buflet *,
1598 __DECONST(kern_buflet_t, kbft->buf_nbft_addr));
1599 }
1600 }
1601
1602 /* Also inserts the attached chain of buflets */
1603 void static inline
pp_insert_upp_common(struct kern_pbufpool * pp,struct __kern_quantum * kqum,pid_t pid)1604 pp_insert_upp_common(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
1605 pid_t pid)
1606 {
1607 struct kern_pbufpool_u_bkt *bkt;
1608 struct __kern_buflet *kbft;
1609
1610 ASSERT(kqum->qum_pid == (pid_t)-1);
1611 kqum->qum_pid = pid;
1612
1613 bkt = KERN_PBUFPOOL_U_HASH(pp, METADATA_IDX(kqum));
1614 SLIST_INSERT_HEAD(&bkt->upp_head, kqum, qum_upp_link);
1615 pp->pp_u_bufinuse++;
1616
1617 kbft = __unsafe_forge_single(struct __kern_buflet *, (kern_buflet_t)kqum->qum_buf[0].buf_nbft_addr);
1618 if (kbft != NULL) {
1619 ASSERT(((kern_buflet_t)kbft)->buf_flag & BUFLET_FLAG_EXTERNAL);
1620 ASSERT(kqum->qum_qflags & QUM_F_INTERNALIZED);
1621 pp_insert_upp_bft_chain_locked(pp, kbft, pid);
1622 }
1623 }
1624
/*
 * Insert a quantum (and its buflet chain) into the user packet pool
 * hash tables.  Caller is expected to already hold the pool lock --
 * compare pp_insert_upp() below, which takes it.
 */
void
pp_insert_upp_locked(struct kern_pbufpool *pp, struct __kern_quantum *kqum,
    pid_t pid)
{
	pp_insert_upp_common(pp, kqum, pid);
}
1631
/*
 * Locking wrapper around pp_insert_upp_common(): insert a quantum (and
 * its buflet chain) under the pool lock.
 */
void
pp_insert_upp(struct kern_pbufpool *pp, struct __kern_quantum *kqum, pid_t pid)
{
	PP_LOCK(pp);
	pp_insert_upp_common(pp, kqum, pid);
	PP_UNLOCK(pp);
}
1639
1640 void
pp_insert_upp_batch(struct kern_pbufpool * pp,pid_t pid,uint64_t * __counted_by (num)array,uint32_t num)1641 pp_insert_upp_batch(struct kern_pbufpool *pp, pid_t pid,
1642 uint64_t *__counted_by(num)array, uint32_t num)
1643 {
1644 uint32_t i = 0;
1645
1646 ASSERT(array != NULL && num > 0);
1647 PP_LOCK(pp);
1648 while (i < num) {
1649 struct __kern_quantum *kqum = SK_PTR_ADDR_KQUM(array[i]);
1650
1651 ASSERT(kqum != NULL);
1652 pp_insert_upp_common(pp, kqum, pid);
1653 ++i;
1654 }
1655 PP_UNLOCK(pp);
1656 }
1657
/*
 * Look up a buflet by its region index in the user buflet hash table
 * and, if found, unhash it and clear its ownership.  Returns the buflet,
 * or NULL when no entry matches (SLIST_FOREACH_SAFE leaves the cursor
 * NULL once the bucket list is exhausted).
 */
__attribute__((always_inline))
static struct __kern_buflet *
pp_remove_upp_bft_locked(struct kern_pbufpool *pp, obj_idx_t bft_idx)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_BFT_HASH(pp, bft_idx);
	SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link, tbft) {
		if (((kern_buflet_t)kbft)->buf_bft_idx_reg == bft_idx) {
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* no longer owned by any process */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
			break;
		}
	}
	return (kern_buflet_t)kbft;
}
1679
1680 struct __kern_buflet *
pp_remove_upp_bft(struct kern_pbufpool * pp,obj_idx_t md_idx,int * err)1681 pp_remove_upp_bft(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
1682 {
1683 struct __kern_buflet *kbft = pp_remove_upp_bft_locked(pp, md_idx);
1684
1685 *err = __improbable(kbft != NULL) ? 0 : EINVAL;
1686 return kbft;
1687 }
1688
/*
 * Reclaim the chain of user-attached buflets hanging off a quantum being
 * returned from user space.  Each buflet index advertised by the user
 * metadata is looked up (and unhashed) via pp_remove_upp_bft_locked(),
 * and the kernel-side next-buflet links are rebuilt as we go.  The
 * resulting chain length is validated against the user packet's
 * advertised buflet count; pkt_bufs_cnt is rewritten with the count we
 * actually verified.  Returns 0 on success, ERANGE if the user-supplied
 * chain is malformed (bad count, unallocated index, or early/late
 * termination).
 */
__attribute__((always_inline))
static int
pp_remove_upp_bft_chain_locked(struct kern_pbufpool *pp,
    struct __kern_quantum *kqum)
{
	uint32_t max_frags = pp->pp_max_frags;
	struct __kern_buflet *kbft;
	uint16_t nbfts, upkt_nbfts;
	obj_idx_t bft_idx;

	ASSERT(!(kqum->qum_qflags & QUM_F_INTERNALIZED));
	/* first chained index comes from the (untrusted) user metadata */
	bft_idx = kqum->qum_user->qum_buf[0].buf_nbft_idx;
	kbft = &kqum->qum_buf[0];
	if (bft_idx == OBJ_IDX_NONE) {
		/* no chain attached; nothing to reclaim */
		return 0;
	}

	ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_PACKET);
	struct __kern_packet *kpkt = __DECONST(struct __kern_packet *, kqum);
	struct __user_packet *upkt = __DECONST(struct __user_packet *,
	    kpkt->pkt_qum.qum_user);

	/* user-advertised buflet count must not exceed the pool limit */
	upkt_nbfts = upkt->pkt_bufs_cnt;
	if (__improbable(upkt_nbfts > max_frags)) {
		SK_ERR("bad bcnt in upkt (%d > %d)", upkt_nbfts, max_frags);
		BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
		BUF_NBFT_ADDR(kbft, 0);
		return ERANGE;
	}

	/* the embedded buflet counts only if it carries a buffer */
	nbfts = (kbft->buf_addr != 0) ? 1 : 0;

	do {
		struct __kern_buflet *pbft = kbft;
		struct __kern_buflet_ext *kbe;

		kbft = pp_remove_upp_bft_locked(pp, bft_idx);
		if (__improbable(kbft == NULL)) {
			/* index not in the hash: terminate and fail */
			BUF_NBFT_IDX(pbft, OBJ_IDX_NONE);
			BUF_NBFT_ADDR(pbft, 0);
			SK_ERR("unallocated next buflet (%d), %p", bft_idx,
			    SK_KVA(pbft));
			return ERANGE;
		}
		ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
		/* re-link the kernel-side chain to the verified buflet */
		BUF_NBFT_IDX(pbft, bft_idx);
		BUF_NBFT_ADDR(pbft, kbft);
		kbe = __container_of(kbft, struct __kern_buflet_ext, kbe_overlay);
		/* follow the user buflet's next index */
		bft_idx = kbe->kbe_buf_user->buf_nbft_idx;
		++nbfts;
	} while ((bft_idx != OBJ_IDX_NONE) && (nbfts < upkt_nbfts));

	/* terminate the rebuilt kernel chain */
	ASSERT(kbft != NULL);
	BUF_NBFT_IDX(kbft, OBJ_IDX_NONE);
	BUF_NBFT_ADDR(kbft, 0);
	*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = nbfts;

	/* chain must end exactly when the advertised count is reached */
	if (__improbable((bft_idx != OBJ_IDX_NONE) || (nbfts != upkt_nbfts))) {
		SK_ERR("bad buflet in upkt (%d, %d)", nbfts, upkt_nbfts);
		return ERANGE;
	}
	return 0;
}
1752
/*
 * Remove the quantum with the given metadata index from the user packet
 * pool hash table, then reclaim its attached buflet chain.  Returns the
 * quantum (or NULL) and sets *err: 0 on success, ERANGE if the index is
 * not tracked, or the error from the buflet-chain reclaim.
 */
struct __kern_quantum *
pp_remove_upp_locked(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;

	bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
	SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
		if (METADATA_IDX(kqum) == md_idx) {
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* no longer owned by any process */
			kqum->qum_pid = (pid_t)-1;
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
			break;
		}
	}
	/* kqum is NULL here when the bucket list was exhausted */
	if (__probable(kqum != NULL)) {
		*err = pp_remove_upp_bft_chain_locked(pp, kqum);
	} else {
		*err = ERANGE;
	}
	return kqum;
}
1777
/*
 * Locking wrapper around pp_remove_upp_locked(): remove and return the
 * quantum with the given metadata index under the pool lock.
 */
struct __kern_quantum *
pp_remove_upp(struct kern_pbufpool *pp, obj_idx_t md_idx, int *err)
{
	struct __kern_quantum *kqum;

	PP_LOCK(pp);
	kqum = pp_remove_upp_locked(pp, md_idx, err);
	PP_UNLOCK(pp);
	return kqum;
}
1788
1789 struct __kern_quantum *
pp_find_upp(struct kern_pbufpool * pp,obj_idx_t md_idx)1790 pp_find_upp(struct kern_pbufpool *pp, obj_idx_t md_idx)
1791 {
1792 struct __kern_quantum *kqum, *tqum;
1793 struct kern_pbufpool_u_bkt *bkt;
1794
1795 PP_LOCK(pp);
1796 bkt = KERN_PBUFPOOL_U_HASH(pp, md_idx);
1797 SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
1798 if (METADATA_IDX(kqum) == md_idx) {
1799 break;
1800 }
1801 }
1802 PP_UNLOCK(pp);
1803
1804 return kqum;
1805 }
1806
/*
 * Free every user packet in the pool's in-use hash table owned by 'pid';
 * a pid of -1 purges packets belonging to all processes.  Caller must
 * hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_quantum *kqum, *tqum;
	struct kern_pbufpool_u_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	/*
	 * TODO: Build a list of packets and batch-free them.
	 */
	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_hash_table[i];
		/* SAFE variant: entries are unlinked mid-iteration */
		SLIST_FOREACH_SAFE(kqum, &bkt->upp_head, qum_upp_link, tqum) {
			/* entries in the table are always owned by some pid */
			ASSERT(kqum->qum_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kqum->qum_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kqum, __kern_quantum,
			    qum_upp_link);
			/* detach the buflet chain before freeing the packet */
			pp_remove_upp_bft_chain_locked(pp, kqum);
			/* mark unowned and strip user-visible state */
			kqum->qum_pid = (pid_t)-1;
			kqum->qum_qflags &= ~QUM_F_FINALIZED;
			kqum->qum_ksd = NULL;
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kqum->qum_pp), (uint64_t)kqum);
			ASSERT(pp->pp_u_bufinuse != 0);
			pp->pp_u_bufinuse--;
		}
	}
}
1840
/*
 * Free every external buflet in the pool's in-use buflet hash table
 * owned by 'pid'; a pid of -1 purges buflets belonging to all
 * processes.  Caller must hold the pool lock.
 */
__attribute__((always_inline))
static void
pp_purge_upp_bft_locked(struct kern_pbufpool *pp, pid_t pid)
{
	struct __kern_buflet_ext *kbft, *tbft;
	struct kern_pbufpool_u_bft_bkt *bkt;
	int i;

	PP_LOCK_ASSERT_HELD(pp);

	for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
		bkt = &pp->pp_u_bft_hash_table[i];
		/* SAFE variant: entries are unlinked mid-iteration */
		SLIST_FOREACH_SAFE(kbft, &bkt->upp_head, kbe_buf_upp_link,
		    tbft) {
			/* entries in the table are always owned by some pid */
			ASSERT(kbft->kbe_buf_pid != (pid_t)-1);
			if (pid != (pid_t)-1 && kbft->kbe_buf_pid != pid) {
				continue;
			}
			SLIST_REMOVE(&bkt->upp_head, kbft, __kern_buflet_ext,
			    kbe_buf_upp_link);
			/* clear ownership and link state before freeing */
			kbft->kbe_buf_pid = (pid_t)-1;
			kbft->kbe_buf_upp_link.sle_next = NULL;
			pp_free_buflet(pp, (kern_buflet_t)kbft);
			ASSERT(pp->pp_u_bftinuse != 0);
			pp->pp_u_bftinuse--;
		}
	}
}
1869
1870 void
pp_purge_upp(struct kern_pbufpool * pp,pid_t pid)1871 pp_purge_upp(struct kern_pbufpool *pp, pid_t pid)
1872 {
1873 PP_LOCK(pp);
1874 pp_purge_upp_locked(pp, pid);
1875 pp_purge_upp_bft_locked(pp, pid);
1876 PP_UNLOCK(pp);
1877 }
1878
1879 static int
pp_init_upp_bft_locked(struct kern_pbufpool * pp,boolean_t can_block)1880 pp_init_upp_bft_locked(struct kern_pbufpool *pp, boolean_t can_block)
1881 {
1882 int i, err = 0;
1883
1884 PP_LOCK_ASSERT_HELD(pp);
1885 if (pp->pp_u_bft_hash_table != NULL) {
1886 return 0;
1887 }
1888
1889 /* allocated-address hash table */
1890 /*
1891 * -fbounds-safety: We switched to sk_alloc (aka kalloc) from zalloc, so
1892 * if we see any performance hit, we can check if this caused it.
1893 */
1894 if (can_block) {
1895 pp->pp_u_bft_hash_table = sk_alloc_type_array(
1896 struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1897 Z_WAITOK, skmem_tag_pbufpool_bft_hash);
1898 pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1899 } else {
1900 pp->pp_u_bft_hash_table = sk_alloc_type_array(
1901 struct kern_pbufpool_u_bft_bkt, KERN_PBUFPOOL_U_HASH_SIZE,
1902 Z_NOWAIT, skmem_tag_pbufpool_bft_hash);
1903 pp->pp_u_bft_hash_table_size = KERN_PBUFPOOL_U_HASH_SIZE;
1904 }
1905 if (pp->pp_u_bft_hash_table == NULL) {
1906 SK_ERR("failed to zalloc packet buffer pool upp buflet hash table");
1907 err = ENOMEM;
1908 goto fail;
1909 }
1910
1911 for (i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
1912 SLIST_INIT(&pp->pp_u_bft_hash_table[i].upp_head);
1913 }
1914
1915 fail:
1916 return err;
1917 }
1918
/*
 * Tear down the pool's in-use buflet hash table: purge any remaining
 * buflets (regardless of owner), verify all buckets are empty on
 * DEBUG/DEVELOPMENT builds, and free the table.  Caller must hold the
 * pool lock.
 */
static void
pp_destroy_upp_bft_locked(struct kern_pbufpool *pp)
{
	PP_LOCK_ASSERT_HELD(pp);
	if (pp->pp_u_bft_hash_table != NULL) {
		/* purge anything that's left */
		pp_purge_upp_bft_locked(pp, -1);

#if (DEBUG || DEVELOPMENT)
		for (int i = 0; i < KERN_PBUFPOOL_U_HASH_SIZE; i++) {
			ASSERT(SLIST_EMPTY(&pp->pp_u_bft_hash_table[i].upp_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		kfree_type_counted_by(struct kern_pbufpool_u_bft_bkt,
		    pp->pp_u_bft_hash_table_size,
		    pp->pp_u_bft_hash_table);
	}
	/* the purge above must have dropped the in-use count to zero */
	ASSERT(pp->pp_u_bftinuse == 0);
}
1939
1940 void
pp_insert_upp_bft(struct kern_pbufpool * pp,struct __kern_buflet * kbft,pid_t pid)1941 pp_insert_upp_bft(struct kern_pbufpool *pp,
1942 struct __kern_buflet *kbft, pid_t pid)
1943 {
1944 PP_LOCK(pp);
1945 pp_insert_upp_bft_locked(pp, kbft, pid);
1946 PP_UNLOCK(pp);
1947 }
1948
1949 boolean_t
pp_isempty_upp(struct kern_pbufpool * pp)1950 pp_isempty_upp(struct kern_pbufpool *pp)
1951 {
1952 boolean_t isempty;
1953
1954 PP_LOCK(pp);
1955 isempty = (pp->pp_u_bufinuse == 0);
1956 PP_UNLOCK(pp);
1957
1958 return isempty;
1959 }
1960
/*
 * (Re)initialize a freshly allocated metadata object into a usable
 * kernel quantum/packet.  For buffer-on-demand pools with bufcnt != 0,
 * buflets are attached from *blist via pp_metadata_construct() first.
 * Returns the initialized kernel quantum, or NULL if buflet
 * construction failed.
 */
__attribute__((always_inline))
static inline struct __kern_quantum *
pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp,
    uint16_t bufcnt, uint32_t skmflag, struct skmem_obj **blist)
{
	struct __kern_quantum *kqum;
	struct __user_quantum *uqum;

	/* the kernel quantum lives immediately after the preamble */
	kqum = SK_PTR_ADDR_KQUM((uintptr_t)mdp + METADATA_PREAMBLE_SZ);
	ASSERT(kqum->qum_pp == pp);
	if (__probable(!PP_KERNEL_ONLY(pp))) {
		ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY));
		uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
		ASSERT(uqum != NULL);
	} else {
		/* kernel-only pools carry no user shadow object */
		ASSERT(kqum->qum_qflags & QUM_F_KERNEL_ONLY);
		ASSERT(kqum->qum_user == NULL);
		uqum = NULL;
	}

	/* attach bufcnt buflets from *blist, if the pool demands it */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 &&
	    pp_metadata_construct(kqum, uqum, METADATA_IDX(kqum), pp,
	    skmflag, bufcnt, FALSE, blist) != 0) {
		return NULL;
	}

	/* (re)construct {user,kernel} metadata */
	switch (pp->pp_md_type) {
	case NEXUS_META_TYPE_PACKET: {
		struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum);
		struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf;
		uint16_t i;

		/* sanitize flags */
		kpkt->pkt_pflags &= PKT_F_INIT_MASK;

		/* the companion option/flow/tx-compl objects must exist */
		ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) &&
		    kpkt->pkt_com_opt != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) &&
		    kpkt->pkt_flow != NULL);
		ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) &&
		    kpkt->pkt_tx_compl != NULL);

		/*
		 * XXX: For now we always set PKT_F_FLOW_DATA;
		 * this is a no-op but done for consistency
		 * with the other PKT_F_*_DATA flags.
		 */
		kpkt->pkt_pflags |= PKT_F_FLOW_DATA;

		/* initialize kernel packet */
		KPKT_INIT(kpkt, QUM_F_INTERNALIZED);

		ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp));
		if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
			/* embedded buflet is unused; chain starts at its next */
			ASSERT(kbuf->buf_ctl == NULL);
			ASSERT(kbuf->buf_addr == 0);
			/*
			 * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t
			 * which is unsafe, so we just forge it here.
			 */
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* initialize kernel buflet */
		for (i = 0; i < bufcnt; i++) {
			ASSERT(kbuf != NULL);
			KBUF_INIT(kbuf);
			kbuf = __unsafe_forge_single(struct __kern_buflet *,
			    __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr));
		}
		/* the chain must be exactly bufcnt buflets long */
		ASSERT((kbuf == NULL) || (bufcnt == 0));
		break;
	}
	default:
		ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM);
		/* kernel quantum */
		KQUM_INIT(kqum, QUM_F_INTERNALIZED);
		KBUF_INIT(&kqum->qum_buf[0]);
		break;
	}

	return kqum;
}
2045
2046 /*
2047 * When PPF_BUFFER_ON_DEMAND flag is set on packet pool creation, we create
2048 * packet descriptor cache with no buffer attached and a buflet cache with
2049 * cpu layer caching enabled. While operating in this mode, we can call
2050 * pp_alloc_packet_common() either with `bufcnt = 0` or `bufcnt = n`,
2051 * where n <= pp->pp_max_frags. If `bufcnt == 0` then we allocate packet
2052 * descriptor with no attached buffer from the metadata cache.
2053 * If `bufcnt != 0`, then this routine allocates packet descriptor and buflets
2054 * from their respective caches and constructs the packet on behalf of the
2055 * caller.
2056 */
2057 __attribute__((always_inline))
2058 static inline uint32_t
pp_alloc_packet_common(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (num)array,uint32_t num,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2059 pp_alloc_packet_common(struct kern_pbufpool *pp, uint16_t bufcnt,
2060 uint64_t *__counted_by(num)array, uint32_t num, boolean_t tagged,
2061 alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2062 {
2063 struct __metadata_preamble *mdp;
2064 struct __kern_quantum *kqum = NULL;
2065 uint32_t allocp, need = num;
2066 struct skmem_obj *__single plist, *__single blist = NULL;
2067 uint64_t *array_cp; /* -fbounds-safety */
2068
2069 ASSERT(bufcnt <= pp->pp_max_frags);
2070 ASSERT(array != NULL && num > 0);
2071 ASSERT(PP_BATCH_CAPABLE(pp));
2072
2073 /* allocate (constructed) packet(s) with buffer(s) attached */
2074 allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
2075 pp->pp_kmd_cache->skm_objsize, num, skmflag);
2076
2077 /* allocate (constructed) buflet(s) with buffer(s) attached */
2078 if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
2079 (void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
2080 PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
2081 }
2082
2083 array_cp = array;
2084 while (plist != NULL) {
2085 struct skmem_obj *plistn;
2086
2087 plistn = plist->mo_next;
2088 plist->mo_next = NULL;
2089
2090 mdp = (struct __metadata_preamble *)(void *)plist;
2091 kqum = pp_metadata_init(mdp, pp, bufcnt, skmflag, &blist);
2092 if (kqum == NULL) {
2093 if (blist != NULL) {
2094 skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
2095 blist);
2096 blist = NULL;
2097 }
2098 plist->mo_next = plistn;
2099 skmem_cache_batch_free(pp->pp_kmd_cache, plist);
2100 plist = NULL;
2101 break;
2102 }
2103
2104
2105 if (tagged) {
2106 *array_cp = SK_PTR_ENCODE(kqum, METADATA_TYPE(kqum),
2107 METADATA_SUBTYPE(kqum));
2108 } else {
2109 *array_cp = (uint64_t)kqum;
2110 }
2111
2112 if (cb != NULL) {
2113 (cb)(*array_cp, (num - need), ctx);
2114 }
2115
2116 ++array_cp;
2117 plist = plistn;
2118
2119 ASSERT(need > 0);
2120 --need;
2121 }
2122 ASSERT(blist == NULL);
2123 ASSERT((num - need) == allocp || kqum == NULL);
2124
2125 return num - need;
2126 }
2127
2128 uint64_t
pp_alloc_packet(struct kern_pbufpool * pp,uint16_t bufcnt,uint32_t skmflag)2129 pp_alloc_packet(struct kern_pbufpool *pp, uint16_t bufcnt, uint32_t skmflag)
2130 {
2131 uint64_t kpkt = 0;
2132
2133 (void) pp_alloc_packet_common(pp, bufcnt, &kpkt, 1, FALSE,
2134 NULL, NULL, skmflag);
2135
2136 return kpkt;
2137 }
2138
2139 int
pp_alloc_packet_batch(struct kern_pbufpool * pp,uint16_t bufcnt,uint64_t * __counted_by (* size)array,uint32_t * size,boolean_t tagged,alloc_cb_func_t cb,const void * ctx,uint32_t skmflag)2140 pp_alloc_packet_batch(struct kern_pbufpool *pp, uint16_t bufcnt,
2141 uint64_t *__counted_by(*size)array, uint32_t *size, boolean_t tagged,
2142 alloc_cb_func_t cb, const void *ctx, uint32_t skmflag)
2143 {
2144 uint32_t i, n;
2145 int err;
2146
2147 ASSERT(array != NULL && size > 0);
2148
2149 n = *size;
2150 /*
2151 * -fbounds-safety: Originally there was this line here: *size = 0; but
2152 * we removed this because array is now __counted_by(*size), so *size =
2153 * 0 leads to brk 0x5519. Also, *size is set to i anyway.
2154 */
2155
2156 i = pp_alloc_packet_common(pp, bufcnt, array, n, tagged,
2157 cb, ctx, skmflag);
2158 /*
2159 * -fbounds-safety: Since array is __counted_by(*size), we need to be
2160 * extra careful when *size is updated, like below. Here, we know i will
2161 * be less than or equal to the original *size value, so updating *size
2162 * is okay.
2163 */
2164 *size = i;
2165
2166 if (__probable(i == n)) {
2167 err = 0;
2168 } else if (i != 0) {
2169 err = EAGAIN;
2170 } else {
2171 err = ENOMEM;
2172 }
2173
2174 return err;
2175 }
2176
/*
 * Batch-allocate 'num' packets, each with 'bufcnt' buffers, and enqueue
 * them onto 'pktq'.  Returns 0 on full success, EAGAIN on a partial
 * allocation (packets already enqueued remain on the queue), ENOMEM
 * when nothing could be allocated.  'cb' (if non-NULL) is invoked once
 * per packet with its index.
 */
int
pp_alloc_pktq(struct kern_pbufpool *pp, uint16_t bufcnt,
    struct pktq *pktq, uint32_t num, alloc_cb_func_t cb, const void *ctx,
    uint32_t skmflag)
{
	struct __metadata_preamble *mdp;
	struct __kern_packet *kpkt = NULL;
	uint32_t allocp, need = num;
	struct skmem_obj *__single plist, *__single blist = NULL;
	int err;

	ASSERT(pktq != NULL && num > 0);
	ASSERT(pp->pp_md_type == NEXUS_META_TYPE_PACKET);
	ASSERT(bufcnt <= pp->pp_max_frags);
	ASSERT(PP_BATCH_CAPABLE(pp));

	/* allocate (constructed) packet(s) with buffer(s) attached */
	allocp = skmem_cache_batch_alloc(pp->pp_kmd_cache, &plist,
	    pp->pp_kmd_cache->skm_objsize, num, skmflag);

	/* allocate (constructed) buflet(s) with buffer(s) attached */
	if (PP_HAS_BUFFER_ON_DEMAND(pp) && bufcnt != 0 && allocp != 0) {
		(void) skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &blist,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, (allocp * bufcnt), skmflag);
	}

	/* initialize each metadata object and enqueue it */
	while (plist != NULL) {
		struct skmem_obj *plistn;

		plistn = plist->mo_next;
		plist->mo_next = NULL;

		mdp = (struct __metadata_preamble *)(void *)plist;
		kpkt = (struct __kern_packet *)pp_metadata_init(mdp, pp,
		    bufcnt, skmflag, &blist);
		if (kpkt == NULL) {
			/* ran out of buflets: return all unused objects */
			if (blist != NULL) {
				skmem_cache_batch_free(PP_KBFT_CACHE_DEF(pp),
				    blist);
				blist = NULL;
			}
			/* re-link the current element before bulk-freeing */
			plist->mo_next = plistn;
			skmem_cache_batch_free(pp->pp_kmd_cache, plist);
			plist = NULL;
			break;
		}


		KPKTQ_ENQUEUE(pktq, kpkt);

		if (cb != NULL) {
			(cb)((uint64_t)kpkt, (num - need), ctx);
		}

		plist = plistn;

		ASSERT(need > 0);
		--need;
	}
	/* all buflets must have been consumed or returned by now */
	ASSERT(blist == NULL);
	ASSERT((num - need) == allocp || kpkt == NULL);

	if (__probable(need == 0)) {
		err = 0;
	} else if (need == num) {
		err = ENOMEM;
	} else {
		err = EAGAIN;
	}

	return err;
}
2249
2250 uint64_t
pp_alloc_packet_by_size(struct kern_pbufpool * pp,uint32_t size,uint32_t skmflag)2251 pp_alloc_packet_by_size(struct kern_pbufpool *pp, uint32_t size,
2252 uint32_t skmflag)
2253 {
2254 uint32_t bufcnt = pp->pp_max_frags;
2255 uint64_t kpkt = 0;
2256
2257 if (PP_HAS_BUFFER_ON_DEMAND(pp)) {
2258 bufcnt =
2259 SK_ROUNDUP(size, PP_BUF_SIZE_DEF(pp)) / PP_BUF_SIZE_DEF(pp);
2260 ASSERT(bufcnt <= UINT16_MAX);
2261 }
2262
2263 (void) pp_alloc_packet_common(pp, (uint16_t)bufcnt, &kpkt, 1, TRUE,
2264 NULL, NULL, skmflag);
2265
2266 return kpkt;
2267 }
2268
2269 __attribute__((always_inline))
2270 static inline struct __metadata_preamble *
pp_metadata_fini(struct __kern_quantum * kqum,struct kern_pbufpool * pp,struct mbuf ** mp,struct __kern_packet ** kpp,struct skmem_obj ** blist_def,struct skmem_obj ** blist_nocache_def,struct skmem_obj ** blist_large,struct skmem_obj ** blist_nocahce_large)2271 pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp,
2272 struct mbuf **mp, struct __kern_packet **kpp, struct skmem_obj **blist_def,
2273 struct skmem_obj **blist_nocache_def, struct skmem_obj **blist_large,
2274 struct skmem_obj **blist_nocahce_large)
2275 {
2276 struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum);
2277
2278 ASSERT(SK_PTR_TAG(kqum) == 0);
2279
2280 switch (pp->pp_md_type) {
2281 case NEXUS_META_TYPE_PACKET: {
2282 struct __kern_packet *kpkt = SK_PTR_KPKT(kqum);
2283
2284 if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) {
2285 __packet_perform_tx_completion_callbacks(
2286 SK_PKT2PH(kpkt), NULL);
2287 }
2288 if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
2289 ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0);
2290 ASSERT(kpkt->pkt_mbuf != NULL);
2291 ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL);
2292 if (mp != NULL) {
2293 ASSERT(*mp == NULL);
2294 *mp = kpkt->pkt_mbuf;
2295 } else {
2296 m_freem(kpkt->pkt_mbuf);
2297 }
2298 KPKT_CLEAR_MBUF_DATA(kpkt);
2299 } else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) {
2300 ASSERT(kpkt->pkt_pkt != NULL);
2301 ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL);
2302 if (kpp != NULL) {
2303 ASSERT(*kpp == NULL);
2304 *kpp = kpkt->pkt_pkt;
2305 } else {
2306 /* can only recurse once */
2307 ASSERT((kpkt->pkt_pkt->pkt_pflags &
2308 PKT_F_PKT_DATA) == 0);
2309 pp_free_packet_single(kpkt->pkt_pkt);
2310 }
2311 KPKT_CLEAR_PKT_DATA(kpkt);
2312 }
2313 kpkt->pkt_pflags &= ~PKT_F_TRUNCATED;
2314 ASSERT(kpkt->pkt_nextpkt == NULL);
2315 ASSERT(kpkt->pkt_qum.qum_ksd == NULL);
2316 ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0);
2317 ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0);
2318 break;
2319 }
2320 default:
2321 break;
2322 }
2323
2324 if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) {
2325 pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def,
2326 blist_large, blist_nocahce_large);
2327 }
2328 return mdp;
2329 }
2330
/*
 * Free a chain of packets (linked via pkt_nextpkt) back to their pool
 * in one batch.  Attached mbufs and attached packets collected by
 * pp_metadata_fini() are accumulated into their own chains and freed
 * (or recursed into) at the end.  If 'npkt' is non-NULL it receives
 * the number of packets freed.  All packets must belong to the same
 * pool as the chain head.
 */
void
pp_free_packet_chain(struct __kern_packet *pkt_chain, int *npkt)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **__single kpp = &kptop, *pkt, *next;
	struct kern_pbufpool *pp;
	int c = 0;

	/* all packets in the chain share the head's pool */
	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	ASSERT(pp != NULL);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (pkt = pkt_chain; pkt != NULL; pkt = next) {
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		ASSERT(SK_PTR_ADDR_KQUM(pkt)->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(pkt), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* thread the metadata preamble onto the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		c++;

		/* advance the mbuf tail pointer if fini() detached an mbuf */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		/* likewise for a detached packet */
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		/* use the list-free path when more than one mbuf piled up */
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* fini() guarantees these cannot recurse further */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
	if (npkt != NULL) {
		*npkt = c;
	}
}
2394
2395 void
pp_free_pktq(struct pktq * pktq)2396 pp_free_pktq(struct pktq *pktq)
2397 {
2398 if (__improbable(KPKTQ_EMPTY(pktq))) {
2399 return;
2400 }
2401 struct __kern_packet *pkt = KPKTQ_FIRST(pktq);
2402 pp_free_packet_chain(pkt, NULL);
2403 KPKTQ_DISPOSE(pktq);
2404 }
2405
2406 void
pp_drop_pktq(struct pktq * pktq,struct ifnet * ifp,uint16_t flags,drop_reason_t reason,const char * funcname,uint16_t linenum)2407 pp_drop_pktq(struct pktq *pktq, struct ifnet *ifp, uint16_t flags,
2408 drop_reason_t reason, const char *funcname, uint16_t linenum)
2409 {
2410 drop_func_t dropfunc;
2411 struct __kern_packet *kpkt;
2412
2413 if (KPKTQ_EMPTY(pktq)) {
2414 return;
2415 }
2416 if (__probable(droptap_total_tap_count == 0)) {
2417 goto nodroptap;
2418 }
2419
2420 if (flags & DROPTAP_FLAG_DIR_OUT) {
2421 dropfunc = droptap_output_packet;
2422 } else if (flags & DROPTAP_FLAG_DIR_IN) {
2423 dropfunc = droptap_input_packet;
2424 } else {
2425 goto nodroptap;
2426 }
2427
2428 KPKTQ_FOREACH(kpkt, pktq) {
2429 dropfunc(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp,
2430 kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);
2431 }
2432
2433 nodroptap:
2434 pp_free_pktq(pktq);
2435 }
2436
/*
 * Batch-free 'num' packets from 'array' back to pool 'pp', zeroing each
 * array slot as it goes.  Attached mbufs and attached packets collected
 * by pp_metadata_fini() are chained up and freed at the end.  All
 * packets must belong to 'pp'.
 */
__attribute__((always_inline))
static inline void
pp_free_packet_array(struct kern_pbufpool *pp, uint64_t *__counted_by(num)array, uint32_t num)
{
	struct __metadata_preamble *mdp;
	struct skmem_obj *__single obj_mdp = NULL;
	struct skmem_obj *__single top = NULL;
	struct skmem_obj *__single blist_def = NULL, *__single blist_nocache_def = NULL;
	struct skmem_obj *__single blist_large = NULL, *__single blist_nocache_large = NULL;
	struct skmem_obj **list = &top;
	struct mbuf *__single mtop = NULL;
	struct mbuf **mp = &mtop;
	struct __kern_packet *__single kptop = NULL;
	struct __kern_packet **kpp = &kptop;
	uint32_t i;

	ASSERT(pp != NULL);
	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));

	for (i = 0; i < num; i++) {
		ASSERT(SK_PTR_ADDR_KQUM(array[i])->qum_pp == pp);
		mdp = pp_metadata_fini(SK_PTR_ADDR_KQUM(array[i]), pp,
		    mp, kpp, &blist_def, &blist_nocache_def, &blist_large, &blist_nocache_large);

		/* thread the metadata preamble onto the batch-free list */
		obj_mdp = __unsafe_forge_single(struct skmem_obj *, mdp);
		*list = obj_mdp;
		list = &(*list)->mo_next;
		/* slot now refers to freed memory; clear it */
		array[i] = 0;

		/* advance the mbuf tail pointer if fini() detached an mbuf */
		if (*mp != NULL) {
			mp = &(*mp)->m_nextpkt;
			ASSERT(*mp == NULL);
		}
		/* likewise for a detached packet */
		if (*kpp != NULL) {
			kpp = &(*kpp)->pkt_nextpkt;
			ASSERT(*kpp == NULL);
		}
	}

	ASSERT(top != NULL);
	skmem_cache_batch_free(pp->pp_kmd_cache, top);
	pp_free_kbft_list(pp, blist_def, blist_nocache_def, blist_large, blist_nocache_large);
	if (mtop != NULL) {
		DTRACE_SKYWALK(free__attached__mbuf);
		/* use the list-free path when more than one mbuf piled up */
		if (__probable(mtop->m_nextpkt != NULL)) {
			m_freem_list(mtop);
		} else {
			m_freem(mtop);
		}
	}
	if (kptop != NULL) {
		/* fini() guarantees these cannot recurse further */
		int cnt = 0;
		pp_free_packet_chain(kptop, &cnt);
		DTRACE_SKYWALK1(free__attached__pkt, int, cnt);
	}
}
2494
2495 void
pp_free_packet(struct kern_pbufpool * pp,uint64_t kqum)2496 pp_free_packet(struct kern_pbufpool *pp, uint64_t kqum)
2497 {
2498 pp_free_packet_array(pp, &kqum, 1);
2499 }
2500
2501 void
pp_free_packet_batch(const kern_pbufpool_t pp,uint64_t * __counted_by (size)array,uint32_t size)2502 pp_free_packet_batch(const kern_pbufpool_t pp, uint64_t *__counted_by(size)array, uint32_t size)
2503 {
2504 pp_free_packet_array(pp, array, size);
2505 }
2506
2507 void
pp_free_packet_single(struct __kern_packet * pkt)2508 pp_free_packet_single(struct __kern_packet *pkt)
2509 {
2510 ASSERT(pkt->pkt_nextpkt == NULL);
2511 pp_free_packet(__DECONST(struct kern_pbufpool *,
2512 pkt->pkt_qum.qum_pp), SK_PTR_ADDR(pkt));
2513 }
2514
/*
 * Report a single packet to the drop tap (when any tap is active and a
 * direction flag is set), then free it.
 */
void
pp_drop_packet_single(struct __kern_packet *pkt, struct ifnet *ifp, uint16_t flags,
    drop_reason_t reason, const char *funcname, uint16_t linenum)
{
	drop_func_t dropfunc;

	/*
	 * NOTE(review): a zero-length packet returns here WITHOUT being
	 * freed, unlike the empty-queue early return in pp_drop_pktq()
	 * where there is nothing to free — confirm this is intentional
	 * and the caller retains ownership in that case.
	 */
	if (pkt->pkt_length == 0) {
		return;
	}
	/* taps are rarely attached; skip the reporting path entirely */
	if (__probable(droptap_total_tap_count == 0)) {
		goto nodroptap;
	}

	if (flags & DROPTAP_FLAG_DIR_OUT) {
		dropfunc = droptap_output_packet;
	} else if (flags & DROPTAP_FLAG_DIR_IN) {
		dropfunc = droptap_input_packet;
	} else {
		/* no direction flag: nothing to report */
		goto nodroptap;
	}

	dropfunc(SK_PKT2PH(pkt), reason, funcname, linenum, flags, ifp,
	    pkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);

nodroptap:
	pp_free_packet_single(pkt);
}
2542
/*
 * Allocate one raw buffer from the pool's default or large buffer
 * cache and fill in its object info via 'oi'.  Returns the buffer
 * address, or 0 on failure.  On DEVELOPMENT/DEBUG builds, non-blocking
 * allocations are subject to MTBF fault injection.
 */
static mach_vm_address_t
pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi,
    uint32_t skmflag, bool large)
{
	/*
	 * XXX -fbounds-safety: We can't change this mach_vm_address_t to some
	 * other (safe) pointer type, because IOSkywalkFamily depends on this
	 * being mach_vm_address_t
	 */
	mach_vm_address_t baddr;
	struct skmem_cache *skm = large ? PP_BUF_CACHE_LARGE(pp):
	    PP_BUF_CACHE_DEF(pp);

	ASSERT(skm != NULL);
	/* allocate a cached buffer */
	baddr = (mach_vm_address_t)skmem_cache_alloc(skm, skmflag);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF is applicable only for non-blocking allocations here.
	 */
	if (__improbable(mtbf != 0 && (net_uptime_ms() % mtbf) == 0 &&
	    (skmflag & SKMEM_NOSLEEP))) {
		SK_ERR("pp \"%s\" MTBF failure", pp->pp_name);
		net_update_uptime();
		/* simulate an allocation failure: give the buffer back */
		if (baddr != 0) {
			skmem_cache_free(skm,
			    __unsafe_forge_single(struct skmem_obj *, baddr));
			baddr = 0;
		}
	}
#endif /* (DEVELOPMENT || DEBUG) */

	if (__improbable(baddr == 0)) {
		SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx",
		    SK_KVA(pp));
		return 0;
	}
	/* populate 'oi' so the caller can find segment/index info */
	skmem_cache_get_obj_info(skm,
	    __unsafe_forge_single(struct skmem_obj *, baddr), oi, NULL);
	ASSERT(SKMEM_OBJ_BUFCTL(oi) != NULL);
	ASSERT((mach_vm_address_t)SKMEM_OBJ_ADDR(oi) == baddr);
	return baddr;
}
2588
2589 errno_t
pp_alloc_buffer(const kern_pbufpool_t pp,mach_vm_address_t * baddr,kern_segment_t * seg,kern_obj_idx_seg_t * idx,uint32_t skmflag)2590 pp_alloc_buffer(const kern_pbufpool_t pp, mach_vm_address_t *baddr,
2591 kern_segment_t *seg, kern_obj_idx_seg_t *idx, uint32_t skmflag)
2592 {
2593 struct skmem_obj_info oib;
2594
2595 VERIFY(pp != NULL && baddr != NULL);
2596 VERIFY((seg != NULL) == (idx != NULL));
2597
2598 if (__improbable(!PP_HAS_BUFFER_ON_DEMAND(pp))) {
2599 return ENOTSUP;
2600 }
2601
2602 *baddr = pp_alloc_buffer_common(pp, &oib, skmflag, false);
2603 if (__improbable(*baddr == 0)) {
2604 return ENOMEM;
2605 }
2606
2607 if (seg != NULL) {
2608 ASSERT(SKMEM_OBJ_SEG(&oib) != NULL);
2609 *seg = SKMEM_OBJ_SEG(&oib);
2610 *idx = SKMEM_OBJ_IDX_SEG(&oib);
2611 }
2612 return 0;
2613 }
2614
2615 void
pp_free_buffer(const kern_pbufpool_t pp,mach_vm_address_t addr)2616 pp_free_buffer(const kern_pbufpool_t pp, mach_vm_address_t addr)
2617 {
2618 ASSERT(pp != NULL && addr != 0);
2619 skmem_cache_free(PP_BUF_CACHE_DEF(pp), __unsafe_forge_single(
2620 struct skmem_obj *, addr));
2621 }
2622
/*
 * Batch-allocate up to 'num' external buflets (default- or large-sized
 * per 'large') into 'array'.  Returns the number actually allocated,
 * which may be fewer than 'num' on cache exhaustion.
 */
__attribute__((always_inline))
static inline uint32_t
pp_alloc_buflet_common(struct kern_pbufpool *pp,
    uint64_t *__counted_by(num)array, uint32_t num, uint32_t skmflag,
    bool large)
{
	struct __kern_buflet *kbft = NULL;
	uint32_t allocd, need = num;
	struct skmem_obj *__single list;
	uint64_t *array_cp; /* -fbounds-safety */

	ASSERT(array != NULL && num > 0);
	ASSERT(PP_BATCH_CAPABLE(pp));
	ASSERT(PP_KBFT_CACHE_DEF(pp) != NULL);
	/* large buflets require the pool to define a large buffer size */
	ASSERT(PP_BUF_SIZE_LARGE(pp) != 0 || !large);

	if (large) {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_LARGE(pp), &list,
		    PP_KBFT_CACHE_LARGE(pp)->skm_objsize, num, skmflag);
	} else {
		allocd = skmem_cache_batch_alloc(PP_KBFT_CACHE_DEF(pp), &list,
		    PP_KBFT_CACHE_DEF(pp)->skm_objsize, num, skmflag);
	}

	array_cp = array;
	/* unlink each object from the freelist and initialize it */
	while (list != NULL) {
		struct skmem_obj *listn;

		listn = list->mo_next;
		list->mo_next = NULL;
		kbft = (kern_buflet_t)(void *)list;


		KBUF_EXT_INIT(kbft, pp);
		*array_cp = (uint64_t)kbft;
		++array_cp;
		list = listn;
		ASSERT(need > 0);
		--need;
	}
	ASSERT((num - need) == allocd || kbft == NULL);
	return num - need;
}
2666
2667 errno_t
pp_alloc_buflet(struct kern_pbufpool * pp,kern_buflet_t * kbft,uint32_t skmflag,bool large)2668 pp_alloc_buflet(struct kern_pbufpool *pp, kern_buflet_t *kbft, uint32_t skmflag,
2669 bool large)
2670 {
2671 uint64_t bft;
2672
2673 if (__improbable(!pp_alloc_buflet_common(pp, &bft, 1, skmflag, large))) {
2674 return ENOMEM;
2675 }
2676 *kbft = __unsafe_forge_single(kern_buflet_t, bft);
2677 return 0;
2678 }
2679
2680 errno_t
pp_alloc_buflet_batch(struct kern_pbufpool * pp,uint64_t * __counted_by (* size)array,uint32_t * size,uint32_t skmflag,bool large)2681 pp_alloc_buflet_batch(struct kern_pbufpool *pp,
2682 uint64_t *__counted_by(*size)array, uint32_t *size, uint32_t skmflag,
2683 bool large)
2684 {
2685 uint32_t i, n;
2686 int err;
2687
2688 ASSERT(array != NULL && size > 0);
2689
2690 n = *size;
2691 i = pp_alloc_buflet_common(pp, array, n, skmflag, large);
2692 *size = i;
2693
2694 if (__probable(i == n)) {
2695 err = 0;
2696 } else if (i != 0) {
2697 err = EAGAIN;
2698 } else {
2699 err = ENOMEM;
2700 }
2701
2702 return err;
2703 }
2704
/*
 * Free a buflet back to its cache.  External buflets go back to the
 * default/large buflet cache — via the nocache path when the buffer is
 * still shared (bc_usecnt > 1).  Embedded buflets run the buffer
 * destructor and release the underlying buffer only when its use count
 * drops to zero.  The buflet must already be unlinked (no next buflet).
 */
__attribute__((always_inline))
static void
pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft)
{
	ASSERT(kbft->buf_nbft_idx == OBJ_IDX_NONE);
	ASSERT(kbft->buf_nbft_addr == 0);

	if (kbft->buf_flag & BUFLET_FLAG_EXTERNAL) {
		ASSERT(kbft->buf_addr != 0);
		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_bft_idx_reg != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* must not still be on the in-use hash table */
		ASSERT(((struct __kern_buflet_ext *)kbft)->
		    kbe_buf_upp_link.sle_next == NULL);
		if (kbft->buf_ctl->bc_usecnt > 1) {
			/* buffer still shared: bypass the cpu-layer cache */
			skmem_cache_free_nocache(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		} else {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_KBFT_CACHE_LARGE(pp) : PP_KBFT_CACHE_DEF(pp),
			    (void *)kbft);
		}
	} else if (__probable(kbft->buf_addr != 0)) {
		void *objaddr = kbft->buf_objaddr;
		uint32_t usecnt = 0;

		ASSERT(kbft->buf_idx != OBJ_IDX_NONE);
		ASSERT(kbft->buf_ctl != NULL);
		/* run the destructor; it reports the remaining use count */
		KBUF_DTOR(kbft, usecnt);
		SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u",
		    SK_KVA(pp), SK_KVA(objaddr), usecnt);
		/* release the buffer only once no one else references it */
		if (__probable(usecnt == 0)) {
			skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ?
			    PP_BUF_CACHE_LARGE(pp) : PP_BUF_CACHE_DEF(pp),
			    objaddr);
		}
	}
}
2744
2745 void
pp_free_buflet(const kern_pbufpool_t pp,kern_buflet_t kbft)2746 pp_free_buflet(const kern_pbufpool_t pp, kern_buflet_t kbft)
2747 {
2748 ASSERT(kbft->buf_flag & BUFLET_FLAG_EXTERNAL);
2749 ASSERT(pp != NULL && kbft != NULL);
2750 pp_free_buflet_common(pp, kbft);
2751 }
2752
2753 void
pp_reap_caches(boolean_t purge)2754 pp_reap_caches(boolean_t purge)
2755 {
2756 skmem_cache_reap_now(pp_opt_cache, purge);
2757 skmem_cache_reap_now(pp_flow_cache, purge);
2758 skmem_cache_reap_now(pp_compl_cache, purge);
2759 }
2760