/* xref: /xnu-8796.121.2/bsd/skywalk/packet/os_packet_private.h (revision c54f35ca767986246321eb901baf8f5ff7923f6a) */
1 /*
2  * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #ifndef _SKYWALK_OS_PACKET_PRIVATE_H_
30 #define _SKYWALK_OS_PACKET_PRIVATE_H_
31 
32 #if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE)
33 #include <skywalk/os_packet.h>
34 #include <skywalk/os_nexus_private.h>
35 #include <skywalk/os_channel_private.h>
36 #include <libkern/OSByteOrder.h>
37 #include <netinet/in.h>
38 #include <net/ethernet.h>
39 
40 #if defined(BSD_KERNEL_PRIVATE)
41 /*
42  * Flow (currently for kernel, potentially for userland one day).
43  *
 * XXX: When we expose this to userland, we need to make sure to NOT
45  * expose kernel pointer/address values embedded within.
46  *
47  * Values in flow_{l2,l3,l4} are stored in network byte order.  Pointers
48  * are defined using mach_vm_address_t because it's stable across user
49  * and kernel, and therefore keeps the structure size the same.
50  *
51  * Because this structure might be initialized on a per-packet allocation
 * basis, it as well as some of its member sub-structures are allocated
 * on a 16-byte address boundary to allow 128-bit operations on platforms
54  * that support them.
55  *
56  * XXX: when adding new fields, try to leverage __pad ones first.
57  *
58  * TODO: we should consider embedding a flow_key structure here and
59  * use that to store the tuples.  That way we can leverage that for
60  * flow lookups without having to copy things back-and-forth.
61  */
struct __flow {
	union {
		/*
		 * The following is always zeroed out on each alloc.
		 */
		struct __flow_init {
			/*
			 * Layer 3.  Addresses are stored in network byte
			 * order (see the block comment above this struct).
			 */
			struct __flow_l3 {
				union {
					/* IPv4 source/destination pair */
					struct __flow_l3_ipv4_addrs {
						struct in_addr _src;
						struct in_addr _dst;
					} _l3_ipv4;
					/* IPv6 source/destination pair */
					struct __flow_l3_ipv6_addrs {
						struct in6_addr _src;
						struct in6_addr _dst;
					} _l3_ipv6;
				};
				uint8_t  _l3_ip_ver;    /* IP version of flow */
				uint8_t  _l3_proto;     /* IP protocol */
				uint8_t  _l3_hlen;      /* IP header length */
				/* fragmentation state */
				unsigned _l3_is_frag : 1;
				unsigned _l3_is_first_frag : 1;
				unsigned _l3_reserved_flags : 6;
				uint32_t _l3_frag_id;   /* IP fragment identifier */
				/* pointer to IP header (kernel address; see XXX above) */
				mach_vm_address_t _l3_ptr;
			} __l3;
			/*
			 * AQM
			 */
			struct __flow_classq {
				uint32_t _fcq_hash;  /* classq-specific hash */
				uint32_t _fcq_flags; /* classq-specific flags */
			} __classq;
			/*
			 * Misc.
			 */
			uint32_t __ulen;      /* user data length */
			uint8_t  __ulp_encap; /* e.g. IPPROTO_QUIC */
			uint8_t  __pad[3];
			uint64_t __pad64[2];
			/*
			 * Flow Source.
			 */
			struct __flow_source {
				union {
					/* source identifier (UUID or raw views) */
					uint64_t _fsrc_id_64[2];
					uint32_t _fsrc_id_32[4];
					uuid_t   _fsrc_id;
				} __attribute__((aligned(sizeof(uint64_t))));
				flowadv_idx_t _fsrc_fidx; /* flow adv. index */
				uint8_t       _fsrc_type; /* FLOWSRC_* mbuf.h */
				uint8_t       _fsrc_pad[3];
			} __source;
			/*
			 * Policy.
			 */
			struct __flow_policy {
				uint32_t _fpc_id; /* policy id of pkt sender */
				uint32_t _fpc_pad;
				union {
					/* process identifier (UUID or raw views) */
					uint64_t _fpc_euuid_64[2];
					uint32_t _fpc_euuid_32[4];
					uuid_t   _fpc_euuid;
				} __attribute__((aligned(sizeof(uint64_t))));
			} __policy;
		} flow_init;
		/*
		 * Raw 64-bit view of flow_init, used for bulk access;
		 * must be large enough to cover struct __flow_init.
		 */
		uint64_t flow_init_data[16];
	} __attribute((aligned(16)));
/* convenience accessors into flow_init */
#define flow_l3                 flow_init.__l3
#define flow_classq             flow_init.__classq
#define flow_ulen               flow_init.__ulen
#define flow_ulp_encap          flow_init.__ulp_encap
#define flow_source             flow_init.__source
#define flow_policy             flow_init.__policy

#define flow_ipv4_addrs         flow_l3._l3_ipv4
#define flow_ipv4_src           flow_l3._l3_ipv4._src
#define flow_ipv4_dst           flow_l3._l3_ipv4._dst
#define flow_ipv6_addrs         flow_l3._l3_ipv6
#define flow_ipv6_src           flow_l3._l3_ipv6._src
#define flow_ipv6_dst           flow_l3._l3_ipv6._dst
#define flow_ip_ver             flow_l3._l3_ip_ver
#define flow_ip_proto           flow_l3._l3_proto
#define flow_ip_hlen            flow_l3._l3_hlen
#define flow_ip_hdr             flow_l3._l3_ptr
#define flow_ip_frag_id         flow_l3._l3_frag_id
#define flow_ip_is_frag         flow_l3._l3_is_frag
#define flow_ip_is_first_frag   flow_l3._l3_is_first_frag

#define flow_classq_hash        flow_classq._fcq_hash
#define flow_classq_flags       flow_classq._fcq_flags

#define flow_src_token          flow_source._fsrc_id_32[0]
#define flow_src_id             flow_source._fsrc_id
#define flow_src_fidx           flow_source._fsrc_fidx
#define flow_src_type           flow_source._fsrc_type

#define flow_policy_id          flow_policy._fpc_id
#define flow_policy_euuid       flow_policy._fpc_euuid

	/*
	 * Layer 4.  Ports are in network byte order.
	 */
	union {
		struct __flow_l4 {
			union {
				/* TCP: mirrors the on-wire header fields */
				struct __flow_l4_tcp {
					in_port_t _src;
					in_port_t _dst;
					uint32_t _seq;
					uint32_t _ack;
					union {
						struct {
#if BYTE_ORDER == LITTLE_ENDIAN
							uint8_t _tcp_res:4;
							uint8_t _off:4;
#else /* BYTE_ORDER == BIG_ENDIAN */
							uint8_t _off:4;
							uint8_t _tcp_res:4;
#endif /* BYTE_ORDER == BIG_ENDIAN */
							uint8_t _flags;
							uint16_t _win;
						};
						/* combined view of offset/flags/window */
						uint32_t _ofw;
					};
				} _l4_tcp;
				struct __flow_l4_udp {
					in_port_t _src;
					in_port_t _dst;
					uint32_t _ls;
				} _l4_udp;
				struct __flow_l4_esp {
					uint32_t _spi; /* security parameter index */
				} _l4_esp;
			};
			uint8_t _l4_hlen;       /* L4 header length */
			uint8_t _l4_agg_fast;
			uint8_t _l4_pad[6];
			/* pointer to L4 header (kernel address; see XXX above) */
			mach_vm_address_t _l4_ptr;
		} flow_l4;
		/* raw 64-bit view of flow_l4 */
		uint64_t flow_l4_data[4];
	} __attribute((aligned(sizeof(uint64_t))));
/* convenience accessors into flow_l4 */
#define flow_tcp                flow_l4._l4_tcp
#define flow_tcp_src            flow_l4._l4_tcp._src
#define flow_tcp_dst            flow_l4._l4_tcp._dst
#define flow_tcp_seq            flow_l4._l4_tcp._seq
#define flow_tcp_ack            flow_l4._l4_tcp._ack
#define flow_tcp_off            flow_l4._l4_tcp._off
#define flow_tcp_flags          flow_l4._l4_tcp._flags
#define flow_tcp_win            flow_l4._l4_tcp._win
#define flow_tcp_hlen           flow_l4._l4_hlen
#define flow_tcp_hdr            flow_l4._l4_ptr
#define flow_tcp_agg_fast       flow_l4._l4_agg_fast
#define flow_udp                flow_l4._l4_udp
#define flow_udp_src            flow_l4._l4_udp._src
#define flow_udp_dst            flow_l4._l4_udp._dst
#define flow_udp_hlen           flow_l4._l4_hlen
#define flow_udp_hdr            flow_l4._l4_ptr
#define flow_esp_spi            flow_l4._l4_esp._spi
} __attribute((aligned(16)));
227 #endif /* BSD_KERNEL_PRIVATE */
228 
/*
 * Maximum size of L2, L3 & L4 headers combined.
 */
#define PKT_MAX_PROTO_HEADER_SIZE       256

/* buflet counts needed per packet, based on 2KB buflet size */
#define BUFLETS_MIN             1       /* Ethernet MTU (default) */
#define BUFLETS_9K_JUMBO        5       /* 9000 bytes MTU */
#define BUFLETS_GSO             46      /* 64KB GSO, Ethernet MTU */
238 
239 /*
240  * Common buflet structure shared by {__user,__kern}_buflet.
241  *
242  * |     boff    |   doff   |       dlen        |         |             |
243  * +-------------+----------+-------------------+---------+-------------+
244  * objaddr      baddr                                    dlim         objlim
245  */
struct __buflet {
	union {
		/* for skmem batch alloc/free */
		uint64_t __buflet_next;
		/* address of next buflet in chain */
		const mach_vm_address_t __nbft_addr;
	};
	/* buffer data address */
	const mach_vm_address_t __baddr;
	/* index of buflet object in the owning buflet region */
	const obj_idx_t __bft_idx;
	/* buffer object index in buffer region */
	const obj_idx_t __bidx;
	/* object index in buflet region of next buflet(for buflet chaining) */
	const obj_idx_t __nbft_idx;
	const uint16_t  __dlim;         /* maximum length */
	uint16_t        __dlen;         /* length of data in buflet */
	uint16_t        __doff;         /* offset of data in buflet */
	const uint16_t  __flag;         /* BUFLET_FLAG_* */
	const uint16_t  __gro_len;      /* length of each gro segment */
	/* offset of buf_addr relative to the start of the buffer object */
	const uint16_t  __buf_off;
#define BUFLET_FLAG_EXTERNAL    0x0001
#define BUFLET_FLAG_LARGE_BUF   0x0002 /* buflet holds large buffer */
#define BUFLET_FLAG_RAW         0x0004 /* buflet comes from the raw bflt cache */
} __attribute((packed));
272 
273 /*
274  * A buflet represents the smallest buffer fragment representing
275  * part of the packet.  The index refers to the position of the buflet
276  * in the pool, and the data length represents the actual payload
277  * size -- not the buflet size itself as it is fixed for all objects
278  * in the pool.
279  */
struct __user_buflet {
	/*
	 * Common area between user and kernel variants.
	 */
	struct __buflet buf_com;
/* accessor aliases into the common buflet area */
#define buf_addr        buf_com.__baddr
#define buf_nbft_addr   buf_com.__nbft_addr
#define buf_idx         buf_com.__bidx
#define buf_nbft_idx    buf_com.__nbft_idx
#define buf_dlim        buf_com.__dlim
#define buf_dlen        buf_com.__dlen
#define buf_doff        buf_com.__doff
#define buf_flag        buf_com.__flag
#define buf_bft_idx_reg buf_com.__bft_idx
#define buf_grolen      buf_com.__gro_len
#define buf_boff        buf_com.__buf_off
};
297 
/* true if the buflet is backed by a large buffer */
#define BUFLET_HAS_LARGE_BUF(_buf)          \
	(((_buf)->buf_flag & BUFLET_FLAG_LARGE_BUF) != 0)
/* true if the buflet came from the raw buflet cache */
#define BUFLET_FROM_RAW_BFLT_CACHE(_buf)    \
	(((_buf)->buf_flag & BUFLET_FLAG_RAW) != 0)

/*
 * Setters for the const-qualified buflet fields; __DECONST is used to
 * write through the const members, which are read-only everywhere else.
 */
#define BUF_BADDR(_buf, _addr)                                              \
	*__DECONST(mach_vm_address_t *, &(_buf)->buf_addr) =                \
	(mach_vm_address_t)(_addr)

#define BUF_BIDX(_buf, _idx)                                                \
	*__DECONST(obj_idx_t *, &(_buf)->buf_idx) = (obj_idx_t)(_idx)

#define BUF_NBFT_ADDR(_buf, _addr)                                          \
	*__DECONST(mach_vm_address_t *, &(_buf)->buf_nbft_addr) =           \
	(mach_vm_address_t)(_addr)

#define BUF_NBFT_IDX(_buf, _idx)                                            \
	*__DECONST(obj_idx_t *, &(_buf)->buf_nbft_idx) = (obj_idx_t)(_idx)

#define BUF_BFT_IDX_REG(_buf, _idx)    \
	*__DECONST(obj_idx_t *, &(_buf)->buf_bft_idx_reg) = (_idx)

/* chain user buflet _ubft after _pubft (stores both address and index) */
#define UBUF_LINK(_pubft, _ubft) do {                                   \
	ASSERT((_ubft) != NULL);                                        \
	BUF_NBFT_ADDR(_pubft, _ubft);                                   \
	BUF_NBFT_IDX(_pubft, (_ubft)->buf_bft_idx_reg);                 \
} while (0)
325 
#ifdef KERNEL
/*
 * Full buflet constructor: sets the buffer address/index, the next-buflet
 * linkage, data length/offset, and writes the const-qualified limit,
 * buffer offset, GRO length and flag fields.  The _CASSERTs pin the
 * sizes assumed by the raw pointer stores below.
 */
#define BUF_CTOR(_buf, _baddr, _bidx, _dlim, _dlen, _doff, _nbaddr, _nbidx, _bflag, _boff, _grolen) do {  \
	_CASSERT(sizeof ((_buf)->buf_addr) == sizeof (mach_vm_address_t)); \
	_CASSERT(sizeof ((_buf)->buf_idx) == sizeof (obj_idx_t));       \
	_CASSERT(sizeof ((_buf)->buf_dlim) == sizeof (uint16_t));       \
	_CASSERT(sizeof ((_buf)->buf_boff) == sizeof (uint16_t));       \
	_CASSERT(sizeof ((_buf)->buf_grolen) == sizeof (uint16_t));     \
	_CASSERT(sizeof ((_buf)->buf_flag) == sizeof (uint16_t));       \
	BUF_BADDR(_buf, _baddr);                                        \
	BUF_NBFT_ADDR(_buf, _nbaddr);                                   \
	BUF_BIDX(_buf, _bidx);                                          \
	BUF_NBFT_IDX(_buf, _nbidx);                                     \
	(_buf)->buf_dlen = (_dlen);                                     \
	(_buf)->buf_doff = (_doff);                                     \
	*(uint16_t *)(uintptr_t)&(_buf)->buf_dlim = (_dlim);            \
	*(uint16_t *)(uintptr_t)&(_buf)->buf_boff = (_boff);            \
	*(uint16_t *)(uintptr_t)&(_buf)->buf_grolen = (_grolen);        \
	*(uint16_t *)(uintptr_t)&(_buf)->buf_flag = (_bflag);           \
} while (0)

/* lightweight re-init: only resets the mutable data length/offset */
#define BUF_INIT(_buf, _dlen, _doff) do {                               \
	(_buf)->buf_dlen = (_dlen);                                     \
	(_buf)->buf_doff = (_doff);                                     \
} while (0)

#endif /* KERNEL */
352 
#ifdef KERNEL
/*
 * Sanity-check a buflet: the buffer address and its dlim-sized span must
 * lie within the owning buffer object, the address must sit exactly
 * buf_boff bytes into the object, the data span (doff + dlen) must fit
 * within dlim, and the GRO segment length must not exceed the data length.
 */
#define BUF_IN_RANGE(_buf)                                              \
	((_buf)->buf_addr >= (mach_vm_address_t)(_buf)->buf_objaddr &&  \
	((uintptr_t)(_buf)->buf_addr + (_buf)->buf_dlim) <=             \
	((uintptr_t)(_buf)->buf_objaddr + (_buf)->buf_objlim) &&        \
	((mach_vm_address_t)(_buf)->buf_objaddr + (_buf)->buf_boff == (_buf)->buf_addr) && \
	((_buf)->buf_doff + (_buf)->buf_dlen) <= (_buf)->buf_dlim &&    \
	(_buf)->buf_grolen <= (_buf)->buf_dlen)
#else /* !KERNEL */
/* userland can only validate the data span against the buflet limit */
#define BUF_IN_RANGE(_buf)                                              \
	(((_buf)->buf_doff + (_buf)->buf_dlen) <= (_buf)->buf_dlim)
#endif /* !KERNEL */
365 
366 /*
 * Metadata preamble.  This structure is placed at the beginning of each
368  * __{user,kern}_{quantum,packet} object.  Each user metadata object has a
369  * unique red zone pattern, which is an XOR of the redzone cookie and
370  * offset of the metadata object in the object's region.  Due to the use
371  * of tagged pointer, we need the structure size to be multiples of 16.
372  * See SK_PTR_TAG() definition for details.
373  */
struct __metadata_preamble {
	union {
		uint64_t        _mdp_next;      /* for batch alloc/free (K) */
		uint64_t        mdp_redzone;    /* red zone cookie (U) */
	};
	const obj_idx_t         mdp_idx;        /* index within region (UK) */
	uint16_t                mdp_type;       /* nexus_meta_type_t (UK) */
	uint16_t                mdp_subtype;    /* nexus_meta_subtype_t (UK) */
};
383 
#define METADATA_PREAMBLE_SZ    (sizeof (struct __metadata_preamble))

/* map a metadata object pointer back to its preceding preamble */
#define METADATA_PREAMBLE(_md)                  \
	((struct __metadata_preamble *)         \
	((mach_vm_address_t)(_md) - METADATA_PREAMBLE_SZ))

/* index of the metadata object within its region */
#define METADATA_IDX(_md)                       \
	(METADATA_PREAMBLE(_md)->mdp_idx)

/* nexus_meta_type_t of the metadata object */
#define METADATA_TYPE(_md)                      \
	(METADATA_PREAMBLE(_md)->mdp_type)

/* nexus_meta_subtype_t of the metadata object */
#define METADATA_SUBTYPE(_md)                   \
	(METADATA_PREAMBLE(_md)->mdp_subtype)
398 
399 /*
400  * Common packet structure shared by {__user,__kern}_quantum.
401  */
struct __quantum {
	/* flow identifier, accessible as a UUID or as raw 8/16/32/64-bit words */
	union {
		uuid_t          __uuid;         /* flow UUID */
		uint8_t         __val8[16];
		uint16_t        __val16[8];
		uint32_t        __val32[4];
		uint64_t        __val64[2];
	} __flow_id_u;
#define __q_flow_id             __flow_id_u.__uuid
#define __q_flow_id_val8        __flow_id_u.__val8
#define __q_flow_id_val16       __flow_id_u.__val16
#define __q_flow_id_val32       __flow_id_u.__val32
#define __q_flow_id_val64       __flow_id_u.__val64

	uint32_t                __q_len;        /* data length */

	/* QoS service class, see packet_svc_class_t */
	uint32_t                __q_svc_class;  /* PKT_SC_* values */

	/*
	 * See notes on _QUM_{INTERNALIZE,EXTERNALIZE}() regarding
	 * portion of this structure above __flags that gets copied.
	 * Adding more user-mutable fields after __flags would also
	 * require adjusting those macros as well.
	 */
	volatile uint16_t       __q_flags;      /* QUMF_* flags */
	uint16_t                __q_pad[3];
} __attribute((aligned(sizeof(uint64_t))));
430 
431 /*
432  * Quantum.
433  *
434  * This structure is aligned for efficient copy and accesses.
435  * It is the user version of the __kernel_quantum structure.
436  *
437  * XXX: Do NOT store kernel pointer/address values here.
438  */
struct __user_quantum {
	/*
	 * Common area between user and kernel variants.
	 */
	struct __quantum qum_com;
/* accessor aliases into the common quantum area */
#define qum_flow_id             qum_com.__q_flow_id
#define qum_flow_id_val8        qum_com.__q_flow_id_val8
#define qum_flow_id_val16       qum_com.__q_flow_id_val16
#define qum_flow_id_val32       qum_com.__q_flow_id_val32
#define qum_flow_id_val64       qum_com.__q_flow_id_val64
#define qum_len                 qum_com.__q_len
#define qum_qflags              qum_com.__q_flags
#define qum_svc_class           qum_com.__q_svc_class

	/*
	 * Userland specific.
	 */
	struct __user_buflet    qum_buf[1];             /* 1 buflet */
	/*
	 * use count for packet.
	 */
	uint16_t qum_usecnt;
} __attribute((aligned(sizeof(uint64_t))));
462 
463 /*
464  * Valid values for (16-bit) qum_qflags.
465  */
#define QUM_F_FINALIZED         0x0001  /* has been finalized */
#define QUM_F_DROPPED           0x0002  /* has been dropped */
#define QUM_F_FLOW_CLASSIFIED   0x0010  /* flow has been classified */
#ifdef KERNEL
#define QUM_F_INTERNALIZED      0x1000  /* has been internalized */
#define QUM_F_KERNEL_ONLY       0x8000  /* kernel only; no user counterpart */

/* invariant flags we want to keep */
#define QUM_F_SAVE_MASK         (QUM_F_KERNEL_ONLY)
/* kernel-only flags that are never externalized */
#define QUM_F_KERNEL_FLAGS      (QUM_F_INTERNALIZED|QUM_F_KERNEL_ONLY)
#endif /* KERNEL */
478 
#ifdef KERNEL
/*
 * Kernel quantum constructor: clears the flow ID, sets flags and length,
 * records the metadata index (written through the const-qualified field),
 * and fully constructs the single embedded buflet.
 */
#define _KQUM_CTOR(_kqum, _flags, _len, _baddr, _bidx, _dlim, _qidx) do {    \
	(_kqum)->qum_flow_id_val64[0] = 0;                                   \
	(_kqum)->qum_flow_id_val64[1] = 0;                                   \
	(_kqum)->qum_qflags = (_flags);                                      \
	(_kqum)->qum_len = (_len);                                           \
	_CASSERT(sizeof(METADATA_IDX(_kqum)) == sizeof(obj_idx_t));          \
	*(obj_idx_t *)(uintptr_t)&METADATA_IDX(_kqum) = (_qidx);             \
	BUF_CTOR(&(_kqum)->qum_buf[0], (_baddr), (_bidx), (_dlim), 0, 0, 0,  \
	    OBJ_IDX_NONE, 0, 0, 0);                                          \
} while (0)

/*
 * Lightweight re-init of an already-constructed kernel quantum: resets
 * the flow ID, flags and length, and the buflet's data length/offset.
 */
#define _KQUM_INIT(_kqum, _flags, _len, _qidx) do {                          \
	(_kqum)->qum_flow_id_val64[0] = 0;                                   \
	(_kqum)->qum_flow_id_val64[1] = 0;                                   \
	(_kqum)->qum_qflags = (_flags);                                      \
	(_kqum)->qum_len = (_len);                                           \
	BUF_INIT(&(_kqum)->qum_buf[0], 0, 0);                                \
} while (0)
#endif /* KERNEL */
499 
500 /*
501  * Common packet structure shared by {__user,__kern}_packet.
502  */
struct __packet_com {
	/* Link layer (offset relevant to first buflet) */
	uint16_t __link_flags;                          /* PKT_LINKF_* flags */

	/*
	 * Headroom/protocol header length
	 *
	 * Since the security model of Skywalk nexus is that we don't trust
	 * packets either from above (userspace) or below (driver/firmware),
	 * the only metadata field that nexus makes use of from external is the
	 * headroom. Based on headroom, the flowswitch starts demux routine on
	 * l2 header, if any. The l2_len is stored in this step. Then the flow
	 * extraction (l3+l4 flow) begins parsing from (headroom + l2_len).
	 *
	 * __headroom is the empty buffer space before any packet data,
	 * it is also the equivalent to the first header offset.
	 *
	 * __l2_len is l2 (link layer) protocol header length, if any.
	 */
	uint8_t __headroom;
	uint8_t __l2_len;

	/*
	 * Checksum offload.
	 *
	 * Partial checksum does not require any header parsing and is
	 * therefore simpler to implement both in software and hardware.
	 *
	 * On transmit, PKT_CSUMF_PARTIAL indicates that a partial one's
	 * complement checksum to be computed on the span starting from
	 * pkt_csum_tx_start_off to the end of the packet, and have the
	 * resulting checksum value written at the location specified by
	 * pkt_csum_tx_stuff_off.
	 *
	 * The PKT_CSUMF_ZERO_INVERT flag is used on transmit to indicate
	 * that the value 0xffff (negative 0 in one's complement) must be
	 * substituted for the value of 0.  This is set for UDP packets,
	 * since otherwise the receiver may not validate the checksum
	 * (UDP/IPv4), or drop the packet altogether (UDP/IPv6).
	 *
	 * On receive, PKT_CSUMF_PARTIAL indicates that a partial one's
	 * complement checksum has been computed on the span beginning at
	 * pkt_csum_rx_start_off to the end of the packet, and that the
	 * computed value is now stored in pkt_csum_rx_value.
	 *
	 * All offsets are relative to the base of the first buflet.
	 */
	uint32_t __csum_flags;                          /* PKT_CSUMF_* flags */
	union {
		struct {
			uint16_t __csum_start_off;      /* start offset */
			uint16_t __csum_value;          /* checksum value */
		} __csum_rx;
		struct {
			uint16_t __csum_start_off;      /* start offset */
			uint16_t __csum_stuff_off;      /* stuff offset */
		} __csum_tx;
		uint32_t __csum_data;                   /* raw 32-bit view */
	};

	/* Compression generation count */
	uint32_t __comp_gencnt;

	/*
	 * Trace ID for each sampled packet.
	 * Non-zero ID indicates that the packet is being actively traced.
	 */
	packet_trace_id_t __trace_id;

	/* Aggregation type */
	uint8_t __aggr_type;                     /* PKT_AGGR_* values */
	uint8_t __seg_cnt;                       /* Number of LRO-packets */

	uint16_t __proto_seg_sz;                 /* Protocol segment size */

	/*
	 * See notes on _PKT_{INTERNALIZE,EXTERNALIZE}() regarding portion
	 * of this structure above __p_flags that gets copied.  Adding
	 * more user-mutable fields after __p_flags would also require
	 * adjusting those macros as well.
	 */
	union {
		volatile uint32_t __flags32[2];
		volatile uint64_t __flags;              /* PKT_F_* flags */
	};
} __attribute((aligned(sizeof(uint64_t))));
589 
struct __packet {
	union {
		uint64_t                __pkt_data[4];  /* raw 64-bit view */
		struct __packet_com     __pkt_com;
	};
/* accessor aliases into the common packet area */
#define __p_link_flags          __pkt_com.__link_flags
#define __p_headroom            __pkt_com.__headroom
#define __p_l2_len              __pkt_com.__l2_len
#define __p_csum_flags          __pkt_com.__csum_flags
#define __p_csum_rx             __pkt_com.__csum_rx
#define __p_csum_tx             __pkt_com.__csum_tx
#define __p_csum_data           __pkt_com.__csum_data
#define __p_comp_gencnt         __pkt_com.__comp_gencnt
#define __p_aggr_type           __pkt_com.__aggr_type
#define __p_seg_cnt             __pkt_com.__seg_cnt
#define __p_proto_seg_sz        __pkt_com.__proto_seg_sz
#define __p_trace_id            __pkt_com.__trace_id
#define __p_flags32             __pkt_com.__flags32
#define __p_flags               __pkt_com.__flags
};
610 
/* optional packet token types (see __packet_opt_com.__token_type) */
#define PKT_OPT_TOKEN_TYPE_OPAQUE       1 /* token has opaque data */
#define PKT_OPT_TOKEN_TYPE_PACKET_ID    2 /* token has packet_id */

/* maximum token size, in bytes */
#define PKT_OPT_MAX_TOKEN_SIZE          16
617 
struct __packet_opt_com {
	union {
		uint64_t        __token_data[2];                 /* 64-bit view */
		uint8_t         __token[PKT_OPT_MAX_TOKEN_SIZE]; /* token bytes */
	};
	uint64_t        __expire_ts;    /* expiration timestamp */
	uint16_t        __vlan_tag;     /* VLAN tag */
	uint16_t        __token_len;    /* valid bytes in __token */
	uint8_t         __token_type;   /* PKT_OPT_TOKEN_TYPE_* */
	uint8_t         __expiry_action;
	uint8_t         __app_type;
	uint8_t         __app_metadata;
} __attribute((aligned(sizeof(uint64_t))));
631 
struct __packet_opt {
	union {
		uint64_t                __pkt_opt_data[4];  /* raw 64-bit view */
		struct __packet_opt_com __pkt_opt_com;
	};
/* accessor aliases into the common packet option area */
#define __po_token_type         __pkt_opt_com.__token_type
#define __po_token_len          __pkt_opt_com.__token_len
#define __po_vlan_tag           __pkt_opt_com.__vlan_tag
#define __po_token_data         __pkt_opt_com.__token_data
#define __po_token              __pkt_opt_com.__token
#define __po_expire_ts          __pkt_opt_com.__expire_ts
#define __po_expiry_action      __pkt_opt_com.__expiry_action
#define __po_app_type           __pkt_opt_com.__app_type
#define __po_app_metadata       __pkt_opt_com.__app_metadata
};
647 
648 /*
649  * Packet.
650  *
651  * This structure is aligned for efficient copy and accesses.
652  * It is the user version of the __kern_packet structure.
653  *
654  * XXX: Do NOT store kernel pointer/address values here.
655  */
struct __user_packet {
	struct __user_quantum   pkt_qum;
/*
 * pkt_flow_id is the flow identifier used by user space stack to identify a
 * flow. This identifier is passed as a metadata on all packets generated by
 * the user space stack. On RX flowswitch fills in this metadata on every
 * packet and can be used by user space stack for flow classification purposes.
 */
#define pkt_flow_id             pkt_qum.qum_flow_id
#define pkt_flow_id_64          pkt_qum.qum_flow_id_val64
#define pkt_qum_qflags          pkt_qum.qum_qflags
#define pkt_length              pkt_qum.qum_len
#define pkt_qum_buf             pkt_qum.qum_buf[0]
#define pkt_svc_class           pkt_qum.qum_svc_class
#ifdef KERNEL
/*
 * pkt_flow_token is a globally unique flow identifier generated by the
 * flowswitch for each flow. Flowswitch stamps every TX packet with this
 * identifier. This is the flow identifier which would be visible to the AQM
 * logic and the driver.
 * pkt_flow_token uses the first 4 bytes of pkt_flow_id as the storage space.
 * This is not a problem as pkt_flow_id is only for flowswitch consumption
 * and is not required by any other module after the flowswitch TX processing
 * stage.
 */
#define pkt_flow_token          pkt_qum.qum_flow_id_val32[0]
#endif /* KERNEL */

	/*
	 * Common area between user and kernel variants.
	 */
	struct __packet pkt_com;
#define pkt_link_flags          pkt_com.__p_link_flags
#define pkt_headroom            pkt_com.__p_headroom
#define pkt_l2_len              pkt_com.__p_l2_len
#define pkt_csum_flags          pkt_com.__p_csum_flags
#define pkt_csum_rx_start_off   pkt_com.__p_csum_rx.__csum_start_off
#define pkt_csum_rx_value       pkt_com.__p_csum_rx.__csum_value
#define pkt_csum_tx_start_off   pkt_com.__p_csum_tx.__csum_start_off
#define pkt_csum_tx_stuff_off   pkt_com.__p_csum_tx.__csum_stuff_off
#define pkt_csum_data           pkt_com.__p_csum_data
#define pkt_comp_gencnt         pkt_com.__p_comp_gencnt
#define pkt_aggr_type           pkt_com.__p_aggr_type
#define pkt_seg_cnt             pkt_com.__p_seg_cnt
#define pkt_proto_seg_sz        pkt_com.__p_proto_seg_sz
#define pkt_trace_id            pkt_com.__p_trace_id
/* pkt_pflags32 aliases the low 32 bits of the 64-bit pkt_pflags */
#if BYTE_ORDER == LITTLE_ENDIAN
#define pkt_pflags32            pkt_com.__p_flags32[0]
#else /* BYTE_ORDER != LITTLE_ENDIAN */
#define pkt_pflags32            pkt_com.__p_flags32[1]
#endif /* BYTE_ORDER != LITTLE_ENDIAN */
#define pkt_pflags              pkt_com.__p_flags

	/*
	 * Optional common metadata.
	 */
	struct __packet_opt pkt_com_opt;

	/*
	 * Userland specific.
	 */

	/*
	 * pkt_{bufs,max} aren't part of the common area, on purpose,
	 * since we selectively update them on internalize/externalize.
	 */
	const uint16_t  pkt_bufs_max;       /* maximum size of buflet chain */
	const uint16_t  pkt_bufs_cnt;       /* buflet chain size */
} __attribute((aligned(sizeof(uint64_t))));
725 
/*
 * The size of the __user_packet structure for n total buflets.
 * Currently independent of n: the user-visible packet always embeds
 * exactly one buflet (qum_buf[1]), so the size is constant.
 */
#define _USER_PACKET_SIZE(n) sizeof(struct __user_packet)
728 
/*
 * Valid values for pkt_link_flags.
 */
#define PKT_LINKF_BCAST         0x0001  /* send/received as link-level bcast */
#define PKT_LINKF_MCAST         0x0002  /* send/received as link-level mcast */
#define PKT_LINKF_ETHFCS        0x0004  /* has Ethernet FCS */
735 
736 /*
737  * XXX IMPORTANT - READ THIS XXX
738  *
739  * Valid values for (64-bit) pkt_pflags.
740  *
741  * The lower 32-bit values are equivalent to PKTF_* flags used by mbufs,
742  * hence the unused values are reserved.  Do not use define any of these
743  * values unless they correspond to PKTF_* flags.  Make sure to do the
744  * following when adding a value in the lower 32-bit range:
745  *
746  * a. If the flag is kernel-only, prefix it with 2 underscore characters,
747  *    then add a PKT_F_* alias under the KERNEL block conditional.  This
748  *    will help ensure that the libsyscall code doesn't mistakenly use it.
749  *
750  * b. In pp_init(), add compile-time assertion to ensure that the PKT_F_*
751  *    value matches the corresponding PKTF_* as defined in <sys/mbuf.h>.
752  *
753  * c. Add the new flag to PKT_F_USER_MASK depending on whether it's allowed
754  *    to be used by userland.  Flags not added to this mask will only be
755  *    used by the kernel.  We only internalize and externalize flags listed
756  *    in PKT_F_USER_MASK.
757  *
758  * d. Add the new flag to PKT_F_COMMON_MASK.
759  *
760  * When adding an upper 32-bit value, ensure (a) and (c) above are done.
761  *
762  * Legend:
763  *
764  * (K)        - Kernel-only
765  * (U+K)      - User and kernel
766  * (reserved) - Only to be used for mapping with mbuf PKTF_* flags
767  */
768 #define __PKT_F_FLOW_ID         0x0000000000000001ULL /* (K) */
769 #define __PKT_F_FLOW_ADV        0x0000000000000002ULL /* (K) */
770 /*                              0x0000000000000004ULL    (reserved) */
771 /*                              0x0000000000000008ULL    (reserved) */
772 /*                              0x0000000000000010ULL    (reserved) */
773 /*                              0x0000000000000020ULL    (reserved) */
774 /*                              0x0000000000000040ULL    (reserved) */
775 /*                              0x0000000000000080ULL    (reserved) */
776 /*                              0x0000000000000100ULL    (reserved) */
777 /*                              0x0000000000000200ULL    (reserved) */
778 #define PKT_F_WAKE_PKT          0x0000000000000400ULL /* (U+K) */
779 /*                              0x0000000000000800ULL    (reserved) */
780 /*                              0x0000000000001000ULL    (reserved) */
781 /*                              0x0000000000002000ULL    (reserved) */
782 /*                              0x0000000000004000ULL    (reserved) */
783 #define PKT_F_BACKGROUND        0x0000000000008000ULL /* (U+K) */
784 /*                              0x0000000000010000ULL    (reserved) */
785 /*                              0x0000000000020000ULL    (reserved) */
786 #define PKT_F_KEEPALIVE         0x0000000000040000ULL /* (U+K) */
787 #define PKT_F_REALTIME          0x0000000000080000ULL /* (U+K) */
788 /*                              0x0000000000100000ULL    (reserved) */
789 #define PKT_F_REXMT             0x0000000000200000ULL /* (U+K) */
790 /*                              0x0000000000400000ULL    (reserved) */
791 #define __PKT_F_TX_COMPL_TS_REQ 0x0000000000800000ULL /* (K) */
792 #define __PKT_F_TS_VALID        0x0000000001000000ULL /* (K) */
793 /*                              0x0000000002000000ULL    (reserved) */
794 #define __PKT_F_NEW_FLOW        0x0000000004000000ULL /* (K) */
795 #define __PKT_F_START_SEQ       0x0000000008000000ULL /* (K) */
796 #define PKT_F_LAST_PKT          0x0000000010000000ULL /* (U+K) */
797 /*                              0x0000000020000000ULL    (reserved) */
798 /*                              0x0000000040000000ULL    (reserved) */
799 /*                              0x0000000080000000ULL    (reserved) */
800 /*                              ---------------------    upper 32-bit below */
801 #define PKT_F_OPT_GROUP_START   0x0000000100000000ULL /* (U+K) */
802 #define PKT_F_OPT_GROUP_END     0x0000000200000000ULL /* (U+K) */
803 #define PKT_F_OPT_EXPIRE_TS     0x0000000400000000ULL /* (U+K) */
804 #define PKT_F_OPT_TOKEN         0x0000000800000000ULL /* (U+K) */
805 #define __PKT_F_FLOW_DATA       0x0000001000000000ULL /* (K) */
806 #define __PKT_F_TX_COMPL_DATA   0x0000002000000000ULL /* (K) */
807 #define __PKT_F_MBUF_DATA       0x0000004000000000ULL /* (K) */
808 #define PKT_F_TRUNCATED         0x0000008000000000ULL /* (U+K) */
809 #define __PKT_F_PKT_DATA        0x0000010000000000ULL /* (K) */
810 #define PKT_F_PROMISC           0x0000020000000000ULL /* (U+K) */
811 #define PKT_F_OPT_VLTAG         0x0000040000000000ULL /* (U+K) */
812 #define PKT_F_OPT_VLTAG_IN_PKT  0x0000080000000000ULL /* (U+K) */
813 #define __PKT_F_TX_PORT_DATA    0x0000100000000000ULL /* (K) */
814 #define PKT_F_OPT_EXP_ACTION    0x0000200000000000ULL /* (U+K) */
815 #define PKT_F_OPT_APP_METADATA  0x0000400000000000ULL /* (U+K) */
816 #define PKT_F_L4S               0x0000800000000000ULL /* (U+K) */
817 /*                              0x0001000000000000ULL */
818 /*                              0x0002000000000000ULL */
819 /*                              0x0004000000000000ULL */
820 /*                              0x0008000000000000ULL */
821 /*                              0x0010000000000000ULL */
822 /*                              0x0020000000000000ULL */
823 /*                              0x0040000000000000ULL */
824 /*                              0x0080000000000000ULL */
825 #define __PKT_F_OPT_ALLOC       0x0100000000000000ULL /* (K) */
826 #define __PKT_F_FLOW_ALLOC      0x0200000000000000ULL /* (K) */
827 #define __PKT_F_TX_COMPL_ALLOC  0x0400000000000000ULL /* (K) */
828 /*                              0x0800000000000000ULL */
829 /*                              0x1000000000000000ULL */
830 /*                              0x2000000000000000ULL */
831 /*                              0x4000000000000000ULL */
832 /*                              0x8000000000000000ULL */
833 
834 /*
835  * Packet option flags.
836  */
837 #define PKT_F_OPT_DATA                                                  \
838 	(PKT_F_OPT_GROUP_START | PKT_F_OPT_GROUP_END |                  \
839 	PKT_F_OPT_EXPIRE_TS | PKT_F_OPT_TOKEN |                         \
840 	PKT_F_OPT_VLTAG | PKT_F_OPT_VLTAG_IN_PKT | PKT_F_OPT_EXP_ACTION | \
841 	PKT_F_OPT_APP_METADATA)
842 
#ifdef KERNEL
/*
 * Flags exposed to user (and kernel).  See notes above.  These are the
 * only flags internalized/externalized across the user-kernel boundary.
 */
#define PKT_F_USER_MASK                                                 \
	(PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT |              \
	PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC |               \
	PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S)

/*
 * Aliases for kernel-only flags.  See notes above.  The ones marked
 * with (common) have corresponding PKTF_* definitions and are also
 * included in PKT_F_COMMON_MASK below.
 */
#define PKT_F_FLOW_ID           __PKT_F_FLOW_ID         /* (common) */
#define PKT_F_FLOW_ADV          __PKT_F_FLOW_ADV        /* (common) */
#define PKT_F_TX_COMPL_TS_REQ   __PKT_F_TX_COMPL_TS_REQ /* (common) */
#define PKT_F_TS_VALID          __PKT_F_TS_VALID        /* (common) */
#define PKT_F_NEW_FLOW          __PKT_F_NEW_FLOW        /* (common) */
#define PKT_F_START_SEQ         __PKT_F_START_SEQ       /* (common) */
#define PKT_F_FLOW_DATA         __PKT_F_FLOW_DATA
#define PKT_F_TX_COMPL_DATA     __PKT_F_TX_COMPL_DATA
#define PKT_F_MBUF_DATA         __PKT_F_MBUF_DATA
#define PKT_F_PKT_DATA          __PKT_F_PKT_DATA
#define PKT_F_OPT_ALLOC         __PKT_F_OPT_ALLOC
#define PKT_F_FLOW_ALLOC        __PKT_F_FLOW_ALLOC
#define PKT_F_TX_COMPL_ALLOC    __PKT_F_TX_COMPL_ALLOC
#define PKT_F_TX_PORT_DATA      __PKT_F_TX_PORT_DATA

/*
 * Flags related to mbuf attached to the packet.
 */
#define PKT_F_MBUF_MASK         (PKT_F_MBUF_DATA | PKT_F_TRUNCATED)

/*
 * Flags related to packet attached to the packet.
 */
#define PKT_F_PKT_MASK         (PKT_F_PKT_DATA | PKT_F_TRUNCATED)

/*
 * Invariant flags kept during _PKT_COPY().  At the moment we keep
 * all except those related to the attached mbuf or attached packet
 * (i.e. everything outside PKT_F_MBUF_MASK and PKT_F_PKT_MASK).
 */
#define PKT_F_COPY_MASK         (~(PKT_F_MBUF_MASK | PKT_F_PKT_MASK))

/*
 * Lower 32-bit flags common to mbuf and __kern_packet.  See notes above.
 * DO NOT add flags to this mask unless they have equivalent PKTF_* flags
 * defined in <sys/mbuf.h>
 */
#define PKT_F_COMMON_MASK                                               \
	(PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT |              \
	PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV |               \
	PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW |       \
	PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT)

/*
 * Flags retained across alloc/free (allocation state of the optional
 * metadata sub-structures).
 */
#define PKT_F_INIT_MASK                                                 \
	(PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC | PKT_F_TX_COMPL_ALLOC)
#endif /* KERNEL */
905 
906 /*
907  * 64-bit tagged pointer (limit tag to least significant byte).
908  * We use 2 bits to encode type, and another 2 bits for subtype.
909  */
910 #define SK_PTR_TYPE_MASK        ((uint64_t)0x3)         /* 00 11 */
911 #define SK_PTR_SUBTYPE_MASK     ((uint64_t)0xc)         /* 11 00 */
912 #define SK_PTR_TAG_MASK         ((uint64_t)0xf)         /* 11 11 */
913 
914 #define SK_PTR_TAG(_p)          ((uint64_t)(_p) & SK_PTR_TAG_MASK)
915 #define SK_PTR_ADDR_MASK        (~SK_PTR_TAG_MASK)
916 
917 #define SK_PTR_TYPE(_p)         ((uint64_t)(_p) & SK_PTR_TYPE_MASK)
918 #define SK_PTR_TYPE_ENC(_t)     ((uint64_t)(_t) & SK_PTR_TYPE_MASK)
919 
920 #define SK_PTR_SUBTYPE(_p)      (((uint64_t)(_p) & SK_PTR_SUBTYPE_MASK) >> 2)
921 #define SK_PTR_SUBTYPE_ENC(_s)  (((uint64_t)(_s) << 2) & SK_PTR_SUBTYPE_MASK)
922 
923 #define SK_PTR_ADDR(_p)         ((uint64_t)(_p) & SK_PTR_ADDR_MASK)
924 #define SK_PTR_ADDR_ENC(_p)     ((uint64_t)(_p) & SK_PTR_ADDR_MASK)
925 
926 #define SK_PTR_ENCODE(_p, _t, _s)       \
927 	(SK_PTR_ADDR_ENC(_p) | SK_PTR_TYPE_ENC(_t) | SK_PTR_SUBTYPE_ENC(_s))
928 
929 #define SK_PTR_ADDR_UQUM(_ph)   ((struct __user_quantum *)SK_PTR_ADDR(_ph))
930 #define SK_PTR_ADDR_UPKT(_ph)   ((struct __user_packet *)SK_PTR_ADDR(_ph))
931 
#ifdef KERNEL
__BEGIN_DECLS
/*
 * Packets.
 */
/*
 * NOTE(review): presumably returns the mbuf attached to the packet
 * (cf. __PKT_F_MBUF_DATA above) — confirm against the definition.
 */
extern struct mbuf *kern_packet_get_mbuf(const kern_packet_t);
__END_DECLS
#else /* !KERNEL */
#if defined(LIBSYSCALL_INTERFACE)
__BEGIN_DECLS
/* Userland assertion-failure helpers for tagged-pointer type checks. */
extern void pkt_subtype_assert_fail(const packet_t, uint64_t, uint64_t);
extern void pkt_type_assert_fail(const packet_t, uint64_t);
__END_DECLS
#endif /* LIBSYSCALL_INTERFACE */
#endif /* !KERNEL */
947 #if defined(LIBSYSCALL_INTERFACE) || defined(BSD_KERNEL_PRIVATE)
948 #include <skywalk/packet_common.h>
949 #endif /* LIBSYSCALL_INTERFACE || BSD_KERNEL_PRIVATE */
950 #endif /* PRIVATE || BSD_KERNEL_PRIVATE */
951 #endif /* !_SKYWALK_OS_PACKET_PRIVATE_H_ */
952