xref: /xnu-12377.81.4/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796) !
1 /*
2  * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on an aggregated (super) IP datagram length */
#define MAX_AGG_IP_LEN()        MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
/* Max buflets chained under one super packet */
#define MAX_BUFLET_COUNT        (32)
/* TCP flags that disqualify a segment from aggregation */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* Packet's payload still resides in an attached mbuf (compat driver path) */
#define PKT_IS_MBUF(_pkt)       ((_pkt)->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        ((_pkt)->pkt_pflags & PKT_F_TRUNCATED))
/*
 * Wake-packet flag lives in the mbuf pkthdr when the data is mbuf-backed,
 * otherwise in the packet's own pflags.  Note: use the macro parameter
 * (_pkt) throughout — the previous version dereferenced a caller variable
 * named 'pkt', which silently broke for any other argument name.
 */
#define PKT_IS_WAKE_PKT(_pkt)   ((PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	                        (!PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_pflags & PKT_F_WAKE_PKT)))
51 
52 
/*
 * Routine used to fold a changed 16-bit header word into an existing
 * one's-complement checksum: (csum, old-word, new-word) -> new csum.
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Incremental checksum fixup; definition appears later in this file */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant, selected when the header checksum must be left as-is */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 * State tracked while TCP segments of one flow are being coalesced into a
 * single "super" object, which is either an mbuf chain or a kernel packet.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;       /* generic view of super obj */
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *__indexable _fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
			/* function that fix packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		/* raw overlay used for bulk zeroing (see FLOW_AGG_CLEAR) */
		uint64_t __flow_agg_data[5];
	};
/* convenience accessors for the nested union members */
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
#define fa_fix_pkt_sum   __flow_agg._fa_fix_pkt_sum
};
98 
#if __has_ptrcheck
/*
 * Reset the aggregation state.  With -fbounds-safety the __indexable
 * fa_sptr widens the struct to 48 bytes; the static_asserts pin the layout
 * so the fixed-size sk_zero_* helper stays in sync with the struct.
 * fa_fix_pkt_sum is cleared explicitly as well — NOTE(review): looks
 * redundant after the bulk zero, presumably kept for pointer-metadata
 * hygiene under bounds checking; confirm before removing.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	static_assert(sizeof(struct flow_agg) == 48);         \
	static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40);              \
	sk_zero_48(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
#else
/*
 * Non-bounds-checked layout: 40 bytes total, function pointer at offset 32.
 * sk_zero_32 only clears the first 32 bytes, so the trailing function
 * pointer must be zeroed explicitly here.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	static_assert(sizeof(struct flow_agg) == 40);         \
	static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);              \
	sk_zero_32(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
#endif
114 
#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

/*
 * Bit mask laid over an IPv4+TCP header pair when comparing a candidate
 * segment's headers against the super packet's: a set bit means that bit
 * must be identical for the segment to be aggregated.  Both the v4 and v6
 * variants are zero-padded out to MASK_SIZE so the masked-compare helpers
 * can use a fixed width.
 */
struct ip_tcp_mask {
	struct ip       ip_m;           /* mask over the IPv4 header */
	struct tcphdr   tcp_m;          /* mask over the TCP header */
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)]; /* mask over TCP options */
};
122 
/*
 * IPv4+TCP comparison mask.  Fields that legitimately differ between
 * segments of one flow (ip_len, ip_id, ip_sum, th_seq, th_sum, and the
 * PSH flag) are masked out; everything else must match exactly.
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,     /* per-datagram, not checked */
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,    /* recomputed per datagram, not checked */
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,    /* advances per segment, checked separately */
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,   /* PSH may differ between segments */
		.th_win = 0xffff,
		.th_sum = 0,    /* per-segment, not checked */
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
166 
/* IPv6 counterpart of struct ip_tcp_mask, padded out to MASK_SIZE bytes */
struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;          /* mask over the IPv6 header */
	struct tcphdr   tcp_m;          /* mask over the TCP header */
	uint32_t        tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};
172 
173 static const struct ip6_tcp_mask ip6_tcp_mask
174 __sk_aligned(16) =
175 {
176 	.ip6_m = {
177 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
178 		/* Not checked; aggregated packet's ip_len is increasing */
179 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
180 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
181 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
182 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
183 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
184 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
185 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
186 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
187 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
188 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
189 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
190 	},
191 	.tcp_m = {
192 		.th_sport = 0xffff,
193 		.th_dport = 0xffff,
194 		.th_seq = 0,
195 		.th_ack = 0xffffffff,
196 		.th_x2 = 0xf,
197 		.th_off = 0xf,
198 		.th_flags = ~TH_PUSH,
199 		.th_win = 0xffff,
200 		.th_sum = 0,
201 		.th_urp = 0xffff,
202 	},
203 	.tcp_option_m = {
204 		/* Max 40 bytes of TCP options */
205 		0xffffffff,
206 		0xffffffff,
207 		0xffffffff,
208 		0,          /* Filling up to MASK_SIZE */
209 		0,          /* Filling up to MASK_SIZE */
210 	},
211 };
212 
213 #if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/*
	 * Debug-only dump of one packet: length, RX checksum metadata and,
	 * for destination (super) packets, a hexdump of each buflet.
	 * is_input selects the "s" (source) vs "d" (destination) prefix.
	 */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	if (!is_input) {
		/* destination packets may span multiple buflets */
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt %p plen %u",
	    sk_proc_name(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", __buflet_get_data_address(buf),
			    __buflet_get_data_length(buf), 128));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}
248 
/* Cheap wrapper: skip the call entirely unless verbose logging is enabled */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
254 
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/*
	 * Debug-only dump of a destination super mbuf: packet length, RX
	 * checksum metadata and a hexdump of the first mbuf's data.
	 */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u",
	    sk_proc_name(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m_mtod_current(m) != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m_mtod_current(m), m->m_len, 128));
}
275 
/* Cheap wrapper: skip the call entirely unless verbose logging is enabled */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
281 
SK_LOG_ATTRIBUTE
static void
_mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/*
	 * Debug-only walk of an mbuf packet chain (m_nextpkt links),
	 * logging length and RX checksum metadata for each packet.
	 * Unlike _mbuf_agg_log, no data hexdump is produced.
	 */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	while (m != NULL) {
		SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u",
		    sk_proc_name(p), sk_proc_pid(p), SK_KVA(m),
		    m->m_pkthdr.len);

		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);

		m = m->m_nextpkt;
	}
}
301 
/* Cheap wrapper: skip the call entirely unless verbose logging is enabled */
#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
307 #else
308 #define pkt_agg_log(...)
309 #define mbuf_agg_log(...)
310 #define mchain_agg_log(...)
311 #endif /* SK_LOG */
312 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies checksums for a packet whose data is mbuf-backed (compat driver
 * RX path) and computes *data_csum, the 16-bit folded sum of the TCP
 * payload alone, for later incremental super-packet checksum updates.
 * Returns true when the L4 (and, if verify_l3, the IPv4 header) checksum
 * is valid.  Also stashes full-checksum RX metadata on the packet, which
 * is later transferred to the super packet.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;       /* L3 offset within the mbuf */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/*
		 * Compute the data_csum by subtracting the TCP header and
		 * pseudo-header sums from the driver-provided full sum,
		 * rather than summing the payload directly.
		 */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* zero th_sum so it doesn't pollute the header sum below */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		/* full sum of 0xffff means the datagram verified clean */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* bad IPv4 header checksum; bail early */
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
423 
/* structure to pass an array of data buffers (buflets or mbufs) */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT]; /* when dba_is_buflet */
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];            /* otherwise */
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* selects which union member is active */
} _dbuf_array_t;
433 
/*
 * Copy plen bytes of the source packet, starting at offset soff, into the
 * destination buffers in dbuf (buflets or mbufs), appending to whatever
 * each buffer already holds and spilling into the next entry as each one
 * fills.  When do_csum is set, the running one's-complement sum is folded
 * into *partial_sum; *odd_start carries odd/even byte alignment across
 * buffer boundaries so the sum stays correct.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		/* locate the write position and remaining room in dbuf[i] */
		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			/* XXX -fbounds-safety: use the inline variant to return an __indexable */
			dbuf_addr = __buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}

		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			/* source bytes still live in the attached mbuf */
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* account for the bytes just appended to dbuf[i] */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			/* pkthdr length lives on the chain head (entry 0) */
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
494 
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Only the TCP payload is copied (headers are not); the payload is packed
 * into the tail of currm/currp first and overflows into dbuf.  Returns
 * true if the L4 checksum (and optionally the IPv4 header checksum)
 * verifies; on failure all destination lengths are rolled back so the
 * caller can treat the segment as non-aggregatable.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;            /* saved for rollback on bad csum */
	uint32_t curr_trailing;         /* free space at tail of currm/currp */
	char *curr_ptr;                 /* write position in currm/currp */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		curr_ptr = (char *)__buflet_get_data_address(currp) + currp->buf_doff +
		    currp->buf_dlen;
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checkum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* hardware already produced a full L4 sum? then skip software sum */
	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new length of the primary destination */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restore to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
686 
/*
 * Copy and checksum for packet or packet with mbuf
 * data_csum is only supported for bsd flows
 *
 * Unlike copy_pkt_csum_packed, this copies the full IP+TCP headers AND the
 * payload into dbuf (headers go into dbuf[0]; payload may overflow into
 * subsequent entries).  Verifies the L4 checksum (and optionally the IPv4
 * header checksum) along the way, storing full-checksum RX metadata on the
 * source packet for later transfer to the super packet.  Returns true when
 * the checksums verify.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	static_assert(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;         /* write position for headers in dbuf[0] */

	if (dbuf->dba_is_buflet) {
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		daddr = __buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only; hardware already produced a full L4 sum */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* commit the header bytes just written into dbuf[0] */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	/* bad IPv4 header checksum: payload was still copied, but fail */
	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
857 
858 SK_INLINE_ATTRIBUTE
859 static void
flow_agg_init_common(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * pkt)860 flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
861     struct __kern_packet *pkt)
862 {
863 	struct ifnet *ifp;
864 
865 	switch (pkt->pkt_flow_ip_ver) {
866 	case IPVERSION:
867 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
868 			return;
869 		}
870 		break;
871 	case IPV6_VERSION:
872 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
873 			return;
874 		}
875 		break;
876 	default:
877 		VERIFY(0);
878 		/* NOTREACHED */
879 		__builtin_unreachable();
880 	}
881 
882 	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
883 	fa->fa_ulen = pkt->pkt_flow_ulen;
884 	fa->fa_total = pkt->pkt_flow_ip_hlen +
885 	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
886 
887 	ifp = fsw->fsw_ifp;
888 	ASSERT(ifp != NULL);
889 	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
890 		/* in case hardware supports LRO, don't fix checksum in the header */
891 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
892 	} else {
893 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
894 	}
895 }
896 
897 static void
flow_agg_init_smbuf(struct nx_flowswitch * fsw,struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)898 flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
899     struct mbuf *smbuf, struct __kern_packet *pkt)
900 {
901 	FLOW_AGG_CLEAR(fa);
902 
903 	ASSERT(smbuf != NULL);
904 	fa->fa_smbuf = smbuf;
905 
906 	fa->fa_sptr = mtod(smbuf, uint8_t *);
907 	ASSERT(fa->fa_sptr != NULL);
908 
909 	/*
910 	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
911 	 * contents of the flow structure which don't exist in 'smbuf'.
912 	 */
913 	flow_agg_init_common(fsw, fa, pkt);
914 }
915 
916 static void
flow_agg_init_spkt(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)917 flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
918     struct __kern_packet *spkt, struct __kern_packet *pkt)
919 {
920 	FLOW_AGG_CLEAR(fa);
921 
922 	ASSERT(spkt != NULL);
923 	fa->fa_spkt = spkt;
924 	fa->fa_sobj_is_pkt = true;
925 	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
926 
927 	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
928 	ASSERT(fa->fa_sptr != NULL);
929 
930 	/*
931 	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
932 	 * contents of the flow structure which don't exist in 'spkt'.
933 	 */
934 	flow_agg_init_common(fsw, fa, pkt);
935 }
936 
937 /*
938  * -fbounds-safety: The reason hardcoded values 64 (and 80) are used here is
939  * because this function calls the 64-byte version of sk memcmp function (same
940  * thing for the 80-byte version). In can_agg_fastpath, there is a check being
941  * done for TCP header length with options: sizeof(struct tcphdr) +
942  * TCPOLEN_TSTAMP_APPA , which is 20 + 12 = 32 bytes. In case of IPv4, adding IP
943  * header size of 20 to it makes it 52 bytes. From the sk_memcmp_* variants, the
944  * closest one is the 64B option.
945  */
946 SK_INLINE_ATTRIBUTE
947 static bool
948 ipv4_tcp_memcmp(const uint8_t *__counted_by(64)h1, const uint8_t *__counted_by(64)h2)
949 {
950 	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
951 }
952 
953 SK_INLINE_ATTRIBUTE
954 static bool
955 ipv6_tcp_memcmp(const uint8_t *__counted_by(80)h1, const uint8_t *__counted_by(80)h2)
956 {
957 	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
958 }
959 
/*
 * Fast-path aggregation check: a single masked memcmp of the current
 * super-object's IP+TCP header against the new packet's header.  Only
 * applicable when the TCP header is exactly base header + timestamp
 * option (the common case).  On success, advances fa_tcp_seq past the
 * new payload and records its length; returns false to fall back to
 * can_agg_slowpath().
 */
SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;
	uint8_t *ip_hdr;

	ASSERT(fa->fa_sptr != NULL);
	/* the masked-compare helpers read exactly MASK_SIZE bytes */
	static_assert(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	static_assert(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	/* packet too short to be read through the full mask */
	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	/* super object's buffer too short for the masked compare */
	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	/* fast path assumes base TCP header + timestamp option only */
	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		/*
		 * -fbounds-safety: pkt->pkt_flow_ip_hdr is a mach_vm_address_t,
		 * so we forge it here. The reason the constant values 64 and 80
		 * are used is because ipv4_tcp_memcmp takes a __counted_by(64)
		 * and __counted_by(80), respectively.
		 */
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 64);
		match = ipv4_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	case IPV6_VERSION:
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 80);
		match = ipv6_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	/* payload length must match the previous segment's */
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	/* advance the expected next in-order sequence number */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}
1027 
/*
 * Slow-path aggregation check: field-by-field comparison of the super
 * object's IP and TCP headers against the new packet's headers.  Used
 * when the single masked compare in can_agg_fastpath() is not
 * applicable or did not match.  On success, advances fa_tcp_seq past
 * the new payload and records its length.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;
	/* -fbounds-safety: pkt_flow_ip_hdr is a mach_vm_address_t */
	uint8_t *l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_ip_hdr, pkt->pkt_flow_ip_hlen);
	/* NOTE(review): sl3tlen is computed below but never consumed */
	uint32_t sl3tlen = 0;
	uint16_t sl3hlen = 0;

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)(void *)l3_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* options (if any) follow the fixed header; must be equal */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp((uint8_t *)&sip6->ip6_flow, (uint8_t *)&ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	struct tcphdr *tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	uint16_t sl4hlen = (stcp->th_off << 2);
	/* memcmp avoids misaligned 32/16-bit loads on the raw header */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* all flags except PUSH must agree */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengths are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			uint32_t sts_hdr, ts_hdr;
			/* bcopy fallback: option area may be misaligned */
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* both must carry NOP,NOP,TSTAMP option prefix */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* advance the expected next in-order sequence number */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1221 
/*
 * Decide whether 'pkt' can be merged into the current super object
 * tracked by 'fa'.  Cheap disqualifiers first (TCP flags, pure ACK,
 * no super object yet, bcast/mcast, out-of-order sequence, size limit,
 * wake packet), then the fast-path masked compare, then the slow-path
 * field compare.  Side effect: sets pkt_flow_tcp_agg_fast when the
 * fast path matched, which flow_agg_merge_hdr() later consults.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* clear any stale fast-path marker before re-evaluating */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. pkt was received as a broadcast / multicast
	 * 5. TCP sequence number is not expected
	 * 6. We would've exceeded the maximum aggregated size
	 * 7. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL ||
	    (pkt->pkt_link_flags & (PKT_LINKF_BCAST | PKT_LINKF_MCAST)) != 0)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1283 
/*
 * Incremental checksum fixup for a 16-bit field change (old -> new),
 * delegated to __packet_fix_sum().  Installed as fa_fix_pkt_sum when
 * the interface does not do hardware LRO.
 */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	uint16_t fixed = __packet_fix_sum(csum, old, new);
	return fixed;
}
1289 
1290 static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum,uint16_t __unused old,uint16_t __unused new)1291 flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
1292     uint16_t __unused new)
1293 {
1294 	return 0;
1295 }
1296 
/*
 * Overwrite a 32-bit header field in place with 'new' and incrementally
 * patch the 16-bit checksum '*csum' for the change, folding the high
 * and low halves separately through fa->fa_fix_pkt_sum.
 */
static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa,
    uint8_t *__sized_by(sizeof(uint32_t))field, uint16_t *csum,
    uint32_t new)
{
	uint32_t old;
	/* memcpy: 'field' may not be 32-bit aligned */
	memcpy((uint8_t *)&old, field, sizeof(old));
	memcpy(field, (uint8_t *)&new, sizeof(uint32_t));
	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
	    (uint16_t)(old & 0xffff),
	    (uint16_t)(new & 0xffff));
}
1310 
/*
 * Merge 'pkt' into the current super object: grow the super object's
 * IP length, carry over mutable TCP fields (timestamp option, PUSH
 * flag), and incrementally update IP/TCP checksums via fa_fix_pkt_sum.
 * 'data_csum' is the payload checksum already computed by the caller
 * (copy_pkt_csum_packed).  Updates length/segment-count bookkeeping on
 * either the super packet or the super mbuf, whichever 'fa' tracks.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* fold the length delta into the IPv4 header checksum */
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    (struct tcphdr *)pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		/* slow-path match: only the timestamp option may differ */
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			/* TSval/TSecr sit 4 and 8 bytes into the option */
			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			tcp_seq *th_ack = &stcp->th_ack;
			/*
			 * -fbounds-safety: C-style cast (uint16_t *)(th_ack+1)
			 * doesn't work here, because th_ack's bound is a single
			 * uint32_t, so trying to go one address above, and then
			 * later dereferencing it would lead to a panic.
			 */
			uint16_t *next = __unsafe_forge_single(uint16_t *,
			    th_ack + 1);
			old = *next;
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			next = __unsafe_forge_single(uint16_t *, th_ack + 1);
			new = *next;
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		static_assert(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* saturate (don't wrap) the 8-bit segment counter */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.rx_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.rx_seg_cnt = 1;
		}
		static_assert(sizeof(result) == sizeof(smbuf->m_pkthdr.rx_seg_cnt));
		/* saturate (don't wrap) the 8-bit segment counter */
		if (!os_add_overflow(1, smbuf->m_pkthdr.rx_seg_cnt, &result)) {
			smbuf->m_pkthdr.rx_seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1492 
1493 /*
1494  * Copy metadata from source packet to destination packet
1495  */
1496 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1497 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1498 {
1499 	/* Copy packet metadata */
1500 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1501 	_PKT_COPY(spkt, dpkt);
1502 }
1503 
1504 static void
pkt_finalize(kern_packet_t ph)1505 pkt_finalize(kern_packet_t ph)
1506 {
1507 	int err = __packet_finalize(ph);
1508 	VERIFY(err == 0);
1509 #if (DEVELOPMENT || DEBUG)
1510 	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1511 	uint8_t *buf;
1512 	MD_BUFLET_ADDR_ABS(pkt, buf);
1513 	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
1514 	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
1515 	    uint8_t *, buf);
1516 #endif
1517 }
1518 
1519 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t total_bytes,uint32_t total_pkts,uint32_t min_bufsize,uint32_t agg_bufsize)1520 estimate_buf_cnt(struct flow_entry *fe, uint32_t total_bytes, uint32_t total_pkts,
1521     uint32_t min_bufsize, uint32_t agg_bufsize)
1522 {
1523 	uint32_t max_ip_len = MAX_AGG_IP_LEN();
1524 	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1525 	uint32_t hdr_overhead;
1526 
1527 	if (__improbable(sk_fsw_rx_agg_tcp == 0)) {
1528 		return MIN(total_pkts, MAX_BUFLET_COUNT);
1529 	}
1530 
1531 	agg_size = MIN(agg_size, agg_bufsize);
1532 
1533 	hdr_overhead = (total_bytes / max_ip_len) *
1534 	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1535 	    sizeof(struct tcphdr));
1536 
1537 	return ((total_bytes + hdr_overhead) / agg_size) + 1;
1538 }
1539 
1540 SK_INLINE_ATTRIBUTE
1541 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1542 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1543     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1544 {
1545 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1546 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1547 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1548 		pbuf = buf;
1549 		dbuf_array->dba_buflet[i] = NULL;
1550 	}
1551 	ASSERT(pbuf != NULL);
1552 	dbuf_array->dba_num_dbufs = 0;
1553 	*lbuf = pbuf;
1554 }
1555 
1556 SK_INLINE_ATTRIBUTE
1557 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1558 _free_dbuf_array(struct kern_pbufpool *pp,
1559     _dbuf_array_t *dbuf_array)
1560 {
1561 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1562 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1563 		pp_free_buflet(pp, buf);
1564 		dbuf_array->dba_buflet[i] = NULL;
1565 	}
1566 	dbuf_array->dba_num_dbufs = 0;
1567 }
1568 
1569 static inline void
finalize_super_packet(struct __kern_packet ** spkt,kern_packet_t * sph,struct flow_agg * fa,uint32_t * largest_spkt,uint16_t * spkts,uint16_t bufcnt)1570 finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
1571     struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
1572     uint16_t bufcnt)
1573 {
1574 	(*spkts)++;
1575 	if (bufcnt > 1) {
1576 		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1577 	}
1578 	pkt_finalize(*sph);
1579 	if ((*spkt)->pkt_length > *largest_spkt) {
1580 		*largest_spkt = (*spkt)->pkt_length;
1581 	}
1582 	pkt_agg_log(*spkt, kernproc, false);
1583 	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
1584 	*sph = 0;
1585 	*spkt = NULL;
1586 	FLOW_AGG_CLEAR(fa);
1587 }
1588 
1589 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1590 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1591 {
1592 	if (fe->fe_rx_largest_size > largest_agg_size) {
1593 		/*
1594 		 * Make it slowly move towards largest_agg_size if we
1595 		 * consistently get non-aggregatable size.
1596 		 *
1597 		 * If we start at 16K, this makes us go to 4K within 6 rounds
1598 		 * and down to 2K within 12 rounds.
1599 		 */
1600 		fe->fe_rx_largest_size -=
1601 		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1602 	} else {
1603 		fe->fe_rx_largest_size +=
1604 		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1605 	}
1606 }
1607 
1608 SK_NO_INLINE_ATTRIBUTE
1609 static void
flow_rx_agg_channel(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,bool is_mbuf)1610 flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1611     struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
1612 {
1613 #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {    \
1614 	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
1615 	(_pkt) = NULL;                                                     \
1616 	FLOW_AGG_CLEAR(&fa);                                               \
1617 	prev_csum_ok = false;                                              \
1618 } while (0)
1619 	struct flow_agg fa;             /* states */
1620 	FLOW_AGG_CLEAR(&fa);
1621 
1622 	struct pktq super_pkts;         /* dst super packets */
1623 	struct pktq disposed_pkts;      /* done src packets */
1624 
1625 	KPKTQ_INIT(&super_pkts);
1626 	KPKTQ_INIT(&disposed_pkts);
1627 
1628 	struct __kern_channel_ring *ring;
1629 	ring = fsw_flow_get_rx_ring(fsw, fe);
1630 	if (__improbable(ring == NULL)) {
1631 		SK_ERR("Rx ring is NULL");
1632 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
1633 		    KPKTQ_LEN(rx_pkts));
1634 		pp_drop_pktq(rx_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
1635 		    DROP_REASON_FSW_DST_NXPORT_INVALID, __func__, __LINE__);
1636 		return;
1637 	}
1638 	struct kern_pbufpool *dpp = ring->ckr_pp;
1639 	ASSERT(dpp->pp_max_frags > 1);
1640 
1641 	struct __kern_packet *pkt, *tpkt;
1642 	/* state for super packet */
1643 	struct __kern_packet *__single spkt = NULL;
1644 	kern_packet_t sph = 0;
1645 	kern_buflet_t __single sbuf = NULL;
1646 	bool prev_csum_ok = false, csum_ok, agg_ok;
1647 	uint16_t spkts = 0, bufcnt = 0;
1648 	int err;
1649 
1650 	struct fsw_stats *fsws = &fsw->fsw_stats;
1651 
1652 	/* state for buflet batch alloc */
1653 	uint32_t bh_cnt, bh_cnt_tmp;
1654 	uint64_t buf_arr[MAX_BUFLET_COUNT];
1655 	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
1656 	uint32_t largest_spkt = 0; /* largest aggregated packet size */
1657 	uint32_t agg_bufsize;
1658 	uint8_t iter = 0;
1659 	bool large_buffer = false;
1660 
1661 	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1662 	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(rx_pkts));
1663 
1664 	if (__probable(fe->fe_rx_largest_size != 0 &&
1665 	    NX_FSW_TCP_RX_AGG_ENABLED())) {
1666 		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
1667 		    PP_BUF_SIZE_LARGE(dpp) == 0) {
1668 			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1669 		} else {
1670 			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
1671 			large_buffer = true;
1672 		}
1673 		bh_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
1674 		    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
1675 		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
1676 		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
1677 		bh_cnt_tmp = bh_cnt;
1678 	} else {
1679 		/*
1680 		 * No payload, thus it's all small-sized ACKs/...
1681 		 * OR aggregation is disabled.
1682 		 */
1683 		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1684 		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(rx_pkts), MAX_BUFLET_COUNT);
1685 		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
1686 	}
1687 
1688 	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
1689 	    large_buffer);
1690 	if (__improbable(bh_cnt == 0)) {
1691 		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
1692 		    bh_cnt_tmp, err);
1693 	}
1694 	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1695 	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1696 		if (tpkt != NULL) {
1697 			void *baddr;
1698 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1699 			SK_PREFETCH(baddr, 0);
1700 		}
1701 
1702 		ASSERT(pkt->pkt_qum.qum_pp != dpp);
1703 		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1704 		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1705 		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1706 		ASSERT(!pkt->pkt_flow_ip_is_frag);
1707 		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1708 
1709 		csum_ok = false;
1710 		agg_ok = false;
1711 		/* supports TCP only */
1712 		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1713 		    pkt->pkt_flow_tcp_hlen);
1714 		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1715 		uint16_t data_csum = 0;
1716 
1717 		KPKTQ_REMOVE(rx_pkts, pkt);
1718 		rx_bytes -= pkt->pkt_flow_ulen;
1719 		err = flow_pkt_track(fe, pkt, true);
1720 		if (__improbable(err != 0)) {
1721 			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
1722 			/* if need to trigger RST */
1723 			if (err == ENETRESET) {
1724 				flow_track_abort_tcp(fe, pkt, NULL);
1725 			}
1726 			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1727 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1728 			    DROP_REASON_FSW_FLOW_TRACK_ERR, DROPTAP_FLAG_DIR_IN);
1729 			continue;
1730 		}
1731 
1732 		if (is_mbuf) {          /* compat */
1733 			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
1734 			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
1735 			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
1736 				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1737 			}
1738 		}
1739 
1740 		if (prev_csum_ok && sbuf) {
1741 			ASSERT(fa.fa_spkt == spkt);
1742 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1743 			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1744 			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);
1745 
1746 			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
1747 			    sbuf->buf_dlen >= plen - thlen) {
1748 				/*
1749 				 * No need for a new packet, just
1750 				 * append to curr_m.
1751 				 */
1752 				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
1753 				    is_ipv4, NULL, sbuf, &data_csum, NULL);
1754 
1755 				if (!csum_ok) {
1756 					STATS_INC(fsws,
1757 					    FSW_STATS_RX_AGG_BAD_CSUM);
1758 					SK_ERR("Checksum for aggregation "
1759 					    "is wrong");
1760 					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
1761 					/*
1762 					 * Turns out, checksum is wrong!
1763 					 * Fallback to no-agg mode.
1764 					 */
1765 					agg_ok = false;
1766 				} else {
1767 					flow_agg_merge_hdr(&fa, pkt,
1768 					    data_csum, fsws);
1769 					goto next;
1770 				}
1771 			}
1772 		}
1773 
1774 		/* calculate number of buflets required */
1775 		bh_cnt_tmp = howmany(plen, agg_bufsize);
1776 		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
1777 			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1778 			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
1779 			    plen);
1780 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1781 			    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
1782 			continue;
1783 		}
1784 		if (bh_cnt < bh_cnt_tmp) {
1785 			uint32_t tmp;
1786 
1787 			if (iter != 0) {
1788 				/*
1789 				 * rearrange the array for additional
1790 				 * allocation
1791 				 */
1792 				uint8_t i;
1793 				for (i = 0; i < bh_cnt; i++, iter++) {
1794 					buf_arr[i] = buf_arr[iter];
1795 					buf_arr[iter] = 0;
1796 				}
1797 				iter = 0;
1798 			}
1799 			tmp = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
1800 			    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
1801 			tmp = MIN(tmp, MAX_BUFLET_COUNT);
1802 			tmp = MAX(tmp, bh_cnt_tmp);
1803 			tmp -= bh_cnt;
1804 			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
1805 			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
1806 			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
1807 			    &tmp, SKMEM_NOSLEEP, large_buffer);
1808 			bh_cnt += tmp;
1809 			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
1810 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1811 				SK_ERR("buflet alloc failed (err %d)", err);
1812 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1813 				    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
1814 				continue;
1815 			}
1816 		}
1817 		/* Use pre-allocated buflets */
1818 		ASSERT(bh_cnt >= bh_cnt_tmp);
1819 		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
1820 		while (bh_cnt_tmp-- > 0) {
1821 			/*
1822 			 * -fbounds-safety: buf_arr[iter] is a uint64_t, so
1823 			 * forging it
1824 			 */
1825 			dbuf_array.dba_buflet[bh_cnt_tmp] =
1826 			    __unsafe_forge_single(kern_buflet_t, buf_arr[iter]);
1827 			buf_arr[iter] = 0;
1828 			bh_cnt--;
1829 			iter++;
1830 		}
1831 		/* copy and checksum TCP data */
1832 		if (agg_ok) {
1833 			int added = 0;
1834 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1835 			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
1836 			    is_ipv4, NULL, sbuf, &data_csum, &added);
1837 
1838 			if (__improbable(!csum_ok)) {
1839 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1840 				SK_ERR("Checksum for aggregation on new "
1841 				    "mbuf is wrong");
1842 				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
1843 				agg_ok = false;
1844 				/* reset the used buflets */
1845 				uint8_t j;
1846 				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
1847 					VERIFY(kern_buflet_set_data_length(
1848 						    dbuf_array.dba_buflet[j], 0) == 0);
1849 				}
1850 				goto non_agg;
1851 			}
1852 
1853 			/*
1854 			 * There was not enough space in curr_m, thus we must
1855 			 * have added to m->m_data.
1856 			 */
1857 			VERIFY(added > 0);
1858 		} else {
1859 non_agg:
1860 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1861 			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
1862 			    &data_csum, is_ipv4);
1863 			if (__improbable(!csum_ok)) {
1864 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1865 				SK_ERR("%d incorrect csum", __LINE__);
1866 				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
1867 			}
1868 		}
1869 		if (agg_ok) {
1870 			ASSERT(fa.fa_spkt == spkt);
1871 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1872 			/* update current packet header */
1873 			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
1874 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1875 			bufcnt += dbuf_array.dba_num_dbufs;
1876 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1877 			    &sbuf);
1878 		} else {
1879 			/* Finalize the current super packet */
1880 			if (sph != 0) {
1881 				finalize_super_packet(&spkt, &sph, &fa,
1882 				    &largest_spkt, &spkts, bufcnt);
1883 			}
1884 
1885 			/* New super packet */
1886 			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
1887 			if (__improbable(err != 0)) {
1888 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1889 				SK_ERR("packet alloc failed (err %d)", err);
1890 				_free_dbuf_array(dpp, &dbuf_array);
1891 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1892 				    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
1893 				continue;
1894 			}
1895 			spkt = SK_PTR_ADDR_KPKT(sph);
1896 			pkt_copy_metadata(pkt, spkt);
1897 			/* Packet length for super packet starts from L3 */
1898 			spkt->pkt_length = plen;
1899 			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
1900 			spkt->pkt_headroom = 0;
1901 			spkt->pkt_l2_len = 0;
1902 			spkt->pkt_seg_cnt = 1;
1903 
1904 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1905 			bufcnt = dbuf_array.dba_num_dbufs;
1906 			sbuf = kern_packet_get_next_buflet(sph, NULL);
1907 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1908 			    &sbuf);
1909 
1910 			KPKTQ_ENQUEUE(&super_pkts, spkt);
1911 			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
1912 			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1913 			spkt->pkt_policy_id = fe->fe_policy_id;
1914 			spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1915 			spkt->pkt_transport_protocol =
1916 			    fe->fe_transport_protocol;
1917 			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1918 		}
1919 next:
1920 		pkt_agg_log(pkt, kernproc, true);
1921 		prev_csum_ok = csum_ok;
1922 		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1923 	}
1924 
1925 	/* Free unused buflets */
1926 	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
1927 	while (bh_cnt > 0) {
1928 		/* -fbounds-saftey: buf_arr[iter] is a uint64_t, so forging it */
1929 		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
1930 		    buf_arr[iter]));
1931 		buf_arr[iter] = 0;
1932 		bh_cnt--;
1933 		iter++;
1934 	}
1935 	/* Finalize the last super packet */
1936 	if (sph != 0) {
1937 		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
1938 		    &spkts, bufcnt);
1939 	}
1940 	converge_aggregation_size(fe, largest_spkt);
1941 	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
1942 	if (__improbable(is_mbuf)) {
1943 		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
1944 	} else {
1945 		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
1946 	}
1947 	FLOW_STATS_IN_ADD(fe, spackets, spkts);
1948 
1949 	KPKTQ_FINI(rx_pkts);
1950 
1951 	if (KPKTQ_LEN(&super_pkts) > 0) {
1952 		fsw_ring_enqueue_tail_drop(fsw, ring, &super_pkts);
1953 	}
1954 	KPKTQ_FINI(&super_pkts);
1955 
1956 	pp_free_pktq(&disposed_pkts);
1957 }
1958 
1959 /* streamline a smbuf */
1960 static bool
_finalize_smbuf(struct mbuf * smbuf)1961 _finalize_smbuf(struct mbuf *smbuf)
1962 {
1963 	/* the 1st mbuf always contains something, so start with the 2nd one */
1964 	struct mbuf *m_chained = smbuf->m_next;
1965 	struct mbuf *prev_m = smbuf;
1966 	bool freed = false;
1967 
1968 	while (m_chained != NULL) {
1969 		if (m_chained->m_len != 0) {
1970 			prev_m = m_chained;
1971 			m_chained = m_chained->m_next;
1972 			continue;
1973 		}
1974 		prev_m->m_next = m_chained->m_next;
1975 		m_free(m_chained);
1976 		m_chained = prev_m->m_next;
1977 		freed = true;
1978 	}
1979 	return freed;
1980 }
1981 
/*
 * Aggregate a batch of same-flow TCP packets destined for the host (BSD)
 * stack into "super" mbuf chains: consecutive in-sequence segments whose
 * checksums verify are coalesced into one large mbuf packet (headers
 * merged via flow_agg_merge_hdr()), and the resulting chain of super
 * mbufs is enqueued onto host_mq for the caller to hand up the stack.
 * Handles both native packets (copied into freshly allocated mbufs) and
 * compat packets that already carry an attached mbuf (is_mbuf == true).
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, struct mbufq *host_mq,
    uint32_t rx_bytes, bool is_mbuf)
{
/*
 * Drop one source packet and reset the aggregation state, so that the
 * next packet starts a fresh super mbuf rather than merging into a
 * partially-built one.  Mutates the locals drop_packets, drop_bytes,
 * fa and prev_csum_ok of the enclosing function.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {   \
	drop_packets++;                                                   \
	drop_bytes += (_pkt)->pkt_length;                                 \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL;                                                    \
	FLOW_AGG_CLEAR(&fa);                                              \
	prev_csum_ok = false;                                             \
} while (0)
	struct flow_agg fa;             /* aggregation states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct  mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc (native path only) */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf * mhead = NULL;

	/* all packets in the batch share the same L2 header length */
	uint16_t l2len = KPKTQ_FIRST(rx_pkts)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", rx_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 * Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain.
		 * Pick the smallest cluster size that fits both the
		 * largest aggregated packet seen so far and the average
		 * packet size of this batch; fall back to progressively
		 * smaller cluster sizes below if allocation fails.
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(rx_pkts);

			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    rx_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    rx_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    rx_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				/* oversize: use two 16K segments per packet */
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (rx_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
				    MCLBYTES, mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			if (mhead == NULL) {
				/* step down the cluster-size ladder and retry */
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			/* aggregation disabled: allocate per-packet below */
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		/* prefetch the next packet's buffer while working on this one */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *__single m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			/* compat: reuse the mbuf already attached to the pkt */
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* make L2 + L3 + L4 headers contiguous for merging */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
			/* pad so the L3 header is 32-bit aligned after L2 */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			/*
			 * Fast path: previous packet verified OK and we have
			 * a current tail mbuf — try to append in place.
			 */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success
			 * (or this packet doesn't fit a batch buffer), do an
			 * individual non-blocking allocation here.
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					/* presumably 0 lets the allocator pick
					 * as many segments as needed — TODO confirm */
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
			} else {
				/* take one mbuf off the pre-allocated list */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				/* collect the new mbuf's segments as copy targets */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				/* collect the mbuf's segments as copy targets */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room for the L2 header copied below */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			/* propagate Rx checksum metadata onto the mbuf */
			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			if (__improbable((pkt->pkt_link_flags &
			    PKT_LINKF_BCAST) != 0)) {
				m->m_flags |= M_BCAST;
			}
			if (__improbable((pkt->pkt_link_flags &
			    PKT_LINKF_MCAST) != 0)) {
				m->m_flags |= M_MCAST;
			}
			/*
			 *  Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		/* compat path decides aggregation only after csum verification */
		if (__improbable(is_mbuf)) {
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			/* merge this segment into the current super mbuf */
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4  hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			/* demote to a non-header mbuf and link at the tail */
			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* this packet starts a new super mbuf */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m_mtod_current(m), l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) a smbuf only if it pre-allocated >1 segments,
				 * which only happens when mhead_bufsize > M16KCLBYTES
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * If the super packet is an mbuf which can't accommodate
			 * sizeof(struct ip_tcp_mask) or sizeof(struct ip6_tcp_mask)
			 * in a single buffer, then do the aggregation check in slow path.
			 * Note that on Intel platforms, an mbuf without cluster
			 * has only 80 bytes available for data. That means if a
			 * packet contains an Ethernet header, the mbuf won't be
			 * able to fully contain "struct ip_tcp_mask" or
			 * "struct ip6_tcp_mask" data in a single buffer, because
			 * sizeof(struct ip_tcp_mask) and sizeof(struct ip6_tcp_mask)
			 * are all 80 bytes as well.
			 */
			if (__improbable(smbuf->m_len <
			    ((m_mtod_current(smbuf) - (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + MASK_SIZE))) {
				fa.fa_sobj_is_short = true;
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(rx_pkts);

	/* Free any leftover mbufs, true only for native  */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES)
		 * but is not (smbuf_finalized < smbufs), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Enqueue smbufs for caller to process.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		mbufq_enqueue(host_mq, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2534 
2535 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,struct mbufq * host_mq,uint32_t flags)2536 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
2537     struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
2538     uint32_t flags)
2539 {
2540 #pragma unused(flags)
2541 	struct pktq dropped_pkts;
2542 	bool is_mbuf;
2543 
2544 	if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) {
2545 		dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, host_mq, FLOW_PROC_FLAG_FRAGMENTS);
2546 		return;
2547 	}
2548 
2549 	KPKTQ_INIT(&dropped_pkts);
2550 
2551 	if (!dp_flow_rx_route_process(fsw, fe)) {
2552 		SK_ERR("Rx route bad");
2553 		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
2554 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2555 		    KPKTQ_LEN(&dropped_pkts));
2556 		pp_drop_pktq(&dropped_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
2557 		    DROP_REASON_FSW_FLOW_NONVIABLE, __func__, __LINE__);
2558 		return;
2559 	}
2560 
2561 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(rx_pkts)));
2562 
2563 	if (fe->fe_nx_port == FSW_VP_HOST) {
2564 		boolean_t do_rx_agg;
2565 
2566 		/* BSD flow */
2567 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2568 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2569 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2570 		} else {
2571 			do_rx_agg = !dlil_has_ip_filter() &&
2572 			    !dlil_has_if_filter(fsw->fsw_ifp);
2573 		}
2574 		if (__improbable(!do_rx_agg)) {
2575 			fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
2576 			return;
2577 		}
2578 		if (__improbable(pktap_total_tap_count != 0)) {
2579 			fsw_snoop(fsw, fe, rx_pkts, true);
2580 		}
2581 		flow_rx_agg_host(fsw, fe, rx_pkts, host_mq, rx_bytes, is_mbuf);
2582 	} else {
2583 		/* channel flow */
2584 		if (__improbable(pktap_total_tap_count != 0)) {
2585 			fsw_snoop(fsw, fe, rx_pkts, true);
2586 		}
2587 		flow_rx_agg_channel(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
2588 	}
2589 }
2590