xref: /xnu-10002.1.13/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on an aggregated IP datagram's length */
#define MAX_AGG_IP_LEN()        MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT        (32)
/* TCP flags that disqualify a segment from aggregation */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt)       ((_pkt)->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        ((_pkt)->pkt_pflags & PKT_F_TRUNCATED))
/*
 * Wake-packet test; the flag lives in the attached mbuf when the packet
 * carries one, otherwise in the packet's own pflags.
 *
 * Note: was previously written with a bare 'pkt' inside the expansion,
 * which silently captured whatever variable named 'pkt' was in scope at
 * the expansion site instead of the macro argument.
 */
#define PKT_IS_WAKE_PKT(_pkt)   ((PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	                        (!PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_pflags & PKT_F_WAKE_PKT)))
52 
/*
 * Routine used to incrementally fix a 16-bit ones'-complement checksum
 * after a header field changes from 'old' to 'new' (definitions are
 * outside this chunk).
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Incremental checksum fixup (used when the NIC does not do LRO) */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant (used when hardware LRO owns the header checksums) */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 *
 * The anonymous union with __flow_agg_data (5 x 8 = 40 bytes) exists so
 * the whole state can be cleared as machine words; see FLOW_AGG_CLEAR.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
			/* function that fix packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		uint64_t __flow_agg_data[5];
	};
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
#define fa_fix_pkt_sum   __flow_agg._fa_fix_pkt_sum
};
98 
/*
 * Reset a struct flow_agg to its pristine state.  sk_zero_32() clears the
 * first 32 bytes; the function pointer sits at offset 32 (asserted below)
 * and is cleared separately, covering all 40 bytes of the structure.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	_CASSERT(sizeof(struct flow_agg) == 40);                        \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);              \
	sk_zero_32(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
105 
#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask used by the fast-path header comparison: bits set to 1 must
 * match between the super packet's headers and a candidate segment's.
 * Fields that legitimately change per segment (ip_len, ip_id, checksums,
 * th_seq, PSH flag) are masked out.
 */
struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};

static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
157 
/* IPv6 counterpart of ip_tcp_mask; see comment above */
struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};
163 
164 static const struct ip6_tcp_mask ip6_tcp_mask
165 __sk_aligned(16) =
166 {
167 	.ip6_m = {
168 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
169 		/* Not checked; aggregated packet's ip_len is increasing */
170 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
171 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
172 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
173 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
174 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
175 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
176 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
177 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
178 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
179 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
180 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
181 	},
182 	.tcp_m = {
183 		.th_sport = 0xffff,
184 		.th_dport = 0xffff,
185 		.th_seq = 0,
186 		.th_ack = 0xffffffff,
187 		.th_x2 = 0xf,
188 		.th_off = 0xf,
189 		.th_flags = ~TH_PUSH,
190 		.th_win = 0xffff,
191 		.th_sum = 0,
192 		.th_urp = 0xffff,
193 	},
194 	.tcp_option_m = {
195 		/* Max 40 bytes of TCP options */
196 		0xffffffff,
197 		0xffffffff,
198 		0xffffffff,
199 		0,          /* Filling up to MASK_SIZE */
200 		0,          /* Filling up to MASK_SIZE */
201 	},
202 };
203 
204 #if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/* logflags only exists when verbose logging is compiled in */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	/* destination (super) packets may span multiple buflets */
	if (!is_input) {
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
			    pkt->pkt_length, 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}
239 
/* Cheap wrapper: skip the logging call entirely unless verbose is on */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
245 
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/* logflags only exists when verbose logging is compiled in */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m->m_data != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
}
266 
/* Cheap wrapper: skip the logging call entirely unless verbose is on */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
272 
273 SK_LOG_ATTRIBUTE
274 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)275 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
276 {
277 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
278 	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
279 
280 	while (m != NULL) {
281 		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
282 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
283 		    m->m_pkthdr.len);
284 
285 		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
286 		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
287 		    (uint32_t)m->m_pkthdr.csum_rx_val);
288 
289 		m = m->m_nextpkt;
290 	}
291 }
292 
/* Cheap wrapper: skip the logging call entirely unless verbose is on */
#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
#else
/* Logging compiled out: all agg-log macros become no-ops */
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */
303 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies (and, when the driver has not already supplied a full RX
 * checksum, computes) the checksums for a packet whose data is attached
 * as an mbuf.  On return, the full RX checksum is recorded on the skywalk
 * packet so it can later be transferred to the aggregated super packet,
 * and *data_csum receives the 16-bit folded checksum of the TCP payload
 * alone (headers excluded).
 *
 * Returns true when the TCP checksum (driver-provided or computed here)
 * is valid; false otherwise.  When verify_l3 is set, a bad IPv4 header
 * checksum also returns false.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total length on the wire side: L2 + IP + TCP headers + payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* temporarily zero th_sum so it isn't folded into the sum */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		/* csum = pseudo-header + TCP header (payload excluded) */
		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/* subtract header+pseudo sum from th_sum: payload-only sum */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		/* full RX value of 0xffff means the checksum verified */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
414 
/*
 * structure to pass an array of data buffers.
 * The union is discriminated by dba_is_buflet: buflets for skywalk
 * destination packets, mbufs for BSD destinations.
 */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* selects which union member is active */
} _dbuf_array_t;
424 
/*
 * Copy 'plen' bytes of payload starting at offset 'soff' in source packet
 * 'spkt' into the destination buffer array 'dbuf', optionally folding the
 * copied bytes into *partial_sum.  *odd_start carries the odd/even byte
 * phase of the ones'-complement sum across buffer boundaries, so the
 * running checksum stays correct when a copy ends mid-word.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		/* locate write position and remaining space in this buffer */
		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			dbuf_addr = kern_buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}
		tmplen = min(plen, dbuf_lim);
		/* source bytes live either in an attached mbuf or the pkt */
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* account for the bytes just appended to this buffer */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			/* pkthdr length lives on the first mbuf only */
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
483 
484 /*
485  * Copy (fill) and checksum for packet.
486  * spkt: source IP packet.
487  * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
488  * verify_l3: verify IPv4 header checksum.
489  * currm: destination mbuf.
490  * currp: destination skywalk packet.
491  * dbuf: additional destination data buffer(s), used when current destination
492  * packet is out of space.
493  * added: amount of data copied from spkt to the additional buffer.
494  * data_sum: 16-bit folded partial checksum of the copied TCP payload.
495  */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;
	uint32_t curr_trailing;
	char *curr_ptr;
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	/*
	 * Snapshot the current destination (write position, trailing
	 * space, old length) so we can revert if the checksum is bad.
	 */
	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* L4 must be verified here unless hardware already did it in full */
	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new destination length */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restored to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
674 
/*
 * Copy and checksum for packet or packet with mbuf.
 * data_csum is only supported for bsd flows.
 *
 * Copies the entire packet (IP + TCP headers + payload) from 'pkt' into
 * 'dbuf' while computing checksums, records the full RX checksum on the
 * packet metadata, and returns true if the TCP checksum verifies.  On
 * output, *data_csum is the folded checksum of the TCP payload alone.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;

	/* daddr = current write position in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	/* Hardware already verified the full checksum: just copy */
	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* commit the headers copied into the first destination buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	/* add pseudo-header length/protocol contribution */
	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
844 
845 SK_INLINE_ATTRIBUTE
846 static void
flow_agg_init_common(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * pkt)847 flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
848     struct __kern_packet *pkt)
849 {
850 	struct ifnet *ifp;
851 
852 	switch (pkt->pkt_flow_ip_ver) {
853 	case IPVERSION:
854 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
855 			return;
856 		}
857 		break;
858 	case IPV6_VERSION:
859 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
860 			return;
861 		}
862 		break;
863 	default:
864 		VERIFY(0);
865 		/* NOTREACHED */
866 		__builtin_unreachable();
867 	}
868 
869 	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
870 	fa->fa_ulen = pkt->pkt_flow_ulen;
871 	fa->fa_total = pkt->pkt_flow_ip_hlen +
872 	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
873 
874 	ifp = fsw->fsw_ifp;
875 	ASSERT(ifp != NULL);
876 	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
877 		/* in case hardware supports LRO, don't fix checksum in the header */
878 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
879 	} else {
880 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
881 	}
882 }
883 
884 static void
flow_agg_init_smbuf(struct nx_flowswitch * fsw,struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)885 flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
886     struct mbuf *smbuf, struct __kern_packet *pkt)
887 {
888 	FLOW_AGG_CLEAR(fa);
889 
890 	ASSERT(smbuf != NULL);
891 	fa->fa_smbuf = smbuf;
892 
893 	fa->fa_sptr = mtod(smbuf, uint8_t *);
894 	ASSERT(fa->fa_sptr != NULL);
895 
896 	/*
897 	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
898 	 * contents of the flow structure which don't exist in 'smbuf'.
899 	 */
900 	flow_agg_init_common(fsw, fa, pkt);
901 }
902 
903 static void
flow_agg_init_spkt(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)904 flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
905     struct __kern_packet *spkt, struct __kern_packet *pkt)
906 {
907 	FLOW_AGG_CLEAR(fa);
908 
909 	ASSERT(spkt != NULL);
910 	fa->fa_spkt = spkt;
911 	fa->fa_sobj_is_pkt = true;
912 	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
913 
914 	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
915 	ASSERT(fa->fa_sptr != NULL);
916 
917 	/*
918 	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
919 	 * contents of the flow structure which don't exist in 'spkt'.
920 	 */
921 	flow_agg_init_common(fsw, fa, pkt);
922 }
923 
924 SK_INLINE_ATTRIBUTE
925 static bool
ipv4_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)926 ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
927 {
928 	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
929 }
930 
931 SK_INLINE_ATTRIBUTE
932 static bool
ipv6_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)933 ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
934 {
935 	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
936 }
937 
938 SK_INLINE_ATTRIBUTE
939 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)940 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
941     struct fsw_stats *fsws)
942 {
943 	bool match;
944 
945 	ASSERT(fa->fa_sptr != NULL);
946 	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
947 	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
948 
949 	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
950 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
951 		goto slow_path;
952 	}
953 
954 	if (__improbable(fa->fa_sobj_is_short)) {
955 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
956 		goto slow_path;
957 	}
958 
959 	if (__improbable(pkt->pkt_flow_tcp_hlen !=
960 	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
961 		goto slow_path;
962 	}
963 
964 	switch (pkt->pkt_flow_ip_ver) {
965 	case IPVERSION:
966 		match = ipv4_tcp_memcmp(fa->fa_sptr,
967 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
968 		break;
969 	case IPV6_VERSION:
970 		match = ipv6_tcp_memcmp(fa->fa_sptr,
971 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
972 		break;
973 	default:
974 		VERIFY(0);
975 		/* NOTREACHED */
976 		__builtin_unreachable();
977 	}
978 
979 	if (__improbable(!match)) {
980 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
981 		goto slow_path;
982 	}
983 	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
984 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
985 		goto slow_path;
986 	}
987 
988 	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
989 	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
990 	fa->fa_ulen = pkt->pkt_flow_ulen;
991 	return true;
992 
993 slow_path:
994 	return false;
995 }
996 
/*
 * Slow-path aggregation check: a field-by-field comparison of the current
 * super packet's IP and TCP headers against the candidate packet's.
 * Returns true (and advances fa_tcp_seq / fa_ulen) iff the candidate may
 * be appended to the super packet; each rejection bumps a dedicated
 * flowswitch statistic and fires a DTrace probe.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;	/* super packet's L3 header */
	uint32_t sl3tlen = 0;		/* super packet's total L3 length */
	uint16_t sl3hlen = 0;		/* super packet's L3 header length */

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		/* Header lengths (and hence option lengths) must match. */
		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* Any IPv4 options must be byte-identical on both packets. */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		/* ip6_plen excludes the fixed header; add it back for the total. */
		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;

	/*
	 * NOTE(review): the probe name "aggr__fail9" below duplicates the
	 * IPv6 hop-limit probe above — presumably unintentional; confirm
	 * before keying tooling off these probe names.
	 */
	uint16_t sl4hlen = (stcp->th_off << 2);
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* Flags must match modulo PUSH, which is allowed to differ. */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/*
			 * Read the first 4 option bytes of each header,
			 * honoring alignment, and require the canonical
			 * NOP-NOP-TIMESTAMP prefix on both.
			 */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	/* All checks passed: advance expected sequence and remember ulen. */
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1186 
/*
 * Top-level aggregation eligibility check for one candidate packet.
 * Performs the cheap global checks first (flags, payload, sequence
 * continuity, size limit, wake flag), then tries the masked-compare fast
 * path before falling back to the field-by-field slow path.  Returns true
 * iff 'pkt' may be merged into the current super packet; on a fast-path
 * match, pkt_flow_tcp_agg_fast is set so the merge can skip option
 * comparisons later.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* Clear any stale fast-path marker before re-evaluating. */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	/* A wake packet must start its own super packet (case 6 above). */
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1246 
/*
 * Incrementally patch a 16-bit ones'-complement checksum: fold out 'old'
 * and fold in 'new' (see __packet_fix_sum).  Installed as fa_fix_pkt_sum
 * when checksums must be maintained in software.
 */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	return __packet_fix_sum(csum, old, new);
}
1252 
/*
 * Checksum-fixup stub installed as fa_fix_pkt_sum when the interface does
 * hardware LRO (see flow_agg_init_common): the hardware recomputes the
 * checksums, so software fixups are skipped and the field is simply
 * written as 0.
 */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
    uint16_t __unused new)
{
	return 0;
}
1259 
1260 static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg * fa,uint8_t * field,uint16_t * csum,uint32_t new)1261 flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa, uint8_t *field, uint16_t *csum,
1262     uint32_t new)
1263 {
1264 	uint32_t old;
1265 	memcpy(&old, field, sizeof(old));
1266 	memcpy(field, &new, sizeof(uint32_t));
1267 	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1268 	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1269 	    (uint16_t)(old & 0xffff),
1270 	    (uint16_t)(new & 0xffff));
1271 }
1272 
/*
 * Merge the headers of 'pkt' into the current super object (packet or
 * mbuf): grow the IP total length, patch checksums incrementally via
 * fa_fix_pkt_sum, absorb a differing TCP timestamp option, propagate a
 * PUSH flag, and update the super object's length/segment accounting.
 * The payload itself has already been copied by the caller.
 *
 * NOTE(review): 'data_csum' is annotated __unused but is read below when
 * folding in the payload checksum — the annotation looks stale; confirm.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	/* Account the new payload towards the aggregate size limit. */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		/* Grow ip_len and patch ip_sum for the added payload bytes. */
		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		/* IPv6 has no header checksum; just grow the payload length. */
		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			/*
			 * Options differ only in the timestamp values (the
			 * slow path guaranteed the NOP-NOP-TS layout): splice
			 * the new TSval/TSecr in and fix the TCP checksum.
			 */
			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			/* Checksum over the 16-bit word holding th_flags. */
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		/* Saturate rather than wrap the 8-bit segment counter. */
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		/* Saturate rather than wrap the 8-bit segment counter. */
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1442 
1443 /*
1444  * Copy metadata from source packet to destination packet
1445  */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Clone the quantum-level metadata, then the packet-level metadata. */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}
1453 
/*
 * Finalize a packet handle; finalization must succeed here.  On
 * DEVELOPMENT/DEBUG builds also fire a DTrace probe with the L3 start of
 * the finalized packet for inspection.
 */
static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	/* Skip headroom and L2 header to point at the L3 header. */
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1468 
1469 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t min_bufsize,uint32_t agg_bufsize)1470 estimate_buf_cnt(struct flow_entry *fe, uint32_t min_bufsize,
1471     uint32_t agg_bufsize)
1472 {
1473 	uint32_t max_ip_len = MAX_AGG_IP_LEN();
1474 	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1475 	uint32_t hdr_overhead;
1476 
1477 	agg_size = MIN(agg_size, agg_bufsize);
1478 
1479 	hdr_overhead = (fe->fe_rx_pktq_bytes / max_ip_len) *
1480 	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1481 	    sizeof(struct tcphdr));
1482 
1483 	return ((fe->fe_rx_pktq_bytes + hdr_overhead) / agg_size) + 1;
1484 }
1485 
1486 SK_INLINE_ATTRIBUTE
1487 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1488 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1489     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1490 {
1491 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1492 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1493 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1494 		pbuf = buf;
1495 		dbuf_array->dba_buflet[i] = NULL;
1496 	}
1497 	ASSERT(pbuf != NULL);
1498 	dbuf_array->dba_num_dbufs = 0;
1499 	*lbuf = pbuf;
1500 }
1501 
1502 SK_INLINE_ATTRIBUTE
1503 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1504 _free_dbuf_array(struct kern_pbufpool *pp,
1505     _dbuf_array_t *dbuf_array)
1506 {
1507 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1508 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1509 		pp_free_buflet(pp, buf);
1510 		dbuf_array->dba_buflet[i] = NULL;
1511 	}
1512 	dbuf_array->dba_num_dbufs = 0;
1513 }
1514 
1515 static inline void
finalize_super_packet(struct __kern_packet ** spkt,kern_packet_t * sph,struct flow_agg * fa,uint32_t * largest_spkt,uint16_t * spkts,uint16_t bufcnt)1516 finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
1517     struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
1518     uint16_t bufcnt)
1519 {
1520 	(*spkts)++;
1521 	if (bufcnt > 1) {
1522 		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1523 	}
1524 	pkt_finalize(*sph);
1525 	if ((*spkt)->pkt_length > *largest_spkt) {
1526 		*largest_spkt = (*spkt)->pkt_length;
1527 	}
1528 	pkt_agg_log(*spkt, kernproc, false);
1529 	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
1530 	*sph = 0;
1531 	*spkt = NULL;
1532 	FLOW_AGG_CLEAR(fa);
1533 }
1534 
1535 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1536 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1537 {
1538 	if (fe->fe_rx_largest_size > largest_agg_size) {
1539 		/*
1540 		 * Make it slowly move towards largest_agg_size if we
1541 		 * consistently get non-aggregatable size.
1542 		 *
1543 		 * If we start at 16K, this makes us go to 4K within 6 rounds
1544 		 * and down to 2K within 12 rounds.
1545 		 */
1546 		fe->fe_rx_largest_size -=
1547 		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1548 	} else {
1549 		fe->fe_rx_largest_size +=
1550 		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1551 	}
1552 }
1553 
1554 SK_NO_INLINE_ATTRIBUTE
1555 static void
flow_rx_agg_channel(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * dropped_pkts,bool is_mbuf)1556 flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1557     struct pktq *dropped_pkts, bool is_mbuf)
1558 {
1559 #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt)    do {   \
1560 	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
1561 	(_pkt) = NULL;                                   \
1562 	FLOW_AGG_CLEAR(&fa);                             \
1563 	prev_csum_ok = false;                            \
1564 } while (0)
1565 	struct flow_agg fa;             /* states */
1566 	FLOW_AGG_CLEAR(&fa);
1567 
1568 	struct pktq pkts;               /* dst super packets */
1569 	struct pktq disposed_pkts;      /* done src packets */
1570 
1571 	KPKTQ_INIT(&pkts);
1572 	KPKTQ_INIT(&disposed_pkts);
1573 
1574 	struct __kern_channel_ring *ring;
1575 	ring = fsw_flow_get_rx_ring(fsw, fe);
1576 	if (__improbable(ring == NULL)) {
1577 		SK_ERR("Rx ring is NULL");
1578 		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
1579 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
1580 		    KPKTQ_LEN(dropped_pkts));
1581 		return;
1582 	}
1583 	struct kern_pbufpool *dpp = ring->ckr_pp;
1584 	ASSERT(dpp->pp_max_frags > 1);
1585 
1586 	struct __kern_packet *pkt, *tpkt;
1587 	/* state for super packet */
1588 	struct __kern_packet *spkt = NULL;
1589 	kern_packet_t sph = 0;
1590 	kern_buflet_t sbuf = NULL;
1591 	bool prev_csum_ok = false, csum_ok, agg_ok;
1592 	uint16_t spkts = 0, bufcnt = 0;
1593 	int err;
1594 
1595 	struct fsw_stats *fsws = &fsw->fsw_stats;
1596 
1597 	/* state for buflet batch alloc */
1598 	uint32_t bh_cnt, bh_cnt_tmp;
1599 	uint64_t buf_arr[MAX_BUFLET_COUNT];
1600 	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
1601 	uint32_t largest_spkt = 0; /* largest aggregated packet size */
1602 	uint32_t agg_bufsize;
1603 	uint8_t iter = 0;
1604 	bool large_buffer = false;
1605 
1606 	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1607 	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));
1608 
1609 	if (__probable(fe->fe_rx_largest_size != 0 &&
1610 	    NX_FSW_TCP_RX_AGG_ENABLED())) {
1611 		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
1612 		    PP_BUF_SIZE_LARGE(dpp) == 0) {
1613 			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1614 		} else {
1615 			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
1616 			large_buffer = true;
1617 		}
1618 		bh_cnt = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1619 		    agg_bufsize);
1620 		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
1621 		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
1622 		bh_cnt_tmp = bh_cnt;
1623 	} else {
1624 		/*
1625 		 * No payload, thus it's all small-sized ACKs/...
1626 		 * OR aggregation is disabled.
1627 		 */
1628 		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1629 		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(&fe->fe_rx_pktq), MAX_BUFLET_COUNT);
1630 		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
1631 	}
1632 
1633 	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
1634 	    large_buffer);
1635 	if (__improbable(bh_cnt == 0)) {
1636 		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
1637 		    bh_cnt_tmp, err);
1638 	}
1639 	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1640 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1641 		if (tpkt != NULL) {
1642 			void *baddr;
1643 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1644 			SK_PREFETCH(baddr, 0);
1645 		}
1646 
1647 		ASSERT(pkt->pkt_qum.qum_pp != dpp);
1648 		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1649 		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1650 		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1651 		ASSERT(!pkt->pkt_flow_ip_is_frag);
1652 		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1653 
1654 		csum_ok = false;
1655 		agg_ok = false;
1656 		/* supports TCP only */
1657 		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1658 		    pkt->pkt_flow_tcp_hlen);
1659 		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1660 		uint16_t data_csum = 0;
1661 
1662 		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1663 		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1664 		err = flow_pkt_track(fe, pkt, true);
1665 		if (__improbable(err != 0)) {
1666 			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
1667 			/* if need to trigger RST */
1668 			if (err == ENETRESET) {
1669 				flow_track_abort_tcp(fe, pkt, NULL);
1670 			}
1671 			SK_ERR("flow_pkt_track failed (err %d)", err);
1672 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1673 			continue;
1674 		}
1675 
1676 		if (is_mbuf) {          /* compat */
1677 			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
1678 			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
1679 			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
1680 				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1681 			}
1682 		}
1683 
1684 		if (prev_csum_ok && sbuf) {
1685 			ASSERT(fa.fa_spkt == spkt);
1686 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1687 			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1688 			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);
1689 
1690 			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
1691 			    sbuf->buf_dlen >= plen - thlen) {
1692 				/*
1693 				 * No need for a new packet, just
1694 				 * append to curr_m.
1695 				 */
1696 				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
1697 				    is_ipv4, NULL, sbuf, &data_csum, NULL);
1698 
1699 				if (!csum_ok) {
1700 					STATS_INC(fsws,
1701 					    FSW_STATS_RX_AGG_BAD_CSUM);
1702 					SK_ERR("Checksum for aggregation "
1703 					    "is wrong");
1704 					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
1705 					/*
1706 					 * Turns out, checksum is wrong!
1707 					 * Fallback to no-agg mode.
1708 					 */
1709 					agg_ok = false;
1710 				} else {
1711 					flow_agg_merge_hdr(&fa, pkt,
1712 					    data_csum, fsws);
1713 					goto next;
1714 				}
1715 			}
1716 		}
1717 
1718 		/* calculate number of buflets required */
1719 		bh_cnt_tmp = howmany(plen, agg_bufsize);
1720 		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
1721 			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1722 			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
1723 			    plen);
1724 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1725 			continue;
1726 		}
1727 		if (bh_cnt < bh_cnt_tmp) {
1728 			uint32_t tmp;
1729 
1730 			if (iter != 0) {
1731 				/*
1732 				 * rearrange the array for additional
1733 				 * allocation
1734 				 */
1735 				uint8_t i;
1736 				for (i = 0; i < bh_cnt; i++, iter++) {
1737 					buf_arr[i] = buf_arr[iter];
1738 					buf_arr[iter] = 0;
1739 				}
1740 				iter = 0;
1741 			}
1742 			tmp = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1743 			    agg_bufsize);
1744 			tmp = MIN(tmp, MAX_BUFLET_COUNT);
1745 			tmp = MAX(tmp, bh_cnt_tmp);
1746 			tmp -= bh_cnt;
1747 			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
1748 			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
1749 			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
1750 			    &tmp, SKMEM_NOSLEEP, large_buffer);
1751 			bh_cnt += tmp;
1752 			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
1753 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1754 				SK_ERR("buflet alloc failed (err %d)", err);
1755 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1756 				continue;
1757 			}
1758 		}
1759 		/* Use pre-allocated buflets */
1760 		ASSERT(bh_cnt >= bh_cnt_tmp);
1761 		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
1762 		while (bh_cnt_tmp-- > 0) {
1763 			dbuf_array.dba_buflet[bh_cnt_tmp] =
1764 			    (kern_buflet_t)(buf_arr[iter]);
1765 			buf_arr[iter] = 0;
1766 			bh_cnt--;
1767 			iter++;
1768 		}
1769 		/* copy and checksum TCP data */
1770 		if (agg_ok) {
1771 			int added = 0;
1772 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1773 			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
1774 			    is_ipv4, NULL, sbuf, &data_csum, &added);
1775 
1776 			if (__improbable(!csum_ok)) {
1777 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1778 				SK_ERR("Checksum for aggregation on new "
1779 				    "mbuf is wrong");
1780 				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
1781 				agg_ok = false;
1782 				/* reset the used buflets */
1783 				uint8_t j;
1784 				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
1785 					VERIFY(kern_buflet_set_data_length(
1786 						    dbuf_array.dba_buflet[j], 0) == 0);
1787 				}
1788 				goto non_agg;
1789 			}
1790 
1791 			/*
1792 			 * There was not enough space in curr_m, thus we must
1793 			 * have added to m->m_data.
1794 			 */
1795 			VERIFY(added > 0);
1796 		} else {
1797 non_agg:
1798 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1799 			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
1800 			    &data_csum, is_ipv4);
1801 			if (__improbable(!csum_ok)) {
1802 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1803 				SK_ERR("%d incorrect csum", __LINE__);
1804 				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
1805 			}
1806 		}
1807 		if (agg_ok) {
1808 			ASSERT(fa.fa_spkt == spkt);
1809 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1810 			/* update current packet header */
1811 			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
1812 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1813 			bufcnt += dbuf_array.dba_num_dbufs;
1814 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1815 			    &sbuf);
1816 		} else {
1817 			/* Finalize the current super packet */
1818 			if (sph != 0) {
1819 				finalize_super_packet(&spkt, &sph, &fa,
1820 				    &largest_spkt, &spkts, bufcnt);
1821 			}
1822 
1823 			/* New super packet */
1824 			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
1825 			if (__improbable(err != 0)) {
1826 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1827 				SK_ERR("packet alloc failed (err %d)", err);
1828 				_free_dbuf_array(dpp, &dbuf_array);
1829 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1830 				continue;
1831 			}
1832 			spkt = SK_PTR_ADDR_KPKT(sph);
1833 			pkt_copy_metadata(pkt, spkt);
1834 			/* Packet length for super packet starts from L3 */
1835 			spkt->pkt_length = plen;
1836 			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
1837 			spkt->pkt_headroom = 0;
1838 			spkt->pkt_l2_len = 0;
1839 			spkt->pkt_seg_cnt = 1;
1840 
1841 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1842 			bufcnt = dbuf_array.dba_num_dbufs;
1843 			sbuf = kern_packet_get_next_buflet(sph, NULL);
1844 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1845 			    &sbuf);
1846 
1847 			KPKTQ_ENQUEUE(&pkts, spkt);
1848 			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
1849 			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1850 			spkt->pkt_policy_id = fe->fe_policy_id;
1851 			spkt->pkt_transport_protocol =
1852 			    fe->fe_transport_protocol;
1853 			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1854 		}
1855 next:
1856 		pkt_agg_log(pkt, kernproc, true);
1857 		prev_csum_ok = csum_ok;
1858 		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1859 	}
1860 
1861 	/* Free unused buflets */
1862 	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
1863 	while (bh_cnt > 0) {
1864 		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
1865 		buf_arr[iter] = 0;
1866 		bh_cnt--;
1867 		iter++;
1868 	}
1869 	/* Finalize the last super packet */
1870 	if (sph != 0) {
1871 		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
1872 		    &spkts, bufcnt);
1873 	}
1874 	converge_aggregation_size(fe, largest_spkt);
1875 	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
1876 	if (__improbable(is_mbuf)) {
1877 		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
1878 	} else {
1879 		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
1880 	}
1881 	FLOW_STATS_IN_ADD(fe, spackets, spkts);
1882 
1883 	KPKTQ_FINI(&fe->fe_rx_pktq);
1884 	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
1885 	KPKTQ_FINI(&pkts);
1886 
1887 	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);
1888 
1889 	pp_free_pktq(&disposed_pkts);
1890 }
1891 
1892 /* streamline a smbuf */
1893 static bool
_finalize_smbuf(struct mbuf * smbuf)1894 _finalize_smbuf(struct mbuf *smbuf)
1895 {
1896 	/* the 1st mbuf always contains something, so start with the 2nd one */
1897 	struct mbuf *m_chained = smbuf->m_next;
1898 	struct mbuf *prev_m = smbuf;
1899 	bool freed = false;
1900 
1901 	while (m_chained != NULL) {
1902 		if (m_chained->m_len != 0) {
1903 			prev_m = m_chained;
1904 			m_chained = m_chained->m_next;
1905 			continue;
1906 		}
1907 		prev_m->m_next = m_chained->m_next;
1908 		m_free(m_chained);
1909 		m_chained = prev_m->m_next;
1910 		freed = true;
1911 	}
1912 	return freed;
1913 }
1914 
/*
 * Aggregate the TCP packets queued on the flow entry's Rx queue into
 * super-mbuf chains destined for the host (BSD) stack.  Payload of
 * consecutive packets that pass flow_agg_is_ok() is coalesced behind a
 * single set of headers; packets that cannot be aggregated start a new
 * super mbuf linked via m_nextpkt.  Source packets are consumed
 * (queued on disposed_pkts and freed at the end); allocation/pullup
 * failures are counted and queued on dropped_pkts by the drop macro.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/*
 * Drop helper: accounts the drop, hands the packet to dropped_pkts and
 * resets the aggregation state so the next packet starts a fresh smbuf.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt)    do {   \
	drop_packets++;                                  \
	drop_bytes += (_pkt)->pkt_length;                \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
	(_pkt) = NULL;                                   \
	FLOW_AGG_CLEAR(&fa);                             \
	prev_csum_ok = false;                            \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct  mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf * mhead = NULL;

	/* all packets in the queue share the same L2 length (asserted below) */
	uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 *  Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(&fe->fe_rx_pktq);

			/*
			 * Pick the smallest cluster size that fits both the
			 * largest aggregated packet seen so far and the mean
			 * payload per queued packet.
			 */
			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    fe->fe_rx_pktq_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    fe->fe_rx_pktq_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    fe->fe_rx_pktq_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (fe->fe_rx_pktq_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, MCLBYTES,
				    mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			/*
			 * On failure, retry down the cluster-size ladder:
			 * 2x16K -> 16K -> 4K -> 2K; gives up after MCLBYTES.
			 */
			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* warm the cache with the next packet's buffer */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			/* compat path: reuse the mbuf attached to the pkt */
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
			/* pad so the L3 header lands 32-bit aligned */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			/*
			 * Fast path: if the previous packet checksummed OK
			 * and this one is aggregatable, try to append the
			 * payload directly into curr_m's trailing space,
			 * avoiding a new mbuf entirely.
			 */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try blocking allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
			} else {
				/* pop one mbuf off the pre-allocated list */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				/* expose m's segments as a dbuf array */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room for the L2 header copied below */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 *  Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		if (__improbable(is_mbuf)) {
			/* make L2+L3+L4 headers contiguous for header merge */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* compat path decides aggregatability only now */
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			/* append m's payload to the current super mbuf */
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4  hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* start a new super mbuf with m as its head */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m->m_data, l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) a smbuf only if it pre-allocated >1 segments,
				 * which only happens when mhead_bufsize > M16KCLBYTES
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * if the super packet is an mbuf which can't accommodate
			 * sizeof(struct ip6_tcp_mask) in a single buffer then
			 * do the aggregation check in slow path.
			 * Note that an mbuf without cluster has only 80 bytes
			 * available for data, sizeof(struct ip6_tcp_mask) is
			 * also 80 bytes, so if the packet contains an
			 * ethernet header, this mbuf won't be able to fully
			 * contain "struct ip6_tcp_mask" data in a single
			 * buffer.
			 */
			if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
				if (__improbable(smbuf->m_len <
				    ((smbuf->m_data -
				    (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
				    MASK_SIZE))) {
					fa.fa_sobj_is_short = true;
				}
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);

	/* Free any leftover mbufs, true only for native  */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES)
		 * but is not (smbuf_finalized < smbufs), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2459 
2460 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe,uint32_t flags)2461 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
2462     uint32_t flags)
2463 {
2464 #pragma unused(flags)
2465 	struct pktq dropped_pkts;
2466 	bool is_mbuf;
2467 
2468 	if (__improbable(fe->fe_rx_frag_count > 0)) {
2469 		dp_flow_rx_process(fsw, fe, 0);
2470 		return;
2471 	}
2472 
2473 	KPKTQ_INIT(&dropped_pkts);
2474 
2475 	if (!dp_flow_rx_route_process(fsw, fe)) {
2476 		SK_ERR("Rx route bad");
2477 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2478 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2479 		    KPKTQ_LEN(&dropped_pkts));
2480 		goto done;
2481 	}
2482 
2483 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2484 
2485 	if (fe->fe_nx_port == FSW_VP_HOST) {
2486 		boolean_t do_rx_agg;
2487 
2488 		/* BSD flow */
2489 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2490 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2491 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2492 		} else {
2493 			do_rx_agg = !dlil_has_ip_filter() &&
2494 			    !dlil_has_if_filter(fsw->fsw_ifp);
2495 		}
2496 		if (__improbable(!do_rx_agg)) {
2497 			fsw_host_rx(fsw, &fe->fe_rx_pktq);
2498 			return;
2499 		}
2500 		if (__improbable(pktap_total_tap_count != 0)) {
2501 			fsw_snoop(fsw, fe, true);
2502 		}
2503 		flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2504 	} else {
2505 		/* channel flow */
2506 		if (__improbable(pktap_total_tap_count != 0)) {
2507 			fsw_snoop(fsw, fe, true);
2508 		}
2509 		flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2510 	}
2511 
2512 done:
2513 	pp_free_pktq(&dropped_pkts);
2514 }
2515