xref: /xnu-8792.61.2/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c) !
1 /*
2  * Copyright (c) 2019-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on an aggregated IP datagram (tunable, capped at IP_MAXPACKET) */
#define MAX_AGG_IP_LEN()        MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT        (32)
/* TCP flags that disqualify a segment from aggregation */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt)       ((_pkt)->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        ((_pkt)->pkt_pflags & PKT_F_TRUNCATED))
/*
 * Fixed: this macro previously dereferenced `pkt->pkt_mbuf` instead of
 * its parameter `_pkt`, so it only compiled/behaved correctly when the
 * caller's variable happened to be named `pkt`.  Arguments are now
 * parenthesized for macro hygiene as well.
 */
#define PKT_IS_WAKE_PKT(_pkt)   ((PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	                        (!PKT_IS_MBUF(_pkt) &&                                  \
	                        ((_pkt)->pkt_pflags & PKT_F_WAKE_PKT)))
51 
52 
/*
 * Signature of the helpers used to incrementally adjust a 16-bit
 * one's-complement checksum when a header field changes
 * (csum, old value, new value) -> adjusted csum.
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Incremental checksum fix-up (RFC 1624 style update). */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant, selected when the interface does LRO in hardware. */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 *
 * The named fields are overlaid with five 64-bit words
 * (__flow_agg_data) so the structure can be cleared quickly; see
 * FLOW_AGG_CLEAR, which zeroes the first 32 bytes in one shot and the
 * trailing function pointer separately.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
			/* function that fixes up the packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		uint64_t __flow_agg_data[5];
	};
/* accessor shorthands for the nested union fields */
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
#define fa_fix_pkt_sum   __flow_agg._fa_fix_pkt_sum
};
98 
/*
 * Reset a flow_agg to its initial state.  The structure is exactly
 * 40 bytes: the first 32 are zeroed with the optimized sk_zero_32(),
 * and the 8-byte function pointer at offset 32 is cleared separately.
 * The _CASSERTs pin the layout so a struct change breaks the build,
 * not the clear.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	_CASSERT(sizeof(struct flow_agg) == 40);                        \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);              \
	sk_zero_32(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
105 
#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask laid over an IPv4 + TCP header (plus up to 40 bytes of TCP
 * options).  A set bit means the corresponding header bit must be equal
 * between the super packet and a candidate packet for the fast-path
 * aggregation compare (see ipv4_tcp_memcmp()).
 */
struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
113 
/*
 * IPv4+TCP match mask.  Zero-mask fields (ip_len, ip_id, checksums,
 * th_seq) legitimately differ between consecutive in-order segments of
 * the same flow; th_flags is compared with TH_PUSH ignored.
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
157 
/*
 * Byte mask laid over an IPv6 + TCP header; same role as ip_tcp_mask
 * but compared with the 80-byte masked-compare primitive.
 */
struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};
163 
164 static const struct ip6_tcp_mask ip6_tcp_mask
165 __sk_aligned(16) =
166 {
167 	.ip6_m = {
168 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
169 		/* Not checked; aggregated packet's ip_len is increasing */
170 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
171 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
172 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
173 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
174 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
175 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
176 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
177 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
178 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
179 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
180 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
181 	},
182 	.tcp_m = {
183 		.th_sport = 0xffff,
184 		.th_dport = 0xffff,
185 		.th_seq = 0,
186 		.th_ack = 0xffffffff,
187 		.th_x2 = 0xf,
188 		.th_off = 0xf,
189 		.th_flags = ~TH_PUSH,
190 		.th_win = 0xffff,
191 		.th_sum = 0,
192 		.th_urp = 0xffff,
193 	},
194 	.tcp_option_m = {
195 		/* Max 40 bytes of TCP options */
196 		0xffffffff,
197 		0xffffffff,
198 		0xffffffff,
199 		0,          /* Filling up to MASK_SIZE */
200 		0,          /* Filling up to MASK_SIZE */
201 	},
202 };
203 
204 #if SK_LOG
205 SK_LOG_ATTRIBUTE
206 static void
_pkt_agg_log(struct __kern_packet * pkt,struct proc * p,bool is_input)207 _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
208 {
209 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
210 	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
211 
212 	kern_packet_t ph = SK_PKT2PH(pkt);
213 	uint64_t bufcnt = 1;
214 	if (!is_input) {
215 		bufcnt = kern_packet_get_buflet_count(ph);
216 	}
217 
218 	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
219 	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
220 	    SK_KVA(pkt), pkt->pkt_length);
221 
222 	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
223 	    is_input ? "s":"d", pkt->pkt_csum_flags,
224 	    (uint32_t)pkt->pkt_csum_rx_start_off,
225 	    (uint32_t)pkt->pkt_csum_rx_value);
226 
227 	if (!is_input) {
228 		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);
229 
230 		/* Individual buflets */
231 		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
232 			SK_DF(logflags | SK_VERB_DUMP, "%s",
233 			    sk_dump("buf", kern_buflet_get_data_address(buf),
234 			    pkt->pkt_length, 128, NULL, 0));
235 			buf = kern_packet_get_next_buflet(ph, buf);
236 		}
237 	}
238 }
239 
/* Logging wrapper; evaluates nothing unless verbose logging is enabled. */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
245 
246 SK_LOG_ATTRIBUTE
247 static void
_mbuf_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)248 _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
249 {
250 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
251 	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
252 
253 	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
254 	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
255 	    m->m_pkthdr.len);
256 
257 	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
258 	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
259 	    (uint32_t)m->m_pkthdr.csum_rx_val);
260 
261 	/* Dump the first mbuf */
262 	ASSERT(m->m_data != NULL);
263 	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
264 	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
265 }
266 
/* Logging wrapper; evaluates nothing unless verbose logging is enabled. */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
272 
273 SK_LOG_ATTRIBUTE
274 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)275 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
276 {
277 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
278 	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
279 
280 	while (m != NULL) {
281 		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
282 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
283 		    m->m_pkthdr.len);
284 
285 		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
286 		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
287 		    (uint32_t)m->m_pkthdr.csum_rx_val);
288 
289 		m = m->m_nextpkt;
290 	}
291 }
292 
/* Logging wrapper; evaluates nothing unless verbose logging is enabled. */
#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
#else
/* SK_LOG disabled: logging helpers compile away to nothing. */
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */
303 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies the TCP checksum (and optionally the IPv4 header checksum)
 * of a segment whose data lives in the attached mbuf chain, and
 * computes in *data_csum the 16-bit folded checksum of the TCP payload
 * alone; the aggregation code later uses that to incrementally update
 * the super packet's TCP checksum.
 *
 * Returns true if the checksum(s) verified, false otherwise.  Side
 * effects: updates the mbuf's csum_flags/csum_rx_* and mirrors the
 * full-checksum result into the packet's metadata.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total length: L2 + IP header + TCP header + TCP payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;       /* offset of the IP header */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* Zero th_sum so the header sum excludes it; restored below */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		/* TCP header sum + pseudo-header = everything except payload */
		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/*
		 * Remove the header+pseudo contribution from the wire
		 * checksum, leaving the payload-only checksum.
		 */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		/* an all-ones (0xffff) full checksum means "verified" */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transferred to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	/* a valid TCP checksum folds to all-ones (0xffff) */
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
414 
/* structure to pass an array of data buffers */
typedef struct _dbuf_array {
	union {
		/* the two views are mutually exclusive; see dba_is_buflet */
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* true: buflets, false: mbufs */
} _dbuf_array_t;
424 
/*
 * Append 'plen' bytes of 'spkt' (starting at source offset 'soff') to
 * the destination buffers in 'dbuf', packing data after whatever each
 * destination already holds and updating destination lengths as it goes.
 *
 * When do_csum is set, the copied bytes are folded into *partial_sum;
 * *odd_start carries the odd/even byte phase of the one's-complement
 * sum across buffer boundaries.  The caller must size 'dbuf' so the
 * whole 'plen' fits (asserted via dba_num_dbufs).
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint16_t buf_off = 0;
	uint16_t buflet_dlim;
	uint16_t buflet_dlen;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint16_t tmplen;
		uint16_t dbuf_lim;      /* space remaining in this buffer */
		uint8_t *dbuf_addr;     /* append position in this buffer */

		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			dbuf_addr = kern_buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}
		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* commit the new length of this destination buffer */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			/* packet-header length lives on the first mbuf */
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
486 
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk buflet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_csum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Only the TCP payload is copied (packed after the data already in
 * currm/currp, overflowing into dbuf); the IP/TCP headers are summed in
 * place.  On checksum failure all destination lengths are rolled back
 * and false is returned.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;            /* saved for rollback on csum failure */
	uint32_t curr_trailing;         /* free space in currm/currp */
	char *curr_ptr;                 /* append position in currm/currp */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* skip software L4 verification if hardware already verified it */
	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;     /* TCP payload length */
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new length of the primary destination */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to their previous values.
		 * dbuf (for additional payload) should be restored to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
677 
/*
 * Copy and checksum for packet or packet with mbuf
 * data_csum is only supported for bsd flows
 *
 * Copies the whole packet (IP header + TCP header + payload) from 'pkt'
 * into 'dbuf', verifying the TCP checksum (and optionally the IPv4
 * header checksum) along the way.  *data_csum receives the 16-bit
 * folded checksum of the TCP payload alone.  Returns true if the
 * checksum(s) verified.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;

	/* compute the append position in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		/* an all-ones (0xffff) hardware checksum means "verified" */
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;     /* TCP header + payload */

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* account the copied headers in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	/* pseudo-header; 'len' here is the full TCP length (hdr + payload) */
	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transferred to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
847 
/*
 * Common initialization for a new aggregation run: seeds the expected
 * next TCP sequence number / payload length and picks the checksum
 * fix-up routine.  Returns early (leaving the caller-cleared 'fa' in
 * its reset state) when the packet's IP header carries options, since
 * only fixed-size headers are aggregated.
 */
SK_INLINE_ATTRIBUTE
static void
flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp;

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
			return;
		}
		break;
	case IPV6_VERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			return;
		}
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* expected sequence number of the next aggregatable segment */
	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	fa->fa_total = pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;

	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
		/* in case hardware supports LRO, don't fix checksum in the header */
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
	} else {
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
	}
}
886 
/*
 * Start a new aggregation run whose super object is the mbuf 'smbuf';
 * 'pkt' supplies the flow metadata of the first segment.
 */
static void
flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct mbuf *smbuf, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(smbuf != NULL);
	fa->fa_smbuf = smbuf;   /* fa_sobj_is_pkt stays false (cleared above) */

	fa->fa_sptr = mtod(smbuf, uint8_t *);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
	 * contents of the flow structure which don't exist in 'smbuf'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}
905 
/*
 * Start a new aggregation run whose super object is the packet 'spkt';
 * 'pkt' supplies the flow metadata of the first segment.
 */
static void
flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *spkt, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(spkt != NULL);
	fa->fa_spkt = spkt;
	fa->fa_sobj_is_pkt = true;
	/* super packets carry no L2 framing; IP header is at offset 0 */
	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);

	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
	 * contents of the flow structure which don't exist in 'spkt'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}
926 
927 SK_INLINE_ATTRIBUTE
928 static bool
ipv4_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)929 ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
930 {
931 	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
932 }
933 
934 SK_INLINE_ATTRIBUTE
935 static bool
ipv6_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)936 ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
937 {
938 	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
939 }
940 
941 SK_INLINE_ATTRIBUTE
942 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)943 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
944     struct fsw_stats *fsws)
945 {
946 	bool match;
947 
948 	ASSERT(fa->fa_sptr != NULL);
949 	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
950 	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
951 
952 	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
953 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
954 		goto slow_path;
955 	}
956 
957 	if (__improbable(fa->fa_sobj_is_short)) {
958 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
959 		goto slow_path;
960 	}
961 
962 	if (__improbable(pkt->pkt_flow_tcp_hlen !=
963 	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
964 		goto slow_path;
965 	}
966 
967 	switch (pkt->pkt_flow_ip_ver) {
968 	case IPVERSION:
969 		match = ipv4_tcp_memcmp(fa->fa_sptr,
970 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
971 		break;
972 	case IPV6_VERSION:
973 		match = ipv6_tcp_memcmp(fa->fa_sptr,
974 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
975 		break;
976 	default:
977 		VERIFY(0);
978 		/* NOTREACHED */
979 		__builtin_unreachable();
980 	}
981 
982 	if (__improbable(!match)) {
983 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
984 		goto slow_path;
985 	}
986 	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
987 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
988 		goto slow_path;
989 	}
990 
991 	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
992 	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
993 	fa->fa_ulen = pkt->pkt_flow_ulen;
994 	return true;
995 
996 slow_path:
997 	return false;
998 }
999 
/*
 * Slow-path aggregation check: field-by-field comparison of the super
 * packet's IP/TCP headers against the candidate packet's.  Returns true
 * (and advances fa_tcp_seq/fa_ulen) when the candidate may be merged.
 * Called only after can_agg_fastpath() has declined.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;     /* super packet L3 header */
	uint32_t sl3tlen = 0;               /* super packet total IP length */
	uint16_t sl3hlen = 0;               /* super packet IP header length */

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* IPv4 options (bytes past the fixed header) must be identical */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;

	uint16_t sl4hlen = (stcp->th_off << 2);
	/*
	 * NOTE(review): probe name 'aggr__fail9' below duplicates the
	 * hop-limit probe above — likely meant to be a distinct name.
	 */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* flags other than PUSH must agree exactly */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/* read both option prefixes, honoring misalignment */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* both must carry the NOP,NOP,TS option prefix */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* merge allowed: advance expected sequence number past this payload */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1189 
1190 static bool
flow_agg_is_ok(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)1191 flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
1192     struct fsw_stats *fsws)
1193 {
1194 	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
1195 	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
1196 	bool can_agg = false;
1197 
1198 	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
1199 	    struct __kern_packet *, pkt);
1200 
1201 	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1202 	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
1203 		pkt->pkt_flow_tcp_agg_fast = 0;
1204 	}
1205 	/*
1206 	 * Don't aggregate if any of the following is true:
1207 	 * 1. TCP flag is other than TH_{ACK,PUSH}
1208 	 * 2. Payload length is 0 (pure ACK)
1209 	 * 3. This is the first packet
1210 	 * 4. TCP sequence number is not expected
1211 	 * 5. We would've exceeded the maximum aggregated size
1212 	 * 6. It's not the first packet and the wake flag is set
1213 	 */
1214 	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
1215 	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
1216 		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
1217 		goto done;
1218 	}
1219 	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
1220 		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
1221 		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
1222 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
1223 		goto done;
1224 	}
1225 	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
1226 		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
1227 		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
1228 		/* We've reached aggregation limit */
1229 		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
1230 		goto done;
1231 	}
1232 	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
1233 		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
1234 		goto done;
1235 	}
1236 
1237 	can_agg = can_agg_fastpath(fa, pkt, fsws);
1238 	if (can_agg) {
1239 		pkt->pkt_flow_tcp_agg_fast = 1;
1240 		goto done;
1241 	}
1242 
1243 	can_agg = can_agg_slowpath(fa, pkt, fsws);
1244 	ASSERT(!pkt->pkt_flow_tcp_agg_fast);
1245 
1246 done:
1247 	return can_agg;
1248 }
1249 
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	/* Incrementally patch 'csum' after 'old' was replaced by 'new'. */
	uint16_t fixed = __packet_fix_sum(csum, old, new);

	return fixed;
}
1255 
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new)
{
#pragma unused(csum, old, new)
	/*
	 * Checksum-fixup stub installed when hardware LRO owns the header
	 * checksums: performs no adjustment and always yields 0.
	 */
	return 0;
}
1262 
1263 static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg * fa,uint8_t * field,uint16_t * csum,uint32_t new)1264 flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa, uint8_t *field, uint16_t *csum,
1265     uint32_t new)
1266 {
1267 	uint32_t old;
1268 	memcpy(&old, field, sizeof(old));
1269 	memcpy(field, &new, sizeof(uint32_t));
1270 	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1271 	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1272 	    (uint16_t)(old & 0xffff),
1273 	    (uint16_t)(new & 0xffff));
1274 }
1275 
/*
 * Merge 'pkt' into the current super object (packet or mbuf): grow the
 * IP length, refresh the TCP timestamp option and PUSH flag, and patch
 * the TCP checksum incrementally with the payload checksum 'data_csum'.
 * The payload itself has already been copied by the caller.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	/* account the merged payload in the running aggregate size */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Patch the IPv4 header checksum for the length change (a no-op
	 *    when hardware LRO owns the checksums)
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* incremental fixup for the ip_len growth (RFC 1624 style) */
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			/*
			 * Options differ: per can_agg_slowpath() this can
			 * only be the timestamp values.  Refresh TSval/TSecr
			 * from the new segment and fix the checksum.
			 */
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			/* checksum over the 16-bit word holding th_flags */
			uint16_t old, new;
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	/* finally, update length/segment accounting on the super object */
	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		/* saturate rather than wrap the segment count */
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1445 
1446 /*
1447  * Copy metadata from source packet to destination packet
1448  */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Copy packet metadata: quantum fields first, then packet fields */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}
1456 
static void
pkt_finalize(kern_packet_t ph)
{
	/* Finalize packet metadata; must never fail for aggregated packets. */
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	/* on debug builds, trace the packet and its L3 start address */
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1471 
1472 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t min_bufsize,uint32_t agg_bufsize)1473 estimate_buf_cnt(struct flow_entry *fe, uint32_t min_bufsize,
1474     uint32_t agg_bufsize)
1475 {
1476 	uint32_t max_ip_len = MAX_AGG_IP_LEN();
1477 	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1478 	uint32_t hdr_overhead;
1479 
1480 	agg_size = MIN(agg_size, agg_bufsize);
1481 
1482 	hdr_overhead = (fe->fe_rx_pktq_bytes / max_ip_len) *
1483 	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1484 	    sizeof(struct tcphdr));
1485 
1486 	return ((fe->fe_rx_pktq_bytes + hdr_overhead) / agg_size) + 1;
1487 }
1488 
1489 SK_INLINE_ATTRIBUTE
1490 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1491 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1492     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1493 {
1494 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1495 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1496 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1497 		pbuf = buf;
1498 		dbuf_array->dba_buflet[i] = NULL;
1499 	}
1500 	ASSERT(pbuf != NULL);
1501 	dbuf_array->dba_num_dbufs = 0;
1502 	*lbuf = pbuf;
1503 }
1504 
1505 SK_INLINE_ATTRIBUTE
1506 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1507 _free_dbuf_array(struct kern_pbufpool *pp,
1508     _dbuf_array_t *dbuf_array)
1509 {
1510 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1511 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1512 		pp_free_buflet(pp, buf);
1513 		dbuf_array->dba_buflet[i] = NULL;
1514 	}
1515 	dbuf_array->dba_num_dbufs = 0;
1516 }
1517 
/*
 * Close out the super packet under construction: bump the super-packet
 * count, mark it as an aggregate when it spans multiple buflets,
 * finalize its metadata, and track the largest aggregate seen.  The
 * caller's handles (*sph, *spkt) and the aggregation state 'fa' are
 * cleared so a fresh super packet can be started.
 */
static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	/* remember the largest aggregate for size-convergence heuristics */
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}
1537 
1538 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1539 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1540 {
1541 	if (fe->fe_rx_largest_size > largest_agg_size) {
1542 		/*
1543 		 * Make it slowly move towards largest_agg_size if we
1544 		 * consistently get non-aggregatable size.
1545 		 *
1546 		 * If we start at 16K, this makes us go to 4K within 6 rounds
1547 		 * and down to 2K within 12 rounds.
1548 		 */
1549 		fe->fe_rx_largest_size -=
1550 		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1551 	} else {
1552 		fe->fe_rx_largest_size +=
1553 		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1554 	}
1555 }
1556 
1557 SK_NO_INLINE_ATTRIBUTE
1558 static void
flow_rx_agg_channel(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * dropped_pkts,bool is_mbuf)1559 flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1560     struct pktq *dropped_pkts, bool is_mbuf)
1561 {
1562 #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt)    do {   \
1563 	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
1564 	(_pkt) = NULL;                                   \
1565 	FLOW_AGG_CLEAR(&fa);                             \
1566 	prev_csum_ok = false;                            \
1567 } while (0)
1568 	struct flow_agg fa;             /* states */
1569 	FLOW_AGG_CLEAR(&fa);
1570 
1571 	struct pktq pkts;               /* dst super packets */
1572 	struct pktq disposed_pkts;      /* done src packets */
1573 
1574 	KPKTQ_INIT(&pkts);
1575 	KPKTQ_INIT(&disposed_pkts);
1576 
1577 	struct __kern_channel_ring *ring;
1578 	ring = fsw_flow_get_rx_ring(fsw, fe);
1579 	if (__improbable(ring == NULL)) {
1580 		SK_ERR("Rx ring is NULL");
1581 		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
1582 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
1583 		    KPKTQ_LEN(dropped_pkts));
1584 		return;
1585 	}
1586 	struct kern_pbufpool *dpp = ring->ckr_pp;
1587 	ASSERT(dpp->pp_max_frags > 1);
1588 
1589 	struct __kern_packet *pkt, *tpkt;
1590 	/* state for super packet */
1591 	struct __kern_packet *spkt = NULL;
1592 	kern_packet_t sph = 0;
1593 	kern_buflet_t sbuf = NULL;
1594 	bool prev_csum_ok = false, csum_ok, agg_ok;
1595 	uint16_t spkts = 0, bufcnt = 0;
1596 	int err;
1597 
1598 	struct fsw_stats *fsws = &fsw->fsw_stats;
1599 
1600 	/* state for buflet batch alloc */
1601 	uint32_t bh_cnt, bh_cnt_tmp;
1602 	uint64_t buf_arr[MAX_BUFLET_COUNT];
1603 	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
1604 	uint32_t largest_spkt = 0; /* largest aggregated packet size */
1605 	uint32_t agg_bufsize;
1606 	uint8_t iter = 0;
1607 	uint32_t bft_alloc_flags = PP_ALLOC_BFT_ATTACH_BUFFER;
1608 
1609 	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1610 	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));
1611 
1612 	if (__probable(fe->fe_rx_largest_size != 0 &&
1613 	    NX_FSW_TCP_RX_AGG_ENABLED())) {
1614 		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
1615 		    PP_BUF_SIZE_LARGE(dpp) == 0) {
1616 			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1617 		} else {
1618 			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
1619 			bft_alloc_flags |= PP_ALLOC_BFT_LARGE;
1620 		}
1621 		bh_cnt = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1622 		    agg_bufsize);
1623 		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
1624 		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
1625 		bh_cnt_tmp = bh_cnt;
1626 	} else {
1627 		/*
1628 		 * No payload, thus it's all small-sized ACKs/...
1629 		 * OR aggregation is disabled.
1630 		 */
1631 		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1632 		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(&fe->fe_rx_pktq), MAX_BUFLET_COUNT);
1633 		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
1634 	}
1635 
1636 	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
1637 	    bft_alloc_flags);
1638 	if (__improbable(bh_cnt == 0)) {
1639 		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
1640 		    bh_cnt_tmp, err);
1641 	}
1642 	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1643 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1644 		if (tpkt != NULL) {
1645 			void *baddr;
1646 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1647 			SK_PREFETCH(baddr, 0);
1648 		}
1649 
1650 		ASSERT(pkt->pkt_qum.qum_pp != dpp);
1651 		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1652 		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1653 		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1654 		ASSERT(!pkt->pkt_flow_ip_is_frag);
1655 		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1656 
1657 		csum_ok = false;
1658 		agg_ok = false;
1659 		/* supports TCP only */
1660 		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1661 		    pkt->pkt_flow_tcp_hlen);
1662 		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1663 		uint16_t data_csum = 0;
1664 
1665 		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1666 		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1667 		err = flow_pkt_track(fe, pkt, true);
1668 		if (__improbable(err != 0)) {
1669 			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
1670 			/* if need to trigger RST */
1671 			if (err == ENETRESET) {
1672 				flow_track_abort_tcp(fe, pkt, NULL);
1673 			}
1674 			SK_ERR("flow_pkt_track failed (err %d)", err);
1675 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1676 			continue;
1677 		}
1678 
1679 		if (is_mbuf) {          /* compat */
1680 			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
1681 			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
1682 			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
1683 				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1684 			}
1685 		}
1686 
1687 		if (prev_csum_ok && sbuf) {
1688 			ASSERT(fa.fa_spkt == spkt);
1689 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1690 			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1691 			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);
1692 
1693 			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
1694 			    sbuf->buf_dlen >= plen - thlen) {
1695 				/*
1696 				 * No need for a new packet, just
1697 				 * append to curr_m.
1698 				 */
1699 				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
1700 				    is_ipv4, NULL, sbuf, &data_csum, NULL);
1701 
1702 				if (!csum_ok) {
1703 					STATS_INC(fsws,
1704 					    FSW_STATS_RX_AGG_BAD_CSUM);
1705 					SK_ERR("Checksum for aggregation "
1706 					    "is wrong");
1707 					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
1708 					/*
1709 					 * Turns out, checksum is wrong!
1710 					 * Fallback to no-agg mode.
1711 					 */
1712 					agg_ok = false;
1713 				} else {
1714 					flow_agg_merge_hdr(&fa, pkt,
1715 					    data_csum, fsws);
1716 					goto next;
1717 				}
1718 			}
1719 		}
1720 
1721 		/* calculate number of buflets required */
1722 		bh_cnt_tmp = howmany(plen, agg_bufsize);
1723 		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
1724 			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1725 			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
1726 			    plen);
1727 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1728 			continue;
1729 		}
1730 		if (bh_cnt < bh_cnt_tmp) {
1731 			uint32_t tmp;
1732 
1733 			if (iter != 0) {
1734 				/*
1735 				 * rearrange the array for additional
1736 				 * allocation
1737 				 */
1738 				uint8_t i;
1739 				for (i = 0; i < bh_cnt; i++, iter++) {
1740 					buf_arr[i] = buf_arr[iter];
1741 					buf_arr[iter] = 0;
1742 				}
1743 				iter = 0;
1744 			}
1745 			tmp = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1746 			    agg_bufsize);
1747 			tmp = MIN(tmp, MAX_BUFLET_COUNT);
1748 			tmp = MAX(tmp, bh_cnt_tmp);
1749 			tmp -= bh_cnt;
1750 			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
1751 			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
1752 			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
1753 			    &tmp, SKMEM_NOSLEEP, bft_alloc_flags);
1754 			bh_cnt += tmp;
1755 			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
1756 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1757 				SK_ERR("buflet alloc failed (err %d)", err);
1758 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1759 				continue;
1760 			}
1761 		}
1762 		/* Use pre-allocated buflets */
1763 		ASSERT(bh_cnt >= bh_cnt_tmp);
1764 		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
1765 		while (bh_cnt_tmp-- > 0) {
1766 			dbuf_array.dba_buflet[bh_cnt_tmp] =
1767 			    (kern_buflet_t)(buf_arr[iter]);
1768 			buf_arr[iter] = 0;
1769 			bh_cnt--;
1770 			iter++;
1771 		}
1772 		/* copy and checksum TCP data */
1773 		if (agg_ok) {
1774 			int added = 0;
1775 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1776 			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
1777 			    is_ipv4, NULL, sbuf, &data_csum, &added);
1778 
1779 			if (__improbable(!csum_ok)) {
1780 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1781 				SK_ERR("Checksum for aggregation on new "
1782 				    "mbuf is wrong");
1783 				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
1784 				agg_ok = false;
1785 				/* reset the used buflets */
1786 				uint8_t j;
1787 				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
1788 					VERIFY(kern_buflet_set_data_length(
1789 						    dbuf_array.dba_buflet[j], 0) == 0);
1790 				}
1791 				goto non_agg;
1792 			}
1793 
1794 			/*
1795 			 * There was not enough space in curr_m, thus we must
1796 			 * have added to m->m_data.
1797 			 */
1798 			VERIFY(added > 0);
1799 		} else {
1800 non_agg:
1801 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1802 			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
1803 			    &data_csum, is_ipv4);
1804 			if (__improbable(!csum_ok)) {
1805 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1806 				SK_ERR("%d incorrect csum", __LINE__);
1807 				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
1808 			}
1809 		}
1810 		if (agg_ok) {
1811 			ASSERT(fa.fa_spkt == spkt);
1812 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1813 			/* update current packet header */
1814 			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
1815 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1816 			bufcnt += dbuf_array.dba_num_dbufs;
1817 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1818 			    &sbuf);
1819 		} else {
1820 			/* Finalize the current super packet */
1821 			if (sph != 0) {
1822 				finalize_super_packet(&spkt, &sph, &fa,
1823 				    &largest_spkt, &spkts, bufcnt);
1824 			}
1825 
1826 			/* New super packet */
1827 			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
1828 			if (__improbable(err != 0)) {
1829 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1830 				SK_ERR("packet alloc failed (err %d)", err);
1831 				_free_dbuf_array(dpp, &dbuf_array);
1832 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1833 				continue;
1834 			}
1835 			spkt = SK_PTR_ADDR_KPKT(sph);
1836 			pkt_copy_metadata(pkt, spkt);
1837 			/* Packet length for super packet starts from L3 */
1838 			spkt->pkt_length = plen;
1839 			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
1840 			spkt->pkt_headroom = 0;
1841 			spkt->pkt_l2_len = 0;
1842 			spkt->pkt_seg_cnt = 1;
1843 
1844 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1845 			bufcnt = dbuf_array.dba_num_dbufs;
1846 			sbuf = kern_packet_get_next_buflet(sph, NULL);
1847 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1848 			    &sbuf);
1849 
1850 			KPKTQ_ENQUEUE(&pkts, spkt);
1851 			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
1852 			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1853 			spkt->pkt_policy_id = fe->fe_policy_id;
1854 			spkt->pkt_transport_protocol =
1855 			    fe->fe_transport_protocol;
1856 			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1857 		}
1858 next:
1859 		pkt_agg_log(pkt, kernproc, true);
1860 		prev_csum_ok = csum_ok;
1861 		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1862 	}
1863 
1864 	/* Free unused buflets */
1865 	while (bh_cnt > 0) {
1866 		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
1867 		buf_arr[iter] = 0;
1868 		bh_cnt--;
1869 		iter++;
1870 	}
1871 	/* Finalize the last super packet */
1872 	if (sph != 0) {
1873 		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
1874 		    &spkts, bufcnt);
1875 	}
1876 	converge_aggregation_size(fe, largest_spkt);
1877 	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
1878 	if (__improbable(is_mbuf)) {
1879 		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
1880 	} else {
1881 		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
1882 	}
1883 	FLOW_STATS_IN_ADD(fe, spackets, spkts);
1884 
1885 	KPKTQ_FINI(&fe->fe_rx_pktq);
1886 	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
1887 	KPKTQ_FINI(&pkts);
1888 
1889 	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);
1890 
1891 	pp_free_pktq(&disposed_pkts);
1892 }
1893 
/*
 * flow_rx_agg_host -- aggregate the flow entry's Rx TCP queue into a
 * chain of "super" mbufs and hand the chain to the host (BSD) stack.
 *
 * Walks fe->fe_rx_pktq; a packet whose headers pass flow_agg_is_ok()
 * and whose checksum validates has its payload appended to the current
 * super mbuf (headers merged via flow_agg_merge_hdr()); any other
 * packet starts a new super mbuf on the chain.  The finished chain is
 * passed up with fsw_host_sendup().  Consumed source packets are freed
 * through a local "disposed" queue; packets dropped on allocation or
 * pullup failure are queued on dropped_pkts for the caller to free.
 *
 * fsw          flowswitch instance (stats, fsw_ifp).
 * fe           flow entry whose fe_rx_pktq is consumed.
 * dropped_pkts caller-owned queue receiving dropped source packets.
 * is_mbuf      true when the source packets carry attached mbufs
 *              (compat path) rather than native buffers.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/*
 * Account a dropped source packet, hand it to the caller's drop queue,
 * and reset aggregation state so the next packet starts a fresh chain.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt)    do {   \
	drop_packets++;                                  \
	drop_bytes += (_pkt)->pkt_length;                \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
	(_pkt) = NULL;                                   \
	FLOW_AGG_CLEAR(&fa);                             \
	prev_csum_ok = false;                            \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct  mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0;            /* number of super mbufs on the chain */
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt;
	uint32_t mhead_bufsize;
	struct mbuf * mhead = NULL;

	/* all packets in the queue are expected to share this L2 length */
	uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 *  Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;

			/*
			 * Pick the smallest cluster size that can hold the
			 * largest aggregated packet seen so far on this flow.
			 */
			if (fe->fe_rx_largest_size <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (fe->fe_rx_pktq_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, MCLBYTES,
				    mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			if (mhead == NULL) {
				/*
				 * Batch alloc failed; step down through the
				 * smaller cluster sizes before giving up.
				 */
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			/* aggregation disabled or no size estimate yet */
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		if (tpkt != NULL) {
			/* warm the cache with the next packet's buffer */
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
			/* pad so the L3 header lands 4-byte aligned */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try a direct (non-blocking) allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
			} else {
				/* consume one mbuf from the batch list */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
				if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				/* gather the new mbuf chain's buffers */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room for the L2 header copied below */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 *  Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		if (__improbable(is_mbuf)) {
			/* make L2+L3+L4 headers contiguous for header merge */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4  hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			/* link the payload-only mbuf onto the super mbuf */
			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* cannot aggregate: m becomes a new super mbuf */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m->m_data, l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * if the super packet is an mbuf which can't accomodate
			 * (sizeof(struct ip6_tcp_mask) in a single buffer then
			 * do the aggregation check in slow path.
			 * Note that an mbuf without cluster has only 80 bytes
			 * available for data, sizeof(struct ip6_tcp_mask) is
			 * also 80 bytes, so if the packet contains an
			 * ethernet header, this mbuf won't be able to fully
			 * contain "struct ip6_tcp_mask" data in a single
			 * buffer.
			 */
			if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
				if (__improbable(smbuf->m_len <
				    ((smbuf->m_data -
				    (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
				    MASK_SIZE))) {
					fa.fa_sobj_is_short = true;
				}
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);

	/* Free any leftover mbufs, true only for native  */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
		mhead_bufsize = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);
		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2419 
2420 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2421 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2422 {
2423 	struct pktq dropped_pkts;
2424 	bool is_mbuf;
2425 
2426 	if (__improbable(fe->fe_rx_frag_count > 0)) {
2427 		dp_flow_rx_process(fsw, fe);
2428 		return;
2429 	}
2430 
2431 	KPKTQ_INIT(&dropped_pkts);
2432 
2433 	if (!dp_flow_rx_route_process(fsw, fe)) {
2434 		SK_ERR("Rx route bad");
2435 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2436 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2437 		    KPKTQ_LEN(&dropped_pkts));
2438 		goto done;
2439 	}
2440 
2441 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2442 
2443 	if (fe->fe_nx_port == FSW_VP_HOST) {
2444 		boolean_t do_rx_agg;
2445 
2446 		/* BSD flow */
2447 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2448 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2449 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2450 		} else {
2451 			do_rx_agg = !dlil_has_ip_filter() &&
2452 			    !dlil_has_if_filter(fsw->fsw_ifp);
2453 		}
2454 		if (__improbable(!do_rx_agg)) {
2455 			fsw_host_rx(fsw, &fe->fe_rx_pktq);
2456 			return;
2457 		}
2458 		if (__improbable(pktap_total_tap_count != 0)) {
2459 			fsw_snoop(fsw, fe, true);
2460 		}
2461 		flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2462 	} else {
2463 		/* channel flow */
2464 		if (__improbable(pktap_total_tap_count != 0)) {
2465 			fsw_snoop(fsw, fe, true);
2466 		}
2467 		flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2468 	}
2469 
2470 done:
2471 	pp_free_pktq(&dropped_pkts);
2472 }
2473