xref: /xnu-8020.121.3/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2019-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on destination buffers handled per aggregation pass */
#define MAX_BUFLET_COUNT        (64)
/* TCP flags that disqualify a segment from aggregation */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* Packet is backed by an attached mbuf (compat path) */
#define PKT_IS_MBUF(_pkt)       (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
/* Packet is mbuf-backed and only partially copied into the buflet */
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        (_pkt->pkt_pflags & PKT_F_TRUNCATED))
46 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation states.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
		} __flow_agg;
		/* raw overlay; lets FLOW_AGG_CLEAR() zero the struct at once */
		uint64_t __flow_agg_data[4];
	};
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
};

/*
 * Reset all aggregation state.  The _CASSERT pins the struct at exactly
 * 32 bytes so a single sk_zero_32() clears it.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                        \
	_CASSERT(sizeof(struct flow_agg) == 32);                        \
	sk_zero_32(_fa);                                                \
} while (0)
86 
#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask layout for the IPv4+TCP fast-path header comparison:
 * 20-byte IPv4 header, 20-byte TCP header, up to 40 bytes of options.
 */
struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
94 
/*
 * Mask applied when comparing a candidate packet's IPv4+TCP header
 * against the current super packet's header (see can_agg_fastpath()).
 * All-ones fields must match exactly; zero fields are ignored.
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,        /* checksum differs per packet; ignored */
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,        /* sequence # checked separately */
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,   /* PSH may differ between segments */
		.th_win = 0xffff,
		.th_sum = 0,        /* checksum differs per packet; ignored */
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
138 
/*
 * Byte mask layout for the IPv6+TCP fast-path header comparison:
 * 40-byte IPv6 header, 20-byte TCP header, 20 bytes of option mask.
 */
struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;
	struct tcphdr   tcp_m;
	/* 5 x 32-bit words (20 bytes) to fill up to MASK_SIZE */
	uint32_t        tcp_option_m[5];
};
144 
145 static const struct ip6_tcp_mask ip6_tcp_mask
146 __sk_aligned(16) =
147 {
148 	.ip6_m = {
149 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
150 		/* Not checked; aggregated packet's ip_len is increasing */
151 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
152 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
153 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
154 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
155 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
156 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
157 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
158 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
159 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
160 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
161 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
162 	},
163 	.tcp_m = {
164 		.th_sport = 0xffff,
165 		.th_dport = 0xffff,
166 		.th_seq = 0,
167 		.th_ack = 0xffffffff,
168 		.th_x2 = 0xf,
169 		.th_off = 0xf,
170 		.th_flags = ~TH_PUSH,
171 		.th_win = 0xffff,
172 		.th_sum = 0,
173 		.th_urp = 0xffff,
174 	},
175 	.tcp_option_m = {
176 		/* Max 40 bytes of TCP options */
177 		0xffffffff,
178 		0xffffffff,
179 		0xffffffff,
180 		0,          /* Filling up to MASK_SIZE */
181 		0,          /* Filling up to MASK_SIZE */
182 	},
183 };
184 
185 
186 #if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/* Verbose-log a packet's length and RX checksum metadata; for
	 * destination (super) packets, also hex-dump each buflet. */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	/* only destination packets may span multiple buflets here */
	if (!is_input) {
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			/*
			 * NOTE(review): the dump length passed is the whole
			 * pkt_length for every buflet rather than the
			 * buflet's own data length — confirm intended.
			 */
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
			    pkt->pkt_length, 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}
221 
/* Cheap wrapper: only call the logger when verbose logging is enabled */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
227 
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/* Verbose-log a destination mbuf's length, RX checksum metadata,
	 * and a dump of its leading data. */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m->m_data != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
}
248 
/* Cheap wrapper: only call the logger when verbose logging is enabled */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
254 
255 SK_LOG_ATTRIBUTE
256 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)257 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
258 {
259 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
260 	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
261 
262 	while (m != NULL) {
263 		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
264 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
265 		    m->m_pkthdr.len);
266 
267 		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
268 		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
269 		    (uint32_t)m->m_pkthdr.csum_rx_val);
270 
271 		m = m->m_nextpkt;
272 	}
273 }
274 
/* Cheap wrapper: only call the logger when verbose logging is enabled */
#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
#else
/* Logging disabled: all aggregation log hooks compile to nothing */
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */
285 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies the IPv4 header checksum of 'm' (when 'verify_l3' is set) and
 * its TCP checksum; 'm' holds the full L2+IP+TCP+payload frame described
 * by 'pkt'.  On return, '*data_csum' holds the 16-bit folded checksum of
 * the TCP payload alone (used to incrementally update the super packet's
 * checksum during aggregation).  Returns true when the checksum(s)
 * verified OK, false otherwise.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total frame length: L2 + IP hdr + TCP hdr + payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;       /* offset of the IP header */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* Temporarily zero th_sum while summing the TCP header */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		/* fold in the pseudo-header (length + protocol, addresses) */
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/*
		 * NOTE(review): __packet_fix_sum() appears to subtract the
		 * header+pseudo-header sum out of th_sum, leaving the
		 * payload-only contribution — confirm against its
		 * definition.
		 */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;
		/* driver-provided full checksum must fold to all-ones */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* bad IP header checksum; stop here */
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
	/* a valid TCP checksum folds to all-ones */
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
386 
/* structure to pass an array of data buffers (buflets or mbufs) */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* true: use dba_buflet[]; false: dba_mbuf[] */
} _dbuf_array_t;
396 
/*
 * Copy 'plen' bytes of 'spkt' starting at offset 'soff' into the buffer
 * array 'dbuf', appending after whatever data the first buffer already
 * holds and spilling into subsequent buffers as each fills.  When
 * 'do_csum' is set, the copied bytes are folded into '*partial_sum';
 * '*odd_start' carries the odd/even byte-parity state across calls so
 * split copies sum correctly.  Buffer lengths are updated as data lands.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint16_t buf_off = 0;   /* write offset within the current buffer */
	uint16_t buflet_dlim;
	uint16_t buflet_dlen;

	ASSERT(plen > 0);
	if (!dbuf->dba_is_buflet) {
		/*
		 * Assumption about a single mbuf is being asserted due to the
		 * reason that the current usage always passes one mbuf and the
		 * routine has not been tested with multiple mbufs.
		 */
		ASSERT(dbuf->dba_num_dbufs == 1);
		ASSERT((mbuf_maxlen(dbuf->dba_mbuf[0]) -
		    dbuf->dba_mbuf[0]->m_len) >= plen);
		buf_off = dbuf->dba_mbuf[0]->m_len;
	} else {
		buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[0]);
		buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[0]);
		ASSERT(buflet_dlen < buflet_dlim);
		buf_off = buflet_dlen;
	}
	while (plen > 0) {
		uint16_t tmplen;
		uint16_t dbuf_lim;      /* room left in current buffer */
		uint8_t *dbuf_addr;     /* write pointer in current buffer */

		if (dbuf->dba_is_buflet) {
			ASSERT(i < dbuf->dba_num_dbufs);
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i])
			    == 0);
			dbuf_addr =
			    kern_buflet_get_data_address(dbuf->dba_buflet[i]);
			dbuf_lim = buflet_dlim - buf_off;
		} else {
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			dbuf_lim = mbuf_maxlen(dbuf->dba_mbuf[i]) - buf_off;
		}
		dbuf_addr += buf_off;
		tmplen = min(plen, dbuf_lim);
		/* source bytes come from the mbuf or the packet buflets */
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* account for the bytes just landed in this buffer */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[i]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;    /* subsequent buffers are written from 0 */
		i++;
	}
	ASSERT(plen == 0);
}
470 
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Copies only the TCP payload (headers stay in spkt), packing it after
 * the existing data in currm/currp and overflowing into dbuf.  Returns
 * true when all required checksums verify; on failure the destination
 * lengths are rolled back to their state on entry.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;    /* saved length, for rollback on failure */
	uint32_t curr_trailing; /* free space in current destination */
	char *curr_ptr;         /* write pointer in current destination */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	/* snapshot the destination state (mbuf or buflet) */
	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* L4 needs verifying unless the driver already provided full csum */
	verify_l4 = ((spkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS) !=
	    PACKET_CSUM_RX_FULL_FLAGS);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;     /* TCP payload length */
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new destination length */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restore to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
662 
/*
 * Copy and checksum for packet or packet with mbuf
 * data_csum is only supported for bsd flows
 *
 * Copies the entire L3+L4 packet (IP hdr + TCP hdr + payload, 'plen'
 * bytes) of 'pkt' into 'dbuf', verifying/computing checksums along the
 * way.  On return '*data_csum' holds the folded checksum of the TCP
 * payload alone.  Returns true when the checksum(s) verified OK.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;         /* write pointer into the first dest buffer */

	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		ASSERT(mbuf_maxlen(dbuf->dba_mbuf[0]) >= plen);
	}

	/* Some compat drivers compute full checksum */
	if (PKT_IS_MBUF(pkt) && ((pkt->pkt_mbuf->m_pkthdr.csum_flags &
	    CSUM_RX_FULL_FLAGS) == CSUM_RX_FULL_FLAGS)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    pkt->pkt_mbuf->m_pkthdr.csum_flags,
		    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		/* hardware full checksum must fold to all-ones */
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;     /* TCP header + payload */

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;  /* source offset includes headroom+L2 */
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* account for the copied headers in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any); skip the sum if L3 already failed */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	/* pseudo-header: L4 length + protocol, then the addresses */
	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transferred to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	/* a valid TCP checksum folds to all-ones */
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
820 
821 SK_INLINE_ATTRIBUTE
822 static void
flow_agg_init_common(struct flow_agg * fa,struct __kern_packet * pkt)823 flow_agg_init_common(struct flow_agg *fa, struct __kern_packet *pkt)
824 {
825 	switch (pkt->pkt_flow_ip_ver) {
826 	case IPVERSION:
827 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
828 			return;
829 		}
830 		break;
831 	case IPV6_VERSION:
832 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
833 			return;
834 		}
835 		break;
836 	default:
837 		VERIFY(0);
838 		/* NOTREACHED */
839 		__builtin_unreachable();
840 	}
841 
842 	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
843 	fa->fa_ulen = pkt->pkt_flow_ulen;
844 	fa->fa_total = pkt->pkt_flow_ip_hlen +
845 	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
846 }
847 
/*
 * Start a new aggregation run with 'smbuf' as the super mbuf; 'pkt'
 * supplies the flow metadata (sequence numbers, lengths).
 */
static void
flow_agg_init_smbuf(struct flow_agg *fa, struct mbuf *smbuf,
    struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(smbuf != NULL);
	fa->fa_smbuf = smbuf;
	/* fa_sobj_is_pkt stays false: the super object is an mbuf */

	fa->fa_sptr = mtod(smbuf, uint8_t *);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
	 * contents of the flow structure which don't exist in 'smbuf'.
	 */
	flow_agg_init_common(fa, pkt);
}
866 
/*
 * Start a new aggregation run with 'spkt' as the super packet; 'pkt'
 * supplies the flow metadata (sequence numbers, lengths).
 */
static void
flow_agg_init_spkt(struct flow_agg *fa, struct __kern_packet *spkt,
    struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(spkt != NULL);
	fa->fa_spkt = spkt;
	fa->fa_sobj_is_pkt = true;
	/* super packet starts at the IP header: no headroom, no L2 */
	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);

	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
	 * contents of the flow structure which don't exist in 'spkt'.
	 */
	flow_agg_init_common(fa, pkt);
}
887 
/*
 * Masked compare of two IPv4+TCP headers under ip_tcp_mask; returns
 * true when every checked field matches.
 */
SK_INLINE_ATTRIBUTE
static bool
ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
{
	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
}
894 
/*
 * Masked compare of two IPv6+TCP headers under ip6_tcp_mask; returns
 * true when every checked field matches.
 */
SK_INLINE_ATTRIBUTE
static bool
ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
{
	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
}
901 
/*
 * Fast-path aggregation check: a masked bulk compare of the candidate
 * packet's IP+TCP header against the super object's header.  Only the
 * common case is handled — full-size headers, TCP header length of
 * sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA, packet at least
 * MASK_SIZE bytes; everything else returns false so the caller falls
 * back to can_agg_slowpath().  On success, advances the expected
 * sequence number and records the candidate's payload length.
 */
SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;

	ASSERT(fa->fa_sptr != NULL);
	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	/* packet too short for the full masked compare */
	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	/* super object headers are not contiguous; can't bulk-compare */
	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		match = ipv4_tcp_memcmp(fa->fa_sptr,
		    (uint8_t *)pkt->pkt_flow_ip_hdr);
		break;
	case IPV6_VERSION:
		match = ipv6_tcp_memcmp(fa->fa_sptr,
		    (uint8_t *)pkt->pkt_flow_ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	/* payload length must equal the previous segment's */
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}
960 
/*
 * Slow-path aggregation check: field-by-field comparison of the stored
 * super-packet IP/TCP headers against the incoming packet.  Used when the
 * fast-path masked compare is inapplicable or fails.  On success, advances
 * the expected sequence number and records the payload length in 'fa'.
 * Returns true if 'pkt' may be coalesced onto the current super packet.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;     /* super packet's L3 header */
	uint32_t sl3tlen = 0;               /* super packet's L3 total length */
	uint16_t sl3hlen = 0;               /* super packet's L3 header length */

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		/* Header lengths (incl. options) must agree. */
		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* Any IPv4 options must match byte-for-byte. */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;

	uint16_t sl4hlen = (stcp->th_off << 2);
	/*
	 * memcmp (not ==) so no byte-order conversion is needed.
	 * NOTE(review): probe name 'aggr__fail9' below duplicates the
	 * ip6_hlim probe above — likely meant to be distinct.
	 */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* Flags must match except PUSH, which is merged during coalescing. */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/*
			 * Read the first 4 option bytes of each header;
			 * bcopy handles the unaligned case.
			 */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* Both must be the canonical NOP,NOP,TS prelude. */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* Advance expected sequence number past this segment. */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1150 
/*
 * Decide whether 'pkt' may be aggregated onto the current super packet
 * tracked in 'fa'.  Applies cheap gating checks first, then the masked
 * fast-path compare, finally the field-by-field slow path.  Sets
 * pkt_flow_tcp_agg_fast when the fast path succeeded, so that
 * flow_agg_merge_hdr() can skip the option-merge work later.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* Clear any stale fast-path marker before re-evaluating. */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	/* Wake packets must start a fresh super packet (see case 6). */
	if (__improbable((pkt->pkt_pflags & PKT_F_WAKE_PKT) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	/* Slow path must never leave the fast-path marker set. */
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1210 
/*
 * Merge 'pkt' into the current super packet (kern packet or mbuf) by
 * patching the super packet's IP and TCP headers in place.  'data_csum'
 * is the 16-bit ones'-complement sum of the appended payload bytes, which
 * is folded into the TCP checksum incrementally (RFC 1071 style).
 * The payload itself has already been copied by the caller.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);

	/* Running total of aggregated payload bytes (checked by caller). */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the super packet's IP header:
	 * 1. Grow the IP total length (v4 ip_len / v6 ip6_plen) by the
	 *    payload length of the new packet
	 * 2. Incrementally patch the IPv4 header checksum for the length
	 *    change via __packet_fix_sum(); IPv6 has no header checksum
	 * IP ID and TTL are left untouched; agreement on TTL/TOS/flags was
	 * already enforced by can_agg_fastpath()/can_agg_slowpath().
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* Fold the length delta into the IPv4 header checksum. */
		siph->ip_sum = __packet_fix_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			/*
			 * Options differ (slow path proved it's only the
			 * timestamp values): adopt the new packet's TSval/
			 * TSecr, fixing the TCP checksum incrementally as
			 * each 4-byte word is rewritten.
			 */
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			__packet_fix_hdr_sum(sopt + 4, &stcp->th_sum, ntsval);
			__packet_fix_hdr_sum(sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			/*
			 * Checksum is fixed over the 16-bit word containing
			 * th_flags (the word right after th_ack).
			 */
			uint16_t old, new;
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = __packet_fix_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0, data_csum);
	}

	/* Finally, update length/segment metadata on the super object. */
	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* Saturate (don't wrap) the 8-bit segment count. */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		/* Saturate (don't wrap) the 8-bit segment count. */
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1371 
1372 /*
1373  * Copy metadata from source packet to destination packet
1374  */
1375 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1376 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1377 {
1378 	/* Copy packet metadata */
1379 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1380 	_PKT_COPY(spkt, dpkt);
1381 }
1382 
1383 static void
pkt_finalize(kern_packet_t ph)1384 pkt_finalize(kern_packet_t ph)
1385 {
1386 	int err = __packet_finalize(ph);
1387 	VERIFY(err == 0);
1388 #if (DEVELOPMENT || DEBUG)
1389 	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1390 	uint8_t *buf;
1391 	MD_BUFLET_ADDR_ABS(pkt, buf);
1392 	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
1393 	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
1394 	    uint8_t *, buf);
1395 #endif
1396 }
1397 
1398 SK_INLINE_ATTRIBUTE
1399 static inline uint32_t
_estimate_buflet_cnt(struct flow_entry * fe,struct kern_pbufpool * pp)1400 _estimate_buflet_cnt(struct flow_entry *fe, struct kern_pbufpool *pp)
1401 {
1402 	uint32_t cnt;
1403 
1404 	_CASSERT(MAX_BUFLET_COUNT <= UINT8_MAX);
1405 	cnt = howmany(((fe->fe_rx_pktq_bytes + sizeof(struct ip6_hdr)) +
1406 	    sizeof(struct tcphdr)), pp->pp_buflet_size);
1407 	cnt = MAX(KPKTQ_LEN(&fe->fe_rx_pktq), cnt);
1408 	cnt = MIN(cnt, MAX_BUFLET_COUNT);
1409 	return cnt;
1410 }
1411 
1412 SK_INLINE_ATTRIBUTE
1413 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1414 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1415     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1416 {
1417 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1418 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1419 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1420 		pbuf = buf;
1421 		dbuf_array->dba_buflet[i] = NULL;
1422 	}
1423 	ASSERT(pbuf != NULL);
1424 	dbuf_array->dba_num_dbufs = 0;
1425 	*lbuf = pbuf;
1426 }
1427 
1428 SK_INLINE_ATTRIBUTE
1429 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1430 _free_dbuf_array(struct kern_pbufpool *pp,
1431     _dbuf_array_t *dbuf_array)
1432 {
1433 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1434 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1435 		pp_free_buflet(pp, buf);
1436 		dbuf_array->dba_buflet[i] = NULL;
1437 	}
1438 	dbuf_array->dba_num_dbufs = 0;
1439 }
1440 
/*
 * Aggregate the flow entry's RX queue of TCP packets into larger "super"
 * kern packets and enqueue them onto the flow's RX channel ring.  Source
 * packets that cannot be tracked or copied are moved to 'dropped_pkts';
 * successfully consumed sources go to a local disposal queue and are freed
 * at the end.  'is_mbuf' indicates compat (mbuf-backed) source packets.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/* Drop 'pkt' and reset aggregation state so the next packet starts fresh. */
#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt)    do {   \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
	(_pkt) = NULL;                                   \
	FLOW_AGG_CLEAR(&fa);                             \
	prev_csum_ok = false;                            \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq pkts;               /* dst super packets */
	struct pktq disposed_pkts;      /* done src packets */

	KPKTQ_INIT(&pkts);
	KPKTQ_INIT(&disposed_pkts);

	/* No ring means no destination: drop the entire input queue. */
	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		SK_ERR("Rx ring is NULL");
		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(dropped_pkts));
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	/* Aggregation relies on multi-buflet destination packets. */
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t sbuf = NULL;      /* last buflet of current super pkt */
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint8_t iter = 0;               /* next unconsumed slot in buf_arr */
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));

	/* Pre-allocate buflets for the whole queue; failure is non-fatal. */
	bh_cnt_tmp = bh_cnt = _estimate_buflet_cnt(fe, dpp);
	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* Prefetch the next packet's buffer while we work on this one. */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		/* Run the TCP flow tracker before accepting the segment. */
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}

		if (is_mbuf) {          /* compat */
			/* strip the L2 header from the mbuf before copy */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
		}

		/*
		 * Try to append in place: only when the previous copy
		 * checksummed OK and a super packet tail buflet exists.
		 */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			/* Does the payload fit in the tail buflet's residue? */
			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just
				 * append to curr_m.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, dpp->pp_buflet_size);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}
		/* Top up the buflet cache if it has run low. */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = _estimate_buflet_cnt(fe, dpp);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		/* Stage bh_cnt_tmp buflets (in reverse) into dbuf_array. */
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    (kern_buflet_t)(buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			/* Append using fresh buflets on the same super pkt. */
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						    dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in curr_m, thus we must
			 * have added to m->m_data.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			/* Full copy (headers + payload) into the new buflets. */
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				spkts++;
				if (bufcnt > 1) {
					spkt->pkt_aggr_type =
					    PKT_AGGR_SINGLE_IP;
				}
				pkt_finalize(sph);
				pkt_agg_log(spkt, kernproc, false);
				DTRACE_SKYWALK1(aggr__buflet__count, uint16_t,
				    bufcnt);
				sph = 0;
				spkt = NULL;
				FLOW_AGG_CLEAR(&fa);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			/* Attach the copied data after the metadata buflet. */
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&pkts, spkt);
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			/* Make this super packet the aggregation target. */
			flow_agg_init_spkt(&fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		/* Source packet fully consumed; free it after the loop. */
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	while (bh_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		spkts++;
		if (bufcnt > 1) {
			spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		pkt_finalize(sph);
		pkt_agg_log(spkt, kernproc, false);
		DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
		sph = 0;
		spkt = NULL;
		FLOW_AGG_CLEAR(&fa);
	}
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	/* Replace the input queue with the super packets and ship them. */
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
	KPKTQ_FINI(&pkts);

	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);

	pp_free_pktq(&disposed_pkts);
}
1765 
/*
 * Aggregate the flow entry's Rx TCP queue (fe->fe_rx_pktq) into large
 * mbuf chains ("super mbufs") and hand them to the host (BSD) stack via
 * fsw_host_sendup().  Payload of consecutive packets that pass
 * flow_agg_is_ok() and checksum verification is packed onto the current
 * super mbuf; a packet that cannot be aggregated starts a new super
 * mbuf, linked to the previous one through m_nextpkt.  Source packets
 * are consumed either way: they end up on disposed_pkts (freed at the
 * end) or on *dropped_pkts (allocation/pullup failures).
 *
 * is_mbuf is true when the source packets carry attached mbufs (compat
 * path) rather than native buffers; in that case the attached mbuf is
 * reused instead of copying into a freshly allocated one.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/*
 * Account a source packet as dropped, park it on dropped_pkts, and
 * reset the aggregation state so the next packet starts a fresh
 * super mbuf.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt)    do {   \
	drop_packets++;                                  \
	drop_bytes += (_pkt)->pkt_length;                \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));             \
	(_pkt) = NULL;                                   \
	FLOW_AGG_CLEAR(&fa);                             \
	prev_csum_ok = false;                            \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	int alloced = 0;                /* mbufs allocated outside the batch */
	int factor;

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct  mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0;            /* count of super mbufs built */
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt;
	uint32_t mhead_bufsize;
	struct mbuf * mhead = NULL;

	/* all packets of the flow share the same l2 header length */
	uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);

	if (__probable(!is_mbuf)) {
		uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);

		/*
		 *  Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_msize != 0 &&
		    max_ip_len > 0)) {
			unsigned int one;
			int wait;

			/*
			 * Size each batch mbuf's cluster by the largest
			 * super mbuf seen on this flow so far.
			 */
			if (fe->fe_rx_largest_msize <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_msize <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES;
			}

try_again:
			if (fe->fe_rx_pktq_bytes != 0) {
				uint32_t aggregation_size =
				    MAX(fe->fe_rx_largest_msize, MCLBYTES);

				aggregation_size =
				    MIN(aggregation_size, mhead_bufsize);

				/*
				 * Estimated header overhead on top of the
				 * queued payload: one worst-case (IP + TCP)
				 * header per max_ip_len-sized super packet.
				 */
				factor = (fe->fe_rx_pktq_bytes / max_ip_len) *
				    (MAX(sizeof(struct ip),
				    sizeof(struct ip6_hdr)) +
				    sizeof(struct tcphdr));

				mhead_cnt = MAX(((fe->fe_rx_pktq_bytes +
				    factor) / aggregation_size) + 1, 1);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
			}

			one = 1;

			/* only block for the smallest cluster size */
			if (mhead_bufsize >= MBIGCLBYTES) {
				wait = M_NOWAIT;
			} else {
				wait = M_WAITOK;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &one, wait, 1, 0);

			/*
			 * On failure, retry the batch with the next
			 * smaller cluster size (16K -> 4K -> 2K).
			 */
			if (mhead == NULL) {
				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* warm the cache with the next packet's buffer */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *m;
		if (__improbable(is_mbuf)) {
			m = pkt->pkt_mbuf;
			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
			/* pad keeps the l3 header 4-byte aligned */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				if (largest_smbuf < (uint32_t)m_pktlen(smbuf)) {
					largest_smbuf =
					    (uint32_t)m_pktlen(smbuf);
				}
			}

			/*
			 * Fast path: previous packet checksummed OK and a
			 * super mbuf is in progress; try to pack this
			 * payload into curr_m's trailing space.
			 */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try blocking allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int one = 1;

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_WAITOK, tot_len,
				    &one, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				alloced++;
			} else {
				/* take the next mbuf off the batch list */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				/*
				 * agg_ok but curr_m had no room: copy into
				 * the new mbuf m while still merging headers
				 * onto the current super mbuf.
				 */
				int added = 0;
				dbuf_array.dba_mbuf[0] = m;
				dbuf_array.dba_num_dbufs = 1;
				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len == m->m_pkthdr.len &&
				    (uint32_t)m->m_len <=
				    (uint32_t)mbuf_maxlen(m));

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_mbuf[0] = m;
				dbuf_array.dba_num_dbufs = 1;
				/* reserve room for the l2 header copy below */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len == m->m_pkthdr.len &&
				    (uint32_t)m->m_len <=
				    (uint32_t)mbuf_maxlen(m));
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 *  Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		if (__improbable(is_mbuf)) {
			/* headers must be contiguous for the agg check */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4  hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			/*
			 * m becomes payload-only storage on the super
			 * mbuf's buffer chain; it is no longer a packet.
			 */
			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			/* append m at the tail of the current super mbuf */
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m->m_data, l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			/* m (now smbuf) becomes the aggregation target */
			flow_agg_init_smbuf(&fa, smbuf, pkt);
			/*
			 * if the super packet is an mbuf which can't accomodate
			 * (sizeof(struct ip6_tcp_mask) in a single buffer then
			 * do the aggregation check in slow path.
			 * Note that an mbuf without cluster has only 80 bytes
			 * available for data, sizeof(struct ip6_tcp_mask) is
			 * also 80 bytes, so if the packet contains an
			 * ethernet header, this mbuf won't be able to fully
			 * contain "struct ip6_tcp_mask" data in a single
			 * buffer.
			 */
			if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
				if (__improbable(smbuf->m_len <
				    ((smbuf->m_data -
				    (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
				    MASK_SIZE))) {
					fa.fa_sobj_is_short = true;
				}
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);

	/* Free any leftover mbufs, true only for native  */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
		mhead_bufsize = 0;
	}

	if (fe->fe_rx_largest_msize > largest_smbuf) {
		/*
		 * Make it slowly move towards smbuf if we consistently get
		 * non-aggregatable size.
		 *
		 * If we start at 16K, this makes us go to 4K within 6 rounds
		 * and down to 2K within 12 rounds.
		 */
		fe->fe_rx_largest_msize -=
		    ((fe->fe_rx_largest_msize - largest_smbuf) >> 2);
	} else {
		/* grow towards the largest super mbuf built this round */
		fe->fe_rx_largest_msize +=
		    ((largest_smbuf - fe->fe_rx_largest_msize) >> 2);
	}

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);
		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) > 0);
	ASSERT((int)(rcvd_packets - drop_packets) > 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2278 
2279 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2280 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2281 {
2282 	struct pktq dropped_pkts;
2283 	bool is_mbuf;
2284 
2285 	if (__improbable(fe->fe_rx_frag_count > 0)) {
2286 		dp_flow_rx_process(fsw, fe);
2287 		return;
2288 	}
2289 
2290 	KPKTQ_INIT(&dropped_pkts);
2291 
2292 	if (!dp_flow_rx_route_process(fsw, fe)) {
2293 		SK_ERR("Rx route bad");
2294 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2295 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2296 		    KPKTQ_LEN(&dropped_pkts));
2297 		goto done;
2298 	}
2299 
2300 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2301 
2302 	if (fe->fe_nx_port == FSW_VP_HOST) {
2303 		boolean_t do_rx_agg;
2304 
2305 		/* BSD flow */
2306 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2307 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2308 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2309 		} else {
2310 			do_rx_agg = !dlil_has_ip_filter() &&
2311 			    !dlil_has_if_filter(fsw->fsw_ifp);
2312 		}
2313 		if (__improbable(!do_rx_agg)) {
2314 			fsw_host_rx(fsw, fe);
2315 			return;
2316 		}
2317 		if (__improbable(pktap_total_tap_count != 0)) {
2318 			fsw_snoop(fsw, fe, true);
2319 		}
2320 		flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2321 	} else {
2322 		/* channel flow */
2323 		if (__improbable(pktap_total_tap_count != 0)) {
2324 			fsw_snoop(fsw, fe, true);
2325 		}
2326 		flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2327 	}
2328 
2329 done:
2330 	pp_free_pktq(&dropped_pkts);
2331 }
2332