xref: /xnu-8019.80.24/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on destination buflets/mbufs handled per super object */
#define MAX_BUFLET_COUNT        (64)
/* TCP flags that presumably disqualify a segment from aggregation — TODO confirm against callers */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* Packet's data is carried by an attached mbuf (compat/netif path) */
#define PKT_IS_MBUF(_pkt)       (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
/* Attached mbuf holds a truncated copy (remainder lives in the packet) */
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        (_pkt->pkt_pflags & PKT_F_TRUNCATED))
46 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation states.
 *
 * It is overlaid with a 4 x 64-bit array so the whole state can be wiped
 * with one 32-byte zeroing (see FLOW_AGG_CLEAR and its size _CASSERT).
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
		} __flow_agg;
		uint64_t __flow_agg_data[4];
	};
/* shorthand accessors for the nested fields above */
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
};
81 
/*
 * Reset a struct flow_agg to all-zeros.  The _CASSERT pins the structure
 * at exactly 32 bytes so the specialized sk_zero_32() helper stays valid.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                        \
	_CASSERT(sizeof(struct flow_agg) == 32);                        \
	sk_zero_32(_fa);                                                \
} while (0)

#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */
88 
/*
 * Byte mask used by the aggregation fast path (can_agg_fastpath) to
 * compare a candidate segment's IPv4+TCP header against the current
 * super packet's header with a single masked memcmp.  Fields whose mask
 * bytes are zero are allowed to differ between segments (lengths,
 * identifiers and checksums change per packet); th_seq is masked out
 * because sequence continuity must be validated separately.
 */
struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};

static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,     /* varies per packet; not compared */
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,    /* varies per packet; not compared */
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,    /* advances per segment; verified separately */
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,   /* PSH may differ between segments */
		.th_win = 0xffff,
		.th_sum = 0,    /* varies per packet; not compared */
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
138 
139 struct ip6_tcp_mask {
140 	struct ip6_hdr  ip6_m;
141 	struct tcphdr   tcp_m;
142 	uint32_t        tcp_option_m[5]; /* 5 bytes to fill up to MASK_SIZE */
143 };
144 
145 static const struct ip6_tcp_mask ip6_tcp_mask
146 __sk_aligned(16) =
147 {
148 	.ip6_m = {
149 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
150 		/* Not checked; aggregated packet's ip_len is increasing */
151 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
152 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
153 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
154 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
155 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
156 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
157 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
158 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
159 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
160 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
161 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
162 	},
163 	.tcp_m = {
164 		.th_sport = 0xffff,
165 		.th_dport = 0xffff,
166 		.th_seq = 0,
167 		.th_ack = 0xffffffff,
168 		.th_x2 = 0xf,
169 		.th_off = 0xf,
170 		.th_flags = ~TH_PUSH,
171 		.th_win = 0xffff,
172 		.th_sum = 0,
173 		.th_urp = 0xffff,
174 	},
175 	.tcp_option_m = {
176 		/* Max 40 bytes of TCP options */
177 		0xffffffff,
178 		0xffffffff,
179 		0xffffffff,
180 		0,          /* Filling up to MASK_SIZE */
181 		0,          /* Filling up to MASK_SIZE */
182 	},
183 };
184 
185 
186 #if SK_LOG
/*
 * Log a source (is_input) or destination (!is_input) packet's length
 * and RX checksum metadata; for destination packets, also hex-dump the
 * contents of each buflet.  Compiled only under SK_LOG.
 */
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	if (!is_input) {
		/* destination super packets may span multiple buflets */
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			/*
			 * NOTE(review): the whole-packet pkt_length is
			 * passed as the dump length for every buflet; the
			 * trailing 128 presumably bounds the dump, but the
			 * per-buflet data length may have been intended —
			 * confirm against sk_dump()'s contract.
			 */
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
			    pkt->pkt_length, 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}
221 
/* Invoke the packet logger only when verbose logging is enabled */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
227 
/*
 * Log a destination mbuf's packet length and RX checksum metadata, and
 * hex-dump the first mbuf in the chain.  Compiled only under SK_LOG.
 */
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m->m_data != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
}

/* Invoke the mbuf logger only when verbose logging is enabled */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
254 
255 SK_LOG_ATTRIBUTE
256 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)257 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
258 {
259 	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
260 	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
261 
262 	while (m != NULL) {
263 		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
264 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
265 		    m->m_pkthdr.len);
266 
267 		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
268 		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
269 		    (uint32_t)m->m_pkthdr.csum_rx_val);
270 
271 		m = m->m_nextpkt;
272 	}
273 }
274 
275 #define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
276 	if (__improbable(sk_verbose != 0)) {                            \
277 	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
278 	}                                                               \
279 } while (0)
280 #else
281 #define pkt_agg_log(...)
282 #define mbuf_agg_log(...)
283 #define mchain_agg_log(...)
284 #endif /* SK_LOG */
285 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies inbound checksums of an mbuf-backed packet and computes the
 * 16-bit one's-complement sum of the TCP payload into *data_csum.
 *
 * pkt:        flow metadata (header lengths, IP addresses) for 'm'.
 * m:          mbuf chain holding L2 + IP + TCP header + payload.
 * verify_l3:  also verify the IPv4 header checksum.
 * data_csum:  out parameter; payload-only checksum for the caller.
 *
 * Returns true when the TCP checksum (and the IP checksum, if
 * verify_l3) verifies.  Side effect: m's csum_flags/csum_rx_* fields
 * are rewritten to describe a full, pseudo-header-included result.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total packet length reconstructed from flow metadata */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	/* L4 length = TCP header + payload */
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;       /* offset of the IP header */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* zero th_sum so the header sum excludes it, restore below */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		/* fold in pseudo-header length + protocol */
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/*
		 * Fold the header + pseudo-header contribution (csum) out
		 * of the packet's own th_sum, leaving the payload-only sum
		 * after complementing.
		 */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		/* a valid IPv4 header sums to all-ones */
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	/* plus pseudo-header length + protocol */
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
	/* a correct TCP checksum folds to all-ones */
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
386 
/*
 * structure to pass an array of data buffers; exactly one of the two
 * union arrays is active, selected by dba_is_buflet.
 */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* true: buflets, false: mbufs */
} _dbuf_array_t;
396 
/*
 * Copy 'plen' bytes of the source packet 'spkt', starting at offset
 * 'soff', into the destination buffer array 'dbuf', appending after any
 * data already present in the first buffer and spilling into subsequent
 * buffers as each fills.  When do_csum is set, the running one's-
 * complement sum is accumulated into *partial_sum (with *odd_start
 * tracking odd-byte alignment across chunks).  Destination buffer
 * lengths are updated as data is committed.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;          /* index of the current destination buffer */
	uint16_t buf_off = 0;   /* append offset within that buffer */
	uint16_t buflet_dlim;
	uint16_t buflet_dlen;

	ASSERT(plen > 0);
	if (!dbuf->dba_is_buflet) {
		/*
		 * Assumption about a single mbuf is being asserted due to the
		 * reason that the current usage always passes one mbuf and the
		 * routine has not been tested with multiple mbufs.
		 */
		ASSERT(dbuf->dba_num_dbufs == 1);
		ASSERT((mbuf_maxlen(dbuf->dba_mbuf[0]) -
		    dbuf->dba_mbuf[0]->m_len) >= plen);
		buf_off = dbuf->dba_mbuf[0]->m_len;
	} else {
		/* start appending after existing data in the first buflet */
		buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[0]);
		buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[0]);
		ASSERT(buflet_dlen < buflet_dlim);
		buf_off = buflet_dlen;
	}
	while (plen > 0) {
		uint16_t tmplen;
		uint16_t dbuf_lim;      /* space left in this buffer */
		uint8_t *dbuf_addr;     /* write position in this buffer */

		if (dbuf->dba_is_buflet) {
			ASSERT(i < dbuf->dba_num_dbufs);
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i])
			    == 0);
			dbuf_addr =
			    kern_buflet_get_data_address(dbuf->dba_buflet[i]);
			/*
			 * NOTE(review): buflet_dlim is taken from buflet[0];
			 * presumably all buflets share the same data limit —
			 * confirm.
			 */
			dbuf_lim = buflet_dlim - buf_off;
		} else {
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			dbuf_lim = mbuf_maxlen(dbuf->dba_mbuf[i]) - buf_off;
		}
		dbuf_addr += buf_off;
		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			/* source bytes live in the attached mbuf */
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			/* source bytes live in the packet's buflets */
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[i]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;    /* subsequent buffers are written from 0 */
		i++;
	}
	ASSERT(plen == 0);
}
470 
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Only the TCP payload is copied (appended to currm/currp, spilling
 * into dbuf); the IP/TCP headers are summed in place for verification.
 * Returns true if the checksums verify and the copy is committed;
 * returns false after restoring all destination lengths otherwise.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;    /* saved for revert on checksum failure */
	uint32_t curr_trailing; /* free space in the current destination */
	char *curr_ptr;         /* append position in the destination */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Reset the checksum flags in source packet */
	spkt->pkt_csum_flags &= ~PACKET_CSUM_RX_FLAGS;

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			/* truncated mbuf: IP header starts at offset 0 */
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;     /* TCP payload length */
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		/* what fits in the current destination buffer */
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
		    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		partial = pkt_sum(SK_PKT2PH(spkt),
		    (soff + spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	/* pseudo-header length + protocol, then source/destination */
	partial += htons(l4len + IPPROTO_TCP);
	if (spkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
		    spkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
		    &spkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));
	__packet_set_inet_checksum(SK_PKT2PH(spkt), spkt->pkt_csum_flags |
	    PACKET_CSUM_DATA_VALID | PACKET_CSUM_PSEUDO_HDR, 0,
	    csum, false);

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restore to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
651 
/*
 * Copy and checksum for packet or packet with mbuf
 * data_csum is only supported for bsd flows
 *
 * Copies the whole L3 packet (IP header + TCP header + payload) from
 * 'pkt' into the destination buffer array 'dbuf' while accumulating
 * checksums in one pass.  *data_csum receives the 16-bit folded sum of
 * the TCP payload only.  Returns true if the TCP checksum (and, when
 * verify_l3, the IPv4 header checksum) verifies; the data has been
 * copied into dbuf either way.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;      /* TCP payload length */
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok = !verify_l3;
	uint8_t *daddr;         /* append position in first dest buffer */

	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		ASSERT(mbuf_maxlen(dbuf->dba_mbuf[0]) >= plen);
	}

	/* Reset the checksum flags in source packet */
	pkt->pkt_csum_flags &= ~PACKET_CSUM_RX_FLAGS;

	/* Some compat drivers compute full checksum */
	if (PKT_IS_MBUF(pkt) && ((pkt->pkt_mbuf->m_pkthdr.csum_flags &
	    CSUM_RX_FULL_FLAGS) == CSUM_RX_FULL_FLAGS)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    pkt->pkt_mbuf->m_pkthdr.csum_flags,
		    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		/* pkt and mbuf flags are same for full csum */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), CSUM_RX_FULL_FLAGS,
		    0, csum, false);
		/* trust the hardware-computed value: all-ones means valid */
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* truncated mbuf: headers start at offset 0 in the mbuf */
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		/*
		 * NOTE(review): dbuf_off is assigned here and in the
		 * branch below but never read afterwards — confirm
		 * whether it is vestigial.
		 */
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* commit the header bytes written to the first buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	/* pseudo-header: L4 length (len) + protocol, then addresses */
	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);
	__packet_set_inet_checksum(SK_PKT2PH(pkt), pkt->pkt_csum_flags |
	    PACKET_CSUM_DATA_VALID | PACKET_CSUM_PSEUDO_HDR, 0,
	    csum, false);
	/* a correct TCP checksum folds to all-ones */
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
808 
809 SK_INLINE_ATTRIBUTE
810 static void
flow_agg_init_common(struct flow_agg * fa,struct __kern_packet * pkt)811 flow_agg_init_common(struct flow_agg *fa, struct __kern_packet *pkt)
812 {
813 	switch (pkt->pkt_flow_ip_ver) {
814 	case IPVERSION:
815 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
816 			return;
817 		}
818 		break;
819 	case IPV6_VERSION:
820 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
821 			return;
822 		}
823 		break;
824 	default:
825 		VERIFY(0);
826 		/* NOTREACHED */
827 		__builtin_unreachable();
828 	}
829 
830 	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
831 	fa->fa_ulen = pkt->pkt_flow_ulen;
832 	fa->fa_total = pkt->pkt_flow_ip_hlen +
833 	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
834 }
835 
836 static void
flow_agg_init_smbuf(struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)837 flow_agg_init_smbuf(struct flow_agg *fa, struct mbuf *smbuf,
838     struct __kern_packet *pkt)
839 {
840 	FLOW_AGG_CLEAR(fa);
841 
842 	ASSERT(smbuf != NULL);
843 	fa->fa_smbuf = smbuf;
844 
845 	fa->fa_sptr = mtod(smbuf, uint8_t *);
846 	ASSERT(fa->fa_sptr != NULL);
847 
848 	/*
849 	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
850 	 * contents of the flow structure which don't exist in 'smbuf'.
851 	 */
852 	flow_agg_init_common(fa, pkt);
853 }
854 
855 static void
flow_agg_init_spkt(struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)856 flow_agg_init_spkt(struct flow_agg *fa, struct __kern_packet *spkt,
857     struct __kern_packet *pkt)
858 {
859 	FLOW_AGG_CLEAR(fa);
860 
861 	ASSERT(spkt != NULL);
862 	fa->fa_spkt = spkt;
863 	fa->fa_sobj_is_pkt = true;
864 	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
865 
866 	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
867 	ASSERT(fa->fa_sptr != NULL);
868 
869 	/*
870 	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
871 	 * contents of the flow structure which don't exist in 'spkt'.
872 	 */
873 	flow_agg_init_common(fa, pkt);
874 }
875 
/* Masked compare of two 64-byte IPv4+TCP headers; true when equal */
SK_INLINE_ATTRIBUTE
static bool
ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
{
	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
}
882 
/* Masked compare of two 80-byte IPv6+TCP headers; true when equal */
SK_INLINE_ATTRIBUTE
static bool
ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
{
	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
}
889 
890 SK_INLINE_ATTRIBUTE
891 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)892 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
893     struct fsw_stats *fsws)
894 {
895 	bool match;
896 
897 	ASSERT(fa->fa_sptr != NULL);
898 	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
899 	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
900 
901 	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
902 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
903 		goto slow_path;
904 	}
905 
906 	if (__improbable(fa->fa_sobj_is_short)) {
907 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
908 		goto slow_path;
909 	}
910 
911 	if (__improbable(pkt->pkt_flow_tcp_hlen !=
912 	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
913 		goto slow_path;
914 	}
915 
916 	switch (pkt->pkt_flow_ip_ver) {
917 	case IPVERSION:
918 		match = ipv4_tcp_memcmp(fa->fa_sptr,
919 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
920 		break;
921 	case IPV6_VERSION:
922 		match = ipv6_tcp_memcmp(fa->fa_sptr,
923 		    (uint8_t *)pkt->pkt_flow_ip_hdr);
924 		break;
925 	default:
926 		VERIFY(0);
927 		/* NOTREACHED */
928 		__builtin_unreachable();
929 	}
930 
931 	if (__improbable(!match)) {
932 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
933 		goto slow_path;
934 	}
935 	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
936 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
937 		goto slow_path;
938 	}
939 
940 	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
941 	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
942 	fa->fa_ulen = pkt->pkt_flow_ulen;
943 	return true;
944 
945 slow_path:
946 	return false;
947 }
948 
949 SK_NO_INLINE_ATTRIBUTE
950 static bool
can_agg_slowpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)951 can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
952     struct fsw_stats *fsws)
953 {
954 	uint8_t *sl3_hdr = fa->fa_sptr;
955 	uint32_t sl3tlen = 0;
956 	uint16_t sl3hlen = 0;
957 
958 	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
959 	    uint8_t *, sl3_hdr);
960 
961 	ASSERT(sl3_hdr != NULL);
962 
963 	/*
964 	 * Compare IP header length, TOS, frag flags and IP options
965 	 * For IPv4, the options should match exactly
966 	 * For IPv6, if options are present, bail out
967 	 */
968 	if (pkt->pkt_flow_ip_ver == IPVERSION) {
969 		struct ip *siph = (struct ip *)(void *)sl3_hdr;
970 		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;
971 
972 		ASSERT(siph->ip_v == IPVERSION);
973 		/* 16-bit alignment is sufficient (handles mbuf case) */
974 		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
975 		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));
976 
977 		sl3hlen = (siph->ip_hl << 2);
978 		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
979 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
980 			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
981 			    pkt->pkt_flow_ip_hlen);
982 			return false;
983 		}
984 
985 		if (siph->ip_ttl != iph->ip_ttl) {
986 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
987 			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
988 			    uint8_t, iph->ip_ttl);
989 			return false;
990 		}
991 
992 		if (siph->ip_tos != iph->ip_tos) {
993 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
994 			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
995 			    uint8_t, iph->ip_tos);
996 			return false;
997 		}
998 		/* For IPv4, DF bit should match */
999 		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
1000 		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
1001 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
1002 			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
1003 			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
1004 			return false;
1005 		}
1006 
1007 		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
1008 		    sizeof(struct ip);
1009 		if (ip_opts_len > 0 &&
1010 		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
1011 		    ip_opts_len) != 0) {
1012 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
1013 			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
1014 			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
1015 			    (uint8_t *)(iph + 1));
1016 			return false;
1017 		}
1018 		sl3tlen = ntohs(siph->ip_len);
1019 	} else {
1020 		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
1021 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;
1022 
1023 		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
1024 		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
1025 		/* 16-bit alignment is sufficient (handles mbuf case) */
1026 		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
1027 
1028 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
1029 			/*
1030 			 * Don't aggregate if extension header is present in
1031 			 * packet. N.B. currently flow switch only classifies
1032 			 * frag header
1033 			 */
1034 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
1035 			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
1036 			    pkt->pkt_flow_ip_hlen);
1037 			return false;
1038 		}
1039 
1040 		sl3hlen = sizeof(struct ip6_hdr);
1041 		/* For IPv6, flow info mask covers TOS and flow label */
1042 		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
1043 		    sizeof(sip6->ip6_flow)) != 0) {
1044 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
1045 			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
1046 			    ntohl(sip6->ip6_flow), uint32_t,
1047 			    ntohl(ip6->ip6_flow));
1048 			return false;
1049 		}
1050 
1051 		if (sip6->ip6_hlim != ip6->ip6_hlim) {
1052 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
1053 			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
1054 			    uint8_t, ip6->ip6_hlim);
1055 			return false;
1056 		}
1057 
1058 		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
1059 	}
1060 
1061 	/*
1062 	 * For TCP header, compare ACK number and window size
1063 	 * Compare TCP flags
1064 	 * Compare TCP header length and TCP options
1065 	 */
1066 	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
1067 	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
1068 
1069 	uint16_t sl4hlen = (stcp->th_off << 2);
1070 	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
1071 	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
1072 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
1073 		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
1074 		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
1075 		    uint16_t, ntohs(tcp->th_win));
1076 		return false;
1077 	}
1078 
1079 	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
1080 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
1081 		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
1082 		    uint8_t, tcp->th_flags);
1083 		return false;
1084 	}
1085 
1086 	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
1087 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
1088 		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
1089 		    uint8_t, pkt->pkt_flow_tcp_hlen);
1090 		return false;
1091 	}
1092 
1093 	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
1094 	/*
1095 	 * We know that the TCP-option lengthes are the same thanks to the above
1096 	 * sl4hlen check
1097 	 */
1098 	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
1099 	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
1100 		/*
1101 		 * Fast-path header prediction:
1102 		 *
1103 		 * TCP Timestamp option is usually put after two NOP-headers,
1104 		 * and thus total TCP-option length is 12. If that's the case,
1105 		 * we can aggregate as only the TCP time-stamp option differs.
1106 		 */
1107 		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
1108 			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
1109 			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
1110 			return false;
1111 		} else {
1112 			uint32_t sts_hdr, ts_hdr;
1113 			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
1114 				sts_hdr = *((uint32_t *)(stcp + 1));
1115 			} else {
1116 				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
1117 			}
1118 			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
1119 				ts_hdr = *((uint32_t *)(tcp + 1));
1120 			} else {
1121 				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
1122 			}
1123 
1124 			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
1125 			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
1126 				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
1127 				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
1128 				    sts_hdr, uint32_t, ts_hdr);
1129 				return false;
1130 			}
1131 		}
1132 	}
1133 	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
1134 	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
1135 	fa->fa_ulen = pkt->pkt_flow_ulen;
1136 	return true;
1137 }
1138 
1139 static bool
flow_agg_is_ok(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)1140 flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
1141     struct fsw_stats *fsws)
1142 {
1143 	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
1144 	const uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
1145 	bool can_agg = false;
1146 
1147 	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
1148 	    struct __kern_packet *, pkt);
1149 
1150 	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1151 	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
1152 		pkt->pkt_flow_tcp_agg_fast = 0;
1153 	}
1154 	/*
1155 	 * Don't aggregate if any of the following is true:
1156 	 * 1. TCP flag is other than TH_{ACK,PUSH}
1157 	 * 2. Payload length is 0 (pure ACK)
1158 	 * 3. This is the first packet
1159 	 * 4. TCP sequence number is not expected
1160 	 * 5. We would've exceeded the maximum aggregated size
1161 	 * 6. It's not the first packet and the wake flag is set
1162 	 */
1163 	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
1164 	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
1165 		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
1166 		goto done;
1167 	}
1168 	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
1169 		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
1170 		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
1171 		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
1172 		goto done;
1173 	}
1174 	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
1175 		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
1176 		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
1177 		/* We've reached aggregation limit */
1178 		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
1179 		goto done;
1180 	}
1181 	if (__improbable((pkt->pkt_pflags & PKT_F_WAKE_PKT) && fa->fa_total > 0)) {
1182 		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
1183 		goto done;
1184 	}
1185 
1186 	can_agg = can_agg_fastpath(fa, pkt, fsws);
1187 	if (can_agg) {
1188 		pkt->pkt_flow_tcp_agg_fast = 1;
1189 		goto done;
1190 	}
1191 
1192 	can_agg = can_agg_slowpath(fa, pkt, fsws);
1193 	ASSERT(!pkt->pkt_flow_tcp_agg_fast);
1194 
1195 done:
1196 	return can_agg;
1197 }
1198 
/*
 * Merge 'pkt' into the current super object (packet or mbuf) tracked by
 * 'fa': grow the stored IP length, incrementally fix the IP/TCP
 * checksums (using the precomputed payload checksum 'data_csum'), fold
 * in the new segment's timestamp option and PSH flag where needed, and
 * bump the super object's length/segment-count metadata.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the stored IP header:
	 * 1. Grow the IP total/payload length by the new payload length
	 * 2. Incrementally fix the IPv4 header checksum for the length
	 *    change (IPv6 has no header checksum)
	 * Update the resp. flow classification fields, if any
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* incremental checksum update for the ip_len delta */
		siph->ip_sum = __packet_fix_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			/*
			 * Options differ: per the slow-path check this can
			 * only be the timestamp values (TSval at offset 4,
			 * TSecr at offset 8 of the option block).  Splice
			 * the new values in with incremental csum fixes.
			 */
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			__packet_fix_hdr_sum(sopt + 4, &stcp->th_sum, ntsval);
			__packet_fix_hdr_sum(sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			/*
			 * Fold the flags change into the checksum: th_flags
			 * shares a 16-bit word with th_off/th_x2 (the word
			 * right after th_ack).
			 */
			uint16_t old, new;
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = __packet_fix_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		/* saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1359 
1360 /*
1361  * Copy metadata from source packet to destination packet
1362  */
1363 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1364 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1365 {
1366 	/* Copy packet metadata */
1367 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1368 	_PKT_COPY(spkt, dpkt);
1369 }
1370 
1371 static void
pkt_finalize(kern_packet_t ph)1372 pkt_finalize(kern_packet_t ph)
1373 {
1374 	int err = __packet_finalize(ph);
1375 	VERIFY(err == 0);
1376 #if (DEVELOPMENT || DEBUG)
1377 	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1378 	uint8_t *buf;
1379 	MD_BUFLET_ADDR_ABS(pkt, buf);
1380 	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
1381 	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
1382 	    uint8_t *, buf);
1383 #endif
1384 }
1385 
1386 SK_INLINE_ATTRIBUTE
1387 static inline uint32_t
_estimate_buflet_cnt(struct flow_entry * fe,struct kern_pbufpool * pp)1388 _estimate_buflet_cnt(struct flow_entry *fe, struct kern_pbufpool *pp)
1389 {
1390 	uint32_t cnt;
1391 
1392 	_CASSERT(MAX_BUFLET_COUNT <= UINT8_MAX);
1393 	cnt = howmany(((fe->fe_rx_pktq_bytes + sizeof(struct ip6_hdr)) +
1394 	    sizeof(struct tcphdr)), pp->pp_buflet_size);
1395 	cnt = MAX(KPKTQ_LEN(&fe->fe_rx_pktq), cnt);
1396 	cnt = MIN(cnt, MAX_BUFLET_COUNT);
1397 	return cnt;
1398 }
1399 
1400 SK_INLINE_ATTRIBUTE
1401 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1402 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1403     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1404 {
1405 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1406 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1407 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1408 		pbuf = buf;
1409 		dbuf_array->dba_buflet[i] = NULL;
1410 	}
1411 	ASSERT(pbuf != NULL);
1412 	dbuf_array->dba_num_dbufs = 0;
1413 	*lbuf = pbuf;
1414 }
1415 
1416 SK_INLINE_ATTRIBUTE
1417 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1418 _free_dbuf_array(struct kern_pbufpool *pp,
1419     _dbuf_array_t *dbuf_array)
1420 {
1421 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1422 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1423 		pp_free_buflet(pp, buf);
1424 		dbuf_array->dba_buflet[i] = NULL;
1425 	}
1426 	dbuf_array->dba_num_dbufs = 0;
1427 }
1428 
/*
 * Aggregate the flow entry's Rx queue of TCP packets into "super"
 * packets destined for a channel ring.  Walks fe_rx_pktq, batching
 * buflet allocation; consecutive in-order segments with matching
 * headers are merged into one growing super packet (flow_agg_merge_hdr)
 * until aggregation fails, at which point the current super packet is
 * finalized and a new one started.  Consumed source packets go to
 * disposed_pkts; unaggregatable/failed ones to dropped_pkts or the
 * host flow entry (RST case).
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq pkts;               /* dst super packets */
	struct pktq disposed_pkts;      /* done src packets */

	KPKTQ_INIT(&pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		/* no destination ring: drop the whole input queue */
		SK_ERR("Rx ring is NULL");
		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(dropped_pkts));
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t sbuf = NULL;      /* last buflet of current super pkt */
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint8_t iter = 0;               /* index of next unused buflet */
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));

	/* best-effort prefill of the buflet pool for this batch */
	bh_cnt_tmp = bh_cnt = _estimate_buflet_cnt(fe, dpp);
	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* warm the cache with the next packet's buffer */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
			KPKTQ_ENQUEUE(dropped_pkts, pkt);
			continue;
		}

		if (is_mbuf) {          /* compat */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
		}

		/*
		 * Try appending to the current super packet; only when
		 * the previous segment's checksum verified cleanly.
		 */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			/* payload fits into the tail buflet's free space? */
			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just
				 * append to curr_m.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, dpp->pp_buflet_size);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			KPKTQ_ENQUEUE(dropped_pkts, pkt);
			continue;
		}
		/* refill the batch buflet array if running low */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = _estimate_buflet_cnt(fe, dpp);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				KPKTQ_ENQUEUE(dropped_pkts, pkt);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    (kern_buflet_t)(buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						    dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in curr_m, thus we must
			 * have added to m->m_data.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				spkts++;
				if (bufcnt > 1) {
					spkt->pkt_aggr_type =
					    PKT_AGGR_SINGLE_IP;
				}
				pkt_finalize(sph);
				pkt_agg_log(spkt, kernproc, false);
				DTRACE_SKYWALK1(aggr__buflet__count, uint16_t,
				    bufcnt);
				sph = 0;
				spkt = NULL;
				FLOW_AGG_CLEAR(&fa);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				KPKTQ_ENQUEUE(dropped_pkts, pkt);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&pkts, spkt);
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(&fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	while (bh_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		spkts++;
		if (bufcnt > 1) {
			spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		pkt_finalize(sph);
		pkt_agg_log(spkt, kernproc, false);
		DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
		sph = 0;
		spkt = NULL;
		FLOW_AGG_CLEAR(&fa);
	}
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
	KPKTQ_FINI(&pkts);

	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);

	pp_free_pktq(&disposed_pkts);
}
1747 
1748 SK_NO_INLINE_ATTRIBUTE
1749 static void
flow_rx_agg_host(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * dropped_pkts,bool is_mbuf)1750 flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
1751     struct pktq *dropped_pkts, bool is_mbuf)
1752 {
1753 	struct flow_agg fa;             /* states */
1754 	FLOW_AGG_CLEAR(&fa);
1755 
1756 	struct pktq disposed_pkts;      /* done src packets */
1757 	KPKTQ_INIT(&disposed_pkts);
1758 
1759 	int alloced = 0;
1760 	int factor;
1761 
1762 	struct __kern_packet *pkt, *tpkt;
1763 	/* points to the first mbuf of chain */
1764 	struct mbuf *m_chain = NULL;
1765 	/* super mbuf, at the end it points to last mbuf packet */
1766 	struct  mbuf *smbuf = NULL, *curr_m = NULL;
1767 	bool prev_csum_ok = false, csum_ok, agg_ok;
1768 	uint16_t smbufs = 0;
1769 	uint32_t bytes = 0, rcvd_ulen = 0;
1770 	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
1771 	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
1772 	uint32_t largest_smbuf = 0;
1773 	int err = 0;
1774 
1775 	struct fsw_stats *fsws = &fsw->fsw_stats;
1776 	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1777 
1778 	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1779 
1780 	/* state for mbuf batch alloc */
1781 	uint32_t mhead_cnt;
1782 	uint32_t mhead_bufsize;
1783 	struct mbuf * mhead = NULL;
1784 
1785 	uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;
1786 
1787 	SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);
1788 
1789 	if (__probable(!is_mbuf)) {
1790 		uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
1791 
1792 		/*
1793 		 *  Batch mbuf alloc is based on
1794 		 * convert_native_pkt_to_mbuf_chain
1795 		 */
1796 		if (__probable(fe->fe_rx_largest_msize != 0 &&
1797 		    max_ip_len > 0)) {
1798 			unsigned int one;
1799 			int wait;
1800 
1801 			if (fe->fe_rx_largest_msize <= MCLBYTES) {
1802 				mhead_bufsize = MCLBYTES;
1803 			} else if (fe->fe_rx_largest_msize <= MBIGCLBYTES) {
1804 				mhead_bufsize = MBIGCLBYTES;
1805 			} else {
1806 				mhead_bufsize = M16KCLBYTES;
1807 			}
1808 
1809 try_again:
1810 			if (fe->fe_rx_pktq_bytes != 0) {
1811 				uint32_t aggregation_size =
1812 				    MAX(fe->fe_rx_largest_msize, MCLBYTES);
1813 
1814 				aggregation_size =
1815 				    MIN(aggregation_size, mhead_bufsize);
1816 
1817 				factor = (fe->fe_rx_pktq_bytes / max_ip_len) *
1818 				    (MAX(sizeof(struct ip),
1819 				    sizeof(struct ip6_hdr)) +
1820 				    sizeof(struct tcphdr));
1821 
1822 				mhead_cnt = MAX(((fe->fe_rx_pktq_bytes +
1823 				    factor) / aggregation_size) + 1, 1);
1824 			} else {
1825 				/* No payload, thus it's all small-sized ACKs/... */
1826 				mhead_bufsize = MHLEN;
1827 				mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
1828 			}
1829 
1830 			one = 1;
1831 
1832 			if (mhead_bufsize >= MBIGCLBYTES) {
1833 				wait = M_NOWAIT;
1834 			} else {
1835 				wait = M_WAITOK;
1836 			}
1837 
1838 			mhead = m_allocpacket_internal(&mhead_cnt,
1839 			    mhead_bufsize, &one, wait, 1, 0);
1840 
1841 			if (mhead == NULL) {
1842 				if (mhead_bufsize == M16KCLBYTES) {
1843 					mhead_bufsize = MBIGCLBYTES;
1844 					goto try_again;
1845 				}
1846 
1847 				if (mhead_bufsize == MBIGCLBYTES) {
1848 					mhead_bufsize = MCLBYTES;
1849 					goto try_again;
1850 				}
1851 			}
1852 		} else {
1853 			mhead = NULL;
1854 			mhead_bufsize = mhead_cnt = 0;
1855 		}
1856 		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
1857 		    mhead_bufsize);
1858 	}
1859 
1860 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1861 		if (tpkt != NULL) {
1862 			void *baddr;
1863 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1864 			SK_PREFETCH(baddr, 0);
1865 		}
1866 
1867 		/* Validate l2 len, ip vers, is_mbuf */
1868 		ASSERT(pkt->pkt_l2_len == l2len);
1869 		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1870 		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1871 		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
1872 		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1873 		ASSERT(!pkt->pkt_flow_ip_is_frag);
1874 		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1875 
1876 		csum_ok = false;
1877 		agg_ok = false;
1878 		/*
1879 		 * As we only agg packets with same hdr length,
1880 		 * leverage the pkt metadata
1881 		 */
1882 		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1883 		    pkt->pkt_flow_tcp_hlen);
1884 		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1885 
1886 		/*
1887 		 * Rather than calling flow_pkt_track() for each
1888 		 * packet here, we accumulate received packet stats
1889 		 * for the call to flow_track_stats() below.  This
1890 		 * is because flow tracking is a no-op for traffic
1891 		 * that belongs to the host stack.
1892 		 */
1893 		rcvd_ulen += pkt->pkt_flow_ulen;
1894 		rcvd_bytes += pkt->pkt_length;
1895 		rcvd_packets++;
1896 
1897 		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1898 		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1899 
1900 		/* packet is for BSD flow, create a mbuf chain */
1901 		uint32_t len = (l2len + plen);
1902 		uint16_t data_csum = 0;
1903 		struct mbuf *m;
1904 		if (__improbable(is_mbuf)) {
1905 			m = pkt->pkt_mbuf;
1906 			/* Detach mbuf from source pkt */
1907 			KPKT_CLEAR_MBUF_DATA(pkt);
1908 
1909 			uint32_t trailer = (m_pktlen(m) - len);
1910 			ASSERT((uint32_t)m_pktlen(m) >= plen);
1911 			/* Remove the trailer */
1912 			if (trailer > 0) {
1913 				m_adj(m, -trailer);
1914 			}
1915 			/* attached mbuf is already allocated */
1916 			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
1917 		} else {                /* native */
1918 			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
1919 			    l2len;
1920 			uint32_t tot_len = (len + pad);
1921 			/* remember largest aggregated packet size */
1922 			if (smbuf) {
1923 				if (largest_smbuf < (uint32_t)m_pktlen(smbuf)) {
1924 					largest_smbuf =
1925 					    (uint32_t)m_pktlen(smbuf);
1926 				}
1927 			}
1928 
1929 			if (prev_csum_ok && curr_m) {
1930 				ASSERT(fa.fa_smbuf == smbuf);
1931 				ASSERT(!fa.fa_sobj_is_pkt);
1932 				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1933 
1934 				if (agg_ok &&
1935 				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
1936 					/*
1937 					 * No need for a new mbuf,
1938 					 * just append to curr_m.
1939 					 */
1940 					csum_ok = copy_pkt_csum_packed(pkt,
1941 					    plen, NULL, is_ipv4, curr_m, NULL,
1942 					    &data_csum, NULL);
1943 
1944 					if (!csum_ok) {
1945 						STATS_INC(fsws,
1946 						    FSW_STATS_RX_AGG_BAD_CSUM);
1947 						SK_ERR("Checksum for "
1948 						    "aggregation is wrong");
1949 						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
1950 						/*
1951 						 * Turns out, checksum is wrong!
1952 						 * Fallback to no-agg mode.
1953 						 */
1954 						agg_ok = 0;
1955 					} else {
1956 						/*
1957 						 * We only added payload,
1958 						 * thus -thlen.
1959 						 */
1960 						bytes += (plen - thlen);
1961 						flow_agg_merge_hdr(&fa, pkt,
1962 						    data_csum, fsws);
1963 						goto next;
1964 					}
1965 				}
1966 			}
1967 
1968 			/*
1969 			 * If the batch allocation returned partial success,
1970 			 * we try blocking allocation here again
1971 			 */
1972 			m = mhead;
1973 			if (__improbable(m == NULL ||
1974 			    tot_len > mhead_bufsize)) {
1975 				unsigned int one = 1;
1976 
1977 				ASSERT(mhead_cnt == 0 || mhead != NULL);
1978 				err = mbuf_allocpacket(MBUF_WAITOK, tot_len,
1979 				    &one, &m);
1980 				if (err != 0) {
1981 					STATS_INC(fsws,
1982 					    FSW_STATS_RX_DROP_NOMEM_BUF);
1983 					SK_ERR("mbuf alloc failed (err %d)",
1984 					    err);
1985 					KPKTQ_ENQUEUE(dropped_pkts, pkt);
1986 					drop_packets++;
1987 					drop_bytes += pkt->pkt_length;
1988 					continue;
1989 				}
1990 				alloced++;
1991 			} else {
1992 				ASSERT(mhead_cnt > 0);
1993 				mhead = m->m_nextpkt;
1994 				m->m_nextpkt = NULL;
1995 				mhead_cnt--;
1996 			}
1997 			m->m_data += pad;
1998 			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
1999 
2000 			/*
2001 			 * copy and checksum l3, l4 and payload
2002 			 * l2 header is copied later only if we
2003 			 * can't agg as an optimization
2004 			 */
2005 			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
2006 			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
2007 			if (agg_ok) {
2008 				int added = 0;
2009 				dbuf_array.dba_mbuf[0] = m;
2010 				dbuf_array.dba_num_dbufs = 1;
2011 				csum_ok = copy_pkt_csum_packed(pkt, plen,
2012 				    &dbuf_array, is_ipv4, curr_m, NULL,
2013 				    &data_csum, &added);
2014 
2015 				if (!csum_ok) {
2016 					STATS_INC(fsws,
2017 					    FSW_STATS_RX_AGG_BAD_CSUM);
2018 					SK_ERR("Checksum for aggregation "
2019 					    "on new mbuf is wrong");
2020 					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
2021 					agg_ok = false;
2022 					goto non_agg;
2023 				}
2024 
2025 				/*
2026 				 * There was not enough space in curr_m,
2027 				 * thus we must have added to m->m_data.
2028 				 */
2029 				VERIFY(added > 0);
2030 				VERIFY(m->m_len == m->m_pkthdr.len &&
2031 				    (uint32_t)m->m_len <=
2032 				    (uint32_t)mbuf_maxlen(m));
2033 
2034 				/*
2035 				 * We account for whatever we added
2036 				 * to m later on, thus - added.
2037 				 */
2038 				bytes += plen - thlen - added;
2039 			} else {
2040 non_agg:
2041 				dbuf_array.dba_mbuf[0] = m;
2042 				dbuf_array.dba_num_dbufs = 1;
2043 				m->m_len += l2len;
2044 				m->m_pkthdr.len += l2len;
2045 				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
2046 				    &data_csum, is_ipv4);
2047 				if (__improbable(!csum_ok)) {
2048 					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
2049 					SK_ERR("%d incorrect csum", __LINE__);
2050 					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
2051 				}
2052 				VERIFY(m->m_len == m->m_pkthdr.len &&
2053 				    (uint32_t)m->m_len <=
2054 				    (uint32_t)mbuf_maxlen(m));
2055 			}
2056 
2057 			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
2058 			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);
2059 
2060 			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
2061 			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
2062 			/*
2063 			 *  Note that these flags have same value,
2064 			 * except PACKET_CSUM_PARTIAL
2065 			 */
2066 			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
2067 			    PACKET_CSUM_RX_FLAGS);
2068 
2069 			/* Set the rcvif */
2070 			m->m_pkthdr.rcvif = fsw->fsw_ifp;
2071 		}
2072 		ASSERT(m != NULL);
2073 		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
2074 		ASSERT((m->m_flags & M_HASFCS) == 0);
2075 		ASSERT(m->m_nextpkt == NULL);
2076 
2077 		if (__improbable(is_mbuf)) {
2078 			if ((uint32_t) m->m_len < (l2len + thlen)) {
2079 				m = m_pullup(m, (l2len + thlen));
2080 				if (m == NULL) {
2081 					STATS_INC(fsws,
2082 					    FSW_STATS_RX_DROP_NOMEM_BUF);
2083 					SK_ERR("mbuf pullup failed (err %d)",
2084 					    err);
2085 					KPKTQ_ENQUEUE(dropped_pkts, pkt);
2086 					drop_packets++;
2087 					drop_bytes += pkt->pkt_length;
2088 					continue;
2089 				}
2090 				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2091 			}
2092 			if (prev_csum_ok && csum_ok) {
2093 				ASSERT(fa.fa_smbuf == smbuf);
2094 				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2095 			}
2096 		}
2097 
2098 		if (agg_ok) {
2099 			ASSERT(fa.fa_smbuf == smbuf);
2100 			ASSERT(!fa.fa_sobj_is_pkt);
2101 			if (__improbable(is_mbuf)) {
2102 				bytes += (m_pktlen(m) - l2len);
2103 				/* adjust mbuf by l2, l3 and l4  hdr */
2104 				m_adj(m, l2len + thlen);
2105 			} else {
2106 				bytes += m_pktlen(m);
2107 			}
2108 
2109 			m->m_flags &= ~M_PKTHDR;
2110 			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
2111 			while (curr_m->m_next != NULL) {
2112 				curr_m = curr_m->m_next;
2113 			}
2114 			curr_m->m_next = m;
2115 			curr_m = m;
2116 			m = NULL;
2117 		} else {
2118 			if ((uint32_t) m->m_len < l2len) {
2119 				m = m_pullup(m, l2len);
2120 				if (m == NULL) {
2121 					STATS_INC(fsws,
2122 					    FSW_STATS_RX_DROP_NOMEM_BUF);
2123 					SK_ERR("mbuf pullup failed (err %d)",
2124 					    err);
2125 					KPKTQ_ENQUEUE(dropped_pkts, pkt);
2126 					drop_packets++;
2127 					drop_bytes += pkt->pkt_length;
2128 					continue;
2129 				}
2130 				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2131 			}
2132 
2133 			/* copy l2 header for native */
2134 			if (__probable(!is_mbuf)) {
2135 				uint16_t llhoff = pkt->pkt_headroom;
2136 				uint8_t *baddr;
2137 				MD_BUFLET_ADDR_ABS(pkt, baddr);
2138 				ASSERT(baddr != NULL);
2139 				baddr += llhoff;
2140 				pkt_copy(baddr, m->m_data, l2len);
2141 			}
2142 			/* adjust mbuf by l2 hdr */
2143 			m_adj(m, l2len);
2144 			bytes += m_pktlen(m);
2145 
2146 			/*
2147 			 * aggregated packets can be skipped by pktap because
2148 			 * the original pre-aggregated chain already passed through
2149 			 * pktap (see fsw_snoop()) before entering this function.
2150 			 */
2151 			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
2152 
2153 			if (m_chain == NULL) {
2154 				/* this is the start of the chain */
2155 				m_chain = m;
2156 				smbuf = m;
2157 				curr_m = m;
2158 			} else if (smbuf != NULL) {
2159 				/*
2160 				 * set m to be next packet
2161 				 */
2162 				mbuf_agg_log(smbuf, kernproc, is_mbuf);
2163 				smbuf->m_nextpkt = m;
2164 				smbuf = m;
2165 				curr_m = m;
2166 			} else {
2167 				VERIFY(0);
2168 			}
2169 
2170 			smbufs++;
2171 			m = NULL;
2172 
2173 			flow_agg_init_smbuf(&fa, smbuf, pkt);
2174 			/*
2175 			 * if the super packet is an mbuf which can't accomodate
2176 			 * (sizeof(struct ip6_tcp_mask) in a single buffer then
2177 			 * do the aggregation check in slow path.
2178 			 * Note that an mbuf without cluster has only 80 bytes
2179 			 * available for data, sizeof(struct ip6_tcp_mask) is
2180 			 * also 80 bytes, so if the packet contains an
2181 			 * ethernet header, this mbuf won't be able to fully
2182 			 * contain "struct ip6_tcp_mask" data in a single
2183 			 * buffer.
2184 			 */
2185 			if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
2186 				if (__improbable(smbuf->m_len <
2187 				    ((smbuf->m_data -
2188 				    (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
2189 				    MASK_SIZE))) {
2190 					fa.fa_sobj_is_short = true;
2191 				}
2192 			}
2193 		}
2194 next:
2195 		pkt_agg_log(pkt, kernproc, true);
2196 		prev_csum_ok = csum_ok;
2197 		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2198 	}
2199 
2200 	KPKTQ_FINI(&fe->fe_rx_pktq);
2201 
2202 	/* Free any leftover mbufs, true only for native  */
2203 	if (__improbable(mhead != NULL)) {
2204 		ASSERT(mhead_cnt != 0);
2205 		(void) m_freem_list(mhead);
2206 		mhead = NULL;
2207 		mhead_cnt = 0;
2208 		mhead_bufsize = 0;
2209 	}
2210 
2211 	if (fe->fe_rx_largest_msize > largest_smbuf) {
2212 		/*
2213 		 * Make it slowly move towards smbuf if we consistently get
2214 		 * non-aggregatable size.
2215 		 *
2216 		 * If we start at 16K, this makes us go to 4K within 6 rounds
2217 		 * and down to 2K within 12 rounds.
2218 		 */
2219 		fe->fe_rx_largest_msize -=
2220 		    ((fe->fe_rx_largest_msize - largest_smbuf) >> 2);
2221 	} else {
2222 		fe->fe_rx_largest_msize +=
2223 		    ((largest_smbuf - fe->fe_rx_largest_msize) >> 2);
2224 	}
2225 
2226 	if (smbufs > 0) {
2227 		/* Last smbuf */
2228 		mbuf_agg_log(smbuf, kernproc, is_mbuf);
2229 		SK_DF(logflags, "smbuf count %u", smbufs);
2230 
2231 		ASSERT(m_chain != NULL);
2232 		ASSERT(smbuf != NULL);
2233 		/*
2234 		 * Call fsw_host_sendup() with mbuf chain
2235 		 * directly.
2236 		 */
2237 		mchain_agg_log(m_chain, kernproc, is_mbuf);
2238 		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);
2239 
2240 		if (__improbable(is_mbuf)) {
2241 			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
2242 		} else {
2243 			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
2244 		}
2245 		FLOW_STATS_IN_ADD(fe, spackets, smbufs);
2246 
2247 		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
2248 	}
2249 
2250 	/* record (raw) number of packets and bytes */
2251 	ASSERT((int)(rcvd_bytes - drop_bytes) > 0);
2252 	ASSERT((int)(rcvd_packets - drop_packets) > 0);
2253 	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
2254 	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);
2255 
2256 	pp_free_pktq(&disposed_pkts);
2257 }
2258 
2259 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2260 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2261 {
2262 	struct pktq dropped_pkts;
2263 	bool is_mbuf;
2264 
2265 	if (__improbable(fe->fe_rx_frag_count > 0)) {
2266 		dp_flow_rx_process(fsw, fe);
2267 		return;
2268 	}
2269 
2270 	KPKTQ_INIT(&dropped_pkts);
2271 
2272 	if (!dp_flow_rx_route_process(fsw, fe)) {
2273 		SK_ERR("Rx route bad");
2274 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2275 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2276 		    KPKTQ_LEN(&dropped_pkts));
2277 		goto done;
2278 	}
2279 
2280 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2281 
2282 	if (fe->fe_nx_port == FSW_VP_HOST) {
2283 		boolean_t do_rx_agg;
2284 
2285 		/* BSD flow */
2286 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2287 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2288 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2289 		} else {
2290 			do_rx_agg = !dlil_has_ip_filter() &&
2291 			    !dlil_has_if_filter(fsw->fsw_ifp);
2292 		}
2293 		if (__improbable(!do_rx_agg)) {
2294 			fsw_host_rx(fsw, fe);
2295 			return;
2296 		}
2297 		if (__improbable(pktap_total_tap_count != 0)) {
2298 			fsw_snoop(fsw, fe, true);
2299 		}
2300 		flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2301 	} else {
2302 		/* channel flow */
2303 		if (__improbable(pktap_total_tap_count != 0)) {
2304 			fsw_snoop(fsw, fe, true);
2305 		}
2306 		flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2307 	}
2308 
2309 done:
2310 	pp_free_pktq(&dropped_pkts);
2311 }
2312