xref: /xnu-11215.41.3/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40 
/* Upper bound on an aggregated IP datagram (tunable, capped at IP max). */
#define MAX_AGG_IP_LEN()        MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
/* Max number of buflets/mbufs a destination super-object may span. */
#define MAX_BUFLET_COUNT        (32)
/* TCP flags that disqualify a segment from aggregation. */
#define TCP_FLAGS_IGNORE        (TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt)       (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) &&           \
	                        (_pkt->pkt_pflags & PKT_F_TRUNCATED))
/*
 * Fixed: use the macro parameter '_pkt' consistently; the previous
 * version read 'pkt->pkt_mbuf' and only compiled/behaved correctly
 * when the argument variable happened to be named 'pkt'.
 */
#define PKT_IS_WAKE_PKT(_pkt)   ((PKT_IS_MBUF(_pkt) &&                                   \
	                        (_pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	                        (!PKT_IS_MBUF(_pkt) &&                                   \
	                        (_pkt->pkt_pflags & PKT_F_WAKE_PKT)))
51 
52 
/*
 * Signature for the incremental header-checksum fixup used while growing a
 * super-packet: (current csum, old 16-bit value, new 16-bit value) -> csum.
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Real incremental fixup (RFC 1624 style); defined later in this file. */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant, selected when hardware LRO owns the header checksums. */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60 
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *          _fa_sobj;
				struct mbuf *   _fa_smbuf;      /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *__indexable _fa_sptr;        /* ptr to super IP header */
			bool     _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool     _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;     /* expected next sequence # */
			uint32_t _fa_ulen;        /* expected next ulen */
			uint32_t _fa_total;       /* total aggregated bytes */
			/* function that fix packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		/* raw 64-bit view used for bulk clearing; see FLOW_AGG_CLEAR */
		uint64_t __flow_agg_data[5];
	};
/* convenience accessors into the anonymous union member */
#define fa_sobj           __flow_agg._fa_sobj
#define fa_smbuf          __flow_agg._fa_smbuf
#define fa_spkt           __flow_agg._fa_spkt
#define fa_sptr           __flow_agg._fa_sptr
#define fa_sobj_is_pkt    __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short  __flow_agg._fa_sobj_is_short
#define fa_tcp_seq        __flow_agg._fa_tcp_seq
#define fa_ulen           __flow_agg._fa_ulen
#define fa_total          __flow_agg._fa_total
#define fa_fix_pkt_sum   __flow_agg._fa_fix_pkt_sum
};
98 
#if __has_ptrcheck
/*
 * Reset a flow_agg to the zero state.  With pointer bounds checking the
 * struct is 48 bytes (the __indexable pointer is wider), so the compile-time
 * asserts pin the layout before the bulk zeroing helper is used.  The extra
 * store to fa_fix_pkt_sum keeps the bounds checker aware of the field.
 */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	_CASSERT(sizeof(struct flow_agg) == 48);         \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40);              \
	sk_zero_48(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
#else
/* Same reset without pointer bounds checking: 40-byte layout. */
#define FLOW_AGG_CLEAR(_fa) do {                                    \
	_CASSERT(sizeof(struct flow_agg) == 40);         \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);              \
	sk_zero_32(_fa);                                                \
	(_fa)->fa_fix_pkt_sum = 0;                                                                             \
} while (0)
#endif
114 
#define MASK_SIZE       80      /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask applied when comparing a candidate segment's IPv4+TCP headers
 * against the super-packet's headers: a set bit means the corresponding
 * header bit must match for the segment to be aggregated.
 */
struct ip_tcp_mask {
	struct ip       ip_m;
	struct tcphdr   tcp_m;
	uint32_t        tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
122 
/*
 * IPv4 variant of the header-compare mask.  Mutable-by-design fields
 * (ip_len, ip_id, checksums, th_seq, TH_PUSH) are masked off; everything
 * else must be identical between the super-packet and the new segment.
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
		0,      /* Filling up to MASK_SIZE */
	},
};
166 
/* IPv6 variant of the header-compare mask (see struct ip_tcp_mask). */
struct ip6_tcp_mask {
	struct ip6_hdr  ip6_m;
	struct tcphdr   tcp_m;
	/* 5 x 32-bit words (20 bytes) of option mask, filling up to MASK_SIZE */
	uint32_t        tcp_option_m[5];
};
172 
173 static const struct ip6_tcp_mask ip6_tcp_mask
174 __sk_aligned(16) =
175 {
176 	.ip6_m = {
177 		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
178 		/* Not checked; aggregated packet's ip_len is increasing */
179 		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
180 		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
181 		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
182 		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
183 		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
184 		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
185 		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
186 		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
187 		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
188 		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
189 		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
190 	},
191 	.tcp_m = {
192 		.th_sport = 0xffff,
193 		.th_dport = 0xffff,
194 		.th_seq = 0,
195 		.th_ack = 0xffffffff,
196 		.th_x2 = 0xf,
197 		.th_off = 0xf,
198 		.th_flags = ~TH_PUSH,
199 		.th_win = 0xffff,
200 		.th_sum = 0,
201 		.th_urp = 0xffff,
202 	},
203 	.tcp_option_m = {
204 		/* Max 40 bytes of TCP options */
205 		0xffffffff,
206 		0xffffffff,
207 		0xffffffff,
208 		0,          /* Filling up to MASK_SIZE */
209 		0,          /* Filling up to MASK_SIZE */
210 	},
211 };
212 
#if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/* verbosity flags depend on whether the pkt still carries an mbuf */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	/* only destination (super) packets can span multiple buflets */
	if (!is_input) {
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", __buflet_get_data_address(buf),
			    __buflet_get_data_length(buf), 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}

/* cheap wrapper: skip the call entirely unless verbose logging is on */
#define pkt_agg_log(_pkt, _p, _is_input) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _pkt_agg_log(_pkt, _p, _is_input);                      \
	}                                                               \
} while (0)
254 
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m_mtod_current(m) != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0));
}

/* cheap wrapper: skip the call entirely unless verbose logging is on */
#define mbuf_agg_log(_m, _p, _is_mbuf) do {                             \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mbuf_agg_log(_m, _p, _is_mbuf);                        \
	}                                                               \
} while (0)
281 
SK_LOG_ATTRIBUTE
static void
_mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	/* walk the packet chain via m_nextpkt and log each head mbuf */
	while (m != NULL) {
		SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
		    m->m_pkthdr.len);

		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);

		m = m->m_nextpkt;
	}
}

/* cheap wrapper: skip the call entirely unless verbose logging is on */
#define mchain_agg_log(_m, _p, _is_mbuf) do {                           \
	if (__improbable(sk_verbose != 0)) {                            \
	        _mchain_agg_log(_m, _p, _is_mbuf);                      \
	}                                                               \
} while (0)
#else
/* SK_LOG disabled: compile the logging hooks away entirely. */
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */
312 
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies the IPv4 header checksum (when 'verify_l3') and the full TCP
 * checksum of 'pkt', whose data lives in the attached mbuf 'm'.  On return
 * '*data_csum' holds the 16-bit folded sum of the TCP payload alone, which
 * the aggregation path later folds into the super-packet's checksum.
 * The computed full RX checksum is stashed on 'pkt' so it transfers to the
 * super packet.  Returns true iff the checksums verify.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total length: L2 + IP header + TCP header + TCP payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;       /* offset of IP header in m */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* zero th_sum so the header sum excludes the checksum field */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		/* pseudo-header + TCP header sum (no payload) */
		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/*
		 * Remove the pseudo-header/TCP-header contribution from the
		 * wire checksum; what's left is the payload-only sum.
		 */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		/* valid iff the full sum folds to all-ones */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* bad IPv4 header checksum; stop early */
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
423 
/*
 * Structure to pass an array of data buffers.  Either an array of buflets
 * or of mbufs, discriminated by dba_is_buflet; dba_num_dbufs is the count
 * of valid entries.
 */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;
	bool dba_is_buflet;
} _dbuf_array_t;
433 
/*
 * Copy 'plen' bytes of 'spkt' starting at offset 'soff' into the
 * destination buffers in 'dbuf' (appending after each buffer's current
 * data length), optionally accumulating a partial one's-complement sum in
 * '*partial_sum' ('*odd_start' carries the byte-parity across buffers).
 * Destination lengths (buflet dlen, or mbuf m_len plus the head mbuf's
 * pkthdr.len) are advanced by the amount copied.  Asserts that the
 * buffers in 'dbuf' collectively have room for 'plen' bytes.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		/* locate the append point and remaining room in buffer i */
		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			/* XXX -fbounds-safety: use the inline variant to return an __indexable */
			dbuf_addr = __buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}

		tmplen = min(plen, dbuf_lim);
		/* source bytes come from the attached mbuf when truncated */
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* account the copied bytes in the destination */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
494 
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when current destination
 * packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Only the TCP payload is copied (headers are verified in place); on
 * checksum failure all destination length changes are rolled back and
 * false is returned.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;      /* saved for rollback on csum failure */
	uint32_t curr_trailing;   /* free space in current destination */
	char *curr_ptr;           /* append point in current destination */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		curr_ptr = (char *)__buflet_get_data_address(currp) + currp->buf_doff +
		    currp->buf_dlen;
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checkum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* L4 needs software verification unless offload already did it */
	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;       /* TCP payload length */
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new destination length (rolled back below on failure) */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restore to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
686 
/*
 * Copy and checksum for packet or packet with mbuf
 * data_csum is only supported for bsd flows
 *
 * Unlike copy_pkt_csum_packed(), this copies the entire packet (IP header,
 * TCP header and payload) into 'dbuf', verifying the IPv4 header checksum
 * (when 'verify_l3') and the TCP checksum along the way.  '*data_csum'
 * receives the 16-bit folded sum of the TCP payload.  Returns true iff
 * the checksums verify.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;           /* append point in first dest buffer */

	if (dbuf->dba_is_buflet) {
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		daddr = __buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	/* offload already produced a full checksum: copy without summing */
	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;       /* l4 length: TCP header + payload */

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* account the copied headers in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	/* bad IPv4 header: the copy is complete but the packet is invalid */
	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
857 
858 SK_INLINE_ATTRIBUTE
859 static void
flow_agg_init_common(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * pkt)860 flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
861     struct __kern_packet *pkt)
862 {
863 	struct ifnet *ifp;
864 
865 	switch (pkt->pkt_flow_ip_ver) {
866 	case IPVERSION:
867 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
868 			return;
869 		}
870 		break;
871 	case IPV6_VERSION:
872 		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
873 			return;
874 		}
875 		break;
876 	default:
877 		VERIFY(0);
878 		/* NOTREACHED */
879 		__builtin_unreachable();
880 	}
881 
882 	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
883 	fa->fa_ulen = pkt->pkt_flow_ulen;
884 	fa->fa_total = pkt->pkt_flow_ip_hlen +
885 	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
886 
887 	ifp = fsw->fsw_ifp;
888 	ASSERT(ifp != NULL);
889 	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
890 		/* in case hardware supports LRO, don't fix checksum in the header */
891 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
892 	} else {
893 		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
894 	}
895 }
896 
897 static void
flow_agg_init_smbuf(struct nx_flowswitch * fsw,struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)898 flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
899     struct mbuf *smbuf, struct __kern_packet *pkt)
900 {
901 	FLOW_AGG_CLEAR(fa);
902 
903 	ASSERT(smbuf != NULL);
904 	fa->fa_smbuf = smbuf;
905 
906 	fa->fa_sptr = mtod(smbuf, uint8_t *);
907 	ASSERT(fa->fa_sptr != NULL);
908 
909 	/*
910 	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
911 	 * contents of the flow structure which don't exist in 'smbuf'.
912 	 */
913 	flow_agg_init_common(fsw, fa, pkt);
914 }
915 
916 static void
flow_agg_init_spkt(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)917 flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
918     struct __kern_packet *spkt, struct __kern_packet *pkt)
919 {
920 	FLOW_AGG_CLEAR(fa);
921 
922 	ASSERT(spkt != NULL);
923 	fa->fa_spkt = spkt;
924 	fa->fa_sobj_is_pkt = true;
925 	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
926 
927 	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
928 	ASSERT(fa->fa_sptr != NULL);
929 
930 	/*
931 	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
932 	 * contents of the flow structure which don't exist in 'spkt'.
933 	 */
934 	flow_agg_init_common(fsw, fa, pkt);
935 }
936 
937 /*
938  * -fbounds-safety: The reason hardcoded values 64 (and 80) are used here is
939  * because this function calls the 64-byte version of sk memcmp function (same
940  * thing for the 80-byte version). In can_agg_fastpath, there is a check being
941  * done for TCP header length with options: sizeof(struct tcphdr) +
942  * TCPOLEN_TSTAMP_APPA , which is 20 + 12 = 32 bytes. In case of IPv4, adding IP
943  * header size of 20 to it makes it 52 bytes. From the sk_memcmp_* variants, the
944  * closest one is the 64B option.
945  */
946 SK_INLINE_ATTRIBUTE
947 static bool
948 ipv4_tcp_memcmp(const uint8_t *__counted_by(64)h1, const uint8_t *__counted_by(64)h2)
949 {
950 	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
951 }
952 
953 SK_INLINE_ATTRIBUTE
954 static bool
955 ipv6_tcp_memcmp(const uint8_t *__counted_by(80)h1, const uint8_t *__counted_by(80)h2)
956 {
957 	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
958 }
959 
/*
 * Fast-path aggregation check: a single masked bulk compare of the
 * candidate packet's IP+TCP headers against the current super-object's
 * headers (fa->fa_sptr).  Applies only when the packet covers the full
 * compare mask, the super object isn't flagged short, and the TCP
 * header is exactly base header + NOP/NOP/timestamp options
 * (TCPOLEN_TSTAMP_APPA).  On success advances fa_tcp_seq past this
 * payload and records the payload length; returns false to let the
 * caller fall back to can_agg_slowpath().
 */
SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;
	uint8_t *ip_hdr;

	ASSERT(fa->fa_sptr != NULL);
	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	/* packet too short to cover the full compare mask */
	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	/* super object flagged short; bulk compare is not safe here */
	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	/* fast path assumes exactly NOP/NOP/timestamp TCP options */
	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		/*
		 * -fbounds-safety: pkt->pkt_flow_ip_hdr is a mach_vm_address_t,
		 * so we forge it here. The reason the constant values 64 and 80
		 * are used is because ipv4_tcp_memcmp takes a __counted_by(64)
		 * and __counted_by(80), respectively.
		 */
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 64);
		match = ipv4_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	case IPV6_VERSION:
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 80);
		match = ipv6_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	/* payload size must match the previous segment's */
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}
1027 
/*
 * Slow-path aggregation check: field-by-field comparison of the
 * candidate packet's IP and TCP headers against the super-object's
 * headers at fa->fa_sptr.
 *
 * IPv4: header length, TTL, TOS, DF/RF bits and any IP options must
 * match.  IPv6: flow info (TOS + flow label) and hop limit must match,
 * and extension headers disqualify the packet.  TCP: ACK number,
 * window, flags (ignoring TH_PUSH) and header length must match;
 * option bytes may differ only when both sides carry the well-known
 * NOP/NOP/timestamp layout (TCPOLEN_TSTAMP_APPA).
 *
 * On success advances fa_tcp_seq past this payload and records the
 * payload length; returns false if the packet cannot be merged.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;
	/* -fbounds-safety: pkt_flow_ip_hdr is a mach_vm_address_t */
	uint8_t *l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_ip_hdr, pkt->pkt_flow_ip_hlen);
	uint32_t sl3tlen = 0;
	uint16_t sl3hlen = 0;

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)(void *)l3_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* option bytes (if any) follow the fixed header directly */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp((uint8_t *)&sip6->ip6_flow, (uint8_t *)&ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	struct tcphdr *tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	uint16_t sl4hlen = (stcp->th_off << 2);
	/* NOTE(review): probe name aggr__fail9 is also used by the IPv6
	 * hop-limit check above — possibly unintended reuse. */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* PUSH may differ between segments; all other flags must match */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengths are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/* read the first 4 option bytes, unaligned-safe */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* both sides must carry NOP/NOP/TS kind+len exactly */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1221 
/*
 * Decide whether 'pkt' can be merged into the aggregation run tracked
 * by 'fa'.  Applies the cheap gating checks listed below, then tries
 * the masked fast-path header compare and finally the field-by-field
 * slow path.  Sets pkt->pkt_flow_tcp_agg_fast when the fast path
 * matched, so flow_agg_merge_hdr() can skip option comparison later.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* start from a clean fast-path flag for this packet */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	/* masked bulk compare first; cheapest positive outcome */
	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	/* field-by-field comparison as a fallback */
	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1281 
1282 static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum,uint16_t old,uint16_t new)1283 flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
1284 {
1285 	return __packet_fix_sum(csum, old, new);
1286 }
1287 
1288 static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum,uint16_t __unused old,uint16_t __unused new)1289 flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
1290     uint16_t __unused new)
1291 {
1292 	return 0;
1293 }
1294 
1295 static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg * fa,uint8_t * __sized_by (sizeof (uint32_t))field,uint16_t * csum,uint32_t new)1296 flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa,
1297     uint8_t *__sized_by(sizeof(uint32_t))field, uint16_t *csum,
1298     uint32_t new)
1299 {
1300 	uint32_t old;
1301 	memcpy((uint8_t *)&old, field, sizeof(old));
1302 	memcpy(field, (uint8_t *)&new, sizeof(uint32_t));
1303 	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1304 	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1305 	    (uint16_t)(old & 0xffff),
1306 	    (uint16_t)(new & 0xffff));
1307 }
1308 
/*
 * Merge 'pkt' into the current super-object's headers: grow the IP
 * length field, patch the TCP timestamp option and PUSH flag when
 * needed, and incrementally update the TCP checksum with the new
 * payload's pseudo-header and data contributions.  'data_csum' is the
 * payload checksum already computed by the caller's packed copy.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* keep the IPv4 header checksum consistent with ip_len */
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	/* account for whether the header check took the fast or slow path */
	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    (struct tcphdr *)pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			/* extract the new segment's TSval/TSecr (unaligned-safe) */
			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			/* rewrite the super packet's timestamps, fixing th_sum */
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			tcp_seq *th_ack = &stcp->th_ack;
			/*
			 * -fbounds-safety: C-style cast (uint16_t *)(th_ack+1)
			 * doesn't work here, because th_ack's bound is a single
			 * uint32_t, so trying to go one address above, and then
			 * later dereferncing it would lead to a panic.
			 */
			uint16_t *next = __unsafe_forge_single(uint16_t *,
			    th_ack + 1);
			old = *next;
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			next = __unsafe_forge_single(uint16_t *, th_ack + 1);
			new = *next;
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum  */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	/* finally, grow the super-object's own length/segment metadata */
	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		/* saturate rather than wrap the segment count */
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1490 
1491 /*
1492  * Copy metadata from source packet to destination packet
1493  */
1494 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1495 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1496 {
1497 	/* Copy packet metadata */
1498 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1499 	_PKT_COPY(spkt, dpkt);
1500 }
1501 
/*
 * Finalize a packet before it is handed off; __packet_finalize must
 * succeed (VERIFY panics otherwise).  On DEVELOPMENT/DEBUG builds also
 * fires a DTrace probe carrying the packet's L3 header address.
 */
static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	/* skip headroom and link-layer bytes to reach the L3 header */
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1516 
1517 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t total_bytes,uint32_t total_pkts,uint32_t min_bufsize,uint32_t agg_bufsize)1518 estimate_buf_cnt(struct flow_entry *fe, uint32_t total_bytes, uint32_t total_pkts,
1519     uint32_t min_bufsize, uint32_t agg_bufsize)
1520 {
1521 	uint32_t max_ip_len = MAX_AGG_IP_LEN();
1522 	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1523 	uint32_t hdr_overhead;
1524 
1525 	if (__improbable(sk_fsw_rx_agg_tcp == 0)) {
1526 		return MIN(total_pkts, MAX_BUFLET_COUNT);
1527 	}
1528 
1529 	agg_size = MIN(agg_size, agg_bufsize);
1530 
1531 	hdr_overhead = (total_bytes / max_ip_len) *
1532 	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1533 	    sizeof(struct tcphdr));
1534 
1535 	return ((total_bytes + hdr_overhead) / agg_size) + 1;
1536 }
1537 
1538 SK_INLINE_ATTRIBUTE
1539 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1540 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1541     _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1542 {
1543 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1544 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1545 		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1546 		pbuf = buf;
1547 		dbuf_array->dba_buflet[i] = NULL;
1548 	}
1549 	ASSERT(pbuf != NULL);
1550 	dbuf_array->dba_num_dbufs = 0;
1551 	*lbuf = pbuf;
1552 }
1553 
1554 SK_INLINE_ATTRIBUTE
1555 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1556 _free_dbuf_array(struct kern_pbufpool *pp,
1557     _dbuf_array_t *dbuf_array)
1558 {
1559 	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1560 		kern_buflet_t buf = dbuf_array->dba_buflet[i];
1561 		pp_free_buflet(pp, buf);
1562 		dbuf_array->dba_buflet[i] = NULL;
1563 	}
1564 	dbuf_array->dba_num_dbufs = 0;
1565 }
1566 
/*
 * Close out the super packet under construction: bump the caller's
 * super-packet counter, tag multi-buflet packets as PKT_AGGR_SINGLE_IP,
 * finalize the packet, track the largest aggregated length seen so
 * far, then reset the caller's spkt/sph cursors and the flow_agg state
 * so a new run can begin.
 */
static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	/* remember the largest aggregated packet for size convergence */
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	/* clear the caller's cursors; this super packet is done */
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}
1586 
1587 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1588 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1589 {
1590 	if (fe->fe_rx_largest_size > largest_agg_size) {
1591 		/*
1592 		 * Make it slowly move towards largest_agg_size if we
1593 		 * consistently get non-aggregatable size.
1594 		 *
1595 		 * If we start at 16K, this makes us go to 4K within 6 rounds
1596 		 * and down to 2K within 12 rounds.
1597 		 */
1598 		fe->fe_rx_largest_size -=
1599 		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1600 	} else {
1601 		fe->fe_rx_largest_size +=
1602 		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1603 	}
1604 }
1605 
1606 SK_NO_INLINE_ATTRIBUTE
1607 static void
flow_rx_agg_channel(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,bool is_mbuf)1608 flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1609     struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
1610 {
1611 #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {    \
1612 	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
1613 	(_pkt) = NULL;                                                     \
1614 	FLOW_AGG_CLEAR(&fa);                                               \
1615 	prev_csum_ok = false;                                              \
1616 } while (0)
1617 	struct flow_agg fa;             /* states */
1618 	FLOW_AGG_CLEAR(&fa);
1619 
1620 	struct pktq super_pkts;         /* dst super packets */
1621 	struct pktq disposed_pkts;      /* done src packets */
1622 
1623 	KPKTQ_INIT(&super_pkts);
1624 	KPKTQ_INIT(&disposed_pkts);
1625 
1626 	struct __kern_channel_ring *ring;
1627 	ring = fsw_flow_get_rx_ring(fsw, fe);
1628 	if (__improbable(ring == NULL)) {
1629 		SK_ERR("Rx ring is NULL");
1630 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
1631 		    KPKTQ_LEN(rx_pkts));
1632 		pp_drop_pktq(rx_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
1633 		    DROP_REASON_FSW_DST_NXPORT_INVALID, __func__, __LINE__);
1634 		return;
1635 	}
1636 	struct kern_pbufpool *dpp = ring->ckr_pp;
1637 	ASSERT(dpp->pp_max_frags > 1);
1638 
1639 	struct __kern_packet *pkt, *tpkt;
1640 	/* state for super packet */
1641 	struct __kern_packet *__single spkt = NULL;
1642 	kern_packet_t sph = 0;
1643 	kern_buflet_t __single sbuf = NULL;
1644 	bool prev_csum_ok = false, csum_ok, agg_ok;
1645 	uint16_t spkts = 0, bufcnt = 0;
1646 	int err;
1647 
1648 	struct fsw_stats *fsws = &fsw->fsw_stats;
1649 
1650 	/* state for buflet batch alloc */
1651 	uint32_t bh_cnt, bh_cnt_tmp;
1652 	uint64_t buf_arr[MAX_BUFLET_COUNT];
1653 	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
1654 	uint32_t largest_spkt = 0; /* largest aggregated packet size */
1655 	uint32_t agg_bufsize;
1656 	uint8_t iter = 0;
1657 	bool large_buffer = false;
1658 
1659 	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1660 	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(rx_pkts));
1661 
1662 	if (__probable(fe->fe_rx_largest_size != 0 &&
1663 	    NX_FSW_TCP_RX_AGG_ENABLED())) {
1664 		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
1665 		    PP_BUF_SIZE_LARGE(dpp) == 0) {
1666 			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1667 		} else {
1668 			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
1669 			large_buffer = true;
1670 		}
1671 		bh_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
1672 		    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
1673 		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
1674 		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
1675 		bh_cnt_tmp = bh_cnt;
1676 	} else {
1677 		/*
1678 		 * No payload, thus it's all small-sized ACKs/...
1679 		 * OR aggregation is disabled.
1680 		 */
1681 		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1682 		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(rx_pkts), MAX_BUFLET_COUNT);
1683 		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
1684 	}
1685 
1686 	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
1687 	    large_buffer);
1688 	if (__improbable(bh_cnt == 0)) {
1689 		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
1690 		    bh_cnt_tmp, err);
1691 	}
1692 	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1693 	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1694 		if (tpkt != NULL) {
1695 			void *baddr;
1696 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1697 			SK_PREFETCH(baddr, 0);
1698 		}
1699 
1700 		ASSERT(pkt->pkt_qum.qum_pp != dpp);
1701 		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1702 		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1703 		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1704 		ASSERT(!pkt->pkt_flow_ip_is_frag);
1705 		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1706 
1707 		csum_ok = false;
1708 		agg_ok = false;
1709 		/* supports TCP only */
1710 		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1711 		    pkt->pkt_flow_tcp_hlen);
1712 		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1713 		uint16_t data_csum = 0;
1714 
1715 		KPKTQ_REMOVE(rx_pkts, pkt);
1716 		rx_bytes -= pkt->pkt_flow_ulen;
1717 		err = flow_pkt_track(fe, pkt, true);
1718 		if (__improbable(err != 0)) {
1719 			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
1720 			/* if need to trigger RST */
1721 			if (err == ENETRESET) {
1722 				flow_track_abort_tcp(fe, pkt, NULL);
1723 			}
1724 			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1725 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1726 			    DROP_REASON_FSW_FLOW_TRACK_ERR, 0);
1727 			continue;
1728 		}
1729 
1730 		if (is_mbuf) {          /* compat */
1731 			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
1732 			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
1733 			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
1734 				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1735 			}
1736 		}
1737 
1738 		if (prev_csum_ok && sbuf) {
1739 			ASSERT(fa.fa_spkt == spkt);
1740 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1741 			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1742 			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);
1743 
1744 			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
1745 			    sbuf->buf_dlen >= plen - thlen) {
1746 				/*
1747 				 * No need for a new packet, just
1748 				 * append to curr_m.
1749 				 */
1750 				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
1751 				    is_ipv4, NULL, sbuf, &data_csum, NULL);
1752 
1753 				if (!csum_ok) {
1754 					STATS_INC(fsws,
1755 					    FSW_STATS_RX_AGG_BAD_CSUM);
1756 					SK_ERR("Checksum for aggregation "
1757 					    "is wrong");
1758 					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
1759 					/*
1760 					 * Turns out, checksum is wrong!
1761 					 * Fallback to no-agg mode.
1762 					 */
1763 					agg_ok = false;
1764 				} else {
1765 					flow_agg_merge_hdr(&fa, pkt,
1766 					    data_csum, fsws);
1767 					goto next;
1768 				}
1769 			}
1770 		}
1771 
1772 		/* calculate number of buflets required */
1773 		bh_cnt_tmp = howmany(plen, agg_bufsize);
1774 		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
1775 			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1776 			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
1777 			    plen);
1778 			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1779 			    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
1780 			continue;
1781 		}
1782 		if (bh_cnt < bh_cnt_tmp) {
1783 			uint32_t tmp;
1784 
1785 			if (iter != 0) {
1786 				/*
1787 				 * rearrange the array for additional
1788 				 * allocation
1789 				 */
1790 				uint8_t i;
1791 				for (i = 0; i < bh_cnt; i++, iter++) {
1792 					buf_arr[i] = buf_arr[iter];
1793 					buf_arr[iter] = 0;
1794 				}
1795 				iter = 0;
1796 			}
1797 			tmp = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
1798 			    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
1799 			tmp = MIN(tmp, MAX_BUFLET_COUNT);
1800 			tmp = MAX(tmp, bh_cnt_tmp);
1801 			tmp -= bh_cnt;
1802 			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
1803 			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
1804 			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
1805 			    &tmp, SKMEM_NOSLEEP, large_buffer);
1806 			bh_cnt += tmp;
1807 			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
1808 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1809 				SK_ERR("buflet alloc failed (err %d)", err);
1810 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1811 				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
1812 				continue;
1813 			}
1814 		}
1815 		/* Use pre-allocated buflets */
1816 		ASSERT(bh_cnt >= bh_cnt_tmp);
1817 		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
1818 		while (bh_cnt_tmp-- > 0) {
1819 			/*
1820 			 * -fbounds-safety: buf_arr[iter] is a uint64_t, so
1821 			 * forging it
1822 			 */
1823 			dbuf_array.dba_buflet[bh_cnt_tmp] =
1824 			    __unsafe_forge_single(kern_buflet_t, buf_arr[iter]);
1825 			buf_arr[iter] = 0;
1826 			bh_cnt--;
1827 			iter++;
1828 		}
1829 		/* copy and checksum TCP data */
1830 		if (agg_ok) {
1831 			int added = 0;
1832 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1833 			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
1834 			    is_ipv4, NULL, sbuf, &data_csum, &added);
1835 
1836 			if (__improbable(!csum_ok)) {
1837 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1838 				SK_ERR("Checksum for aggregation on new "
1839 				    "mbuf is wrong");
1840 				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
1841 				agg_ok = false;
1842 				/* reset the used buflets */
1843 				uint8_t j;
1844 				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
1845 					VERIFY(kern_buflet_set_data_length(
1846 						    dbuf_array.dba_buflet[j], 0) == 0);
1847 				}
1848 				goto non_agg;
1849 			}
1850 
1851 			/*
1852 			 * There was not enough space in curr_m, thus we must
1853 			 * have added to m->m_data.
1854 			 */
1855 			VERIFY(added > 0);
1856 		} else {
1857 non_agg:
1858 			ASSERT(dbuf_array.dba_num_dbufs != 0);
1859 			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
1860 			    &data_csum, is_ipv4);
1861 			if (__improbable(!csum_ok)) {
1862 				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1863 				SK_ERR("%d incorrect csum", __LINE__);
1864 				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
1865 			}
1866 		}
1867 		if (agg_ok) {
1868 			ASSERT(fa.fa_spkt == spkt);
1869 			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1870 			/* update current packet header */
1871 			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
1872 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1873 			bufcnt += dbuf_array.dba_num_dbufs;
1874 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1875 			    &sbuf);
1876 		} else {
1877 			/* Finalize the current super packet */
1878 			if (sph != 0) {
1879 				finalize_super_packet(&spkt, &sph, &fa,
1880 				    &largest_spkt, &spkts, bufcnt);
1881 			}
1882 
1883 			/* New super packet */
1884 			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
1885 			if (__improbable(err != 0)) {
1886 				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1887 				SK_ERR("packet alloc failed (err %d)", err);
1888 				_free_dbuf_array(dpp, &dbuf_array);
1889 				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
1890 				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
1891 				continue;
1892 			}
1893 			spkt = SK_PTR_ADDR_KPKT(sph);
1894 			pkt_copy_metadata(pkt, spkt);
1895 			/* Packet length for super packet starts from L3 */
1896 			spkt->pkt_length = plen;
1897 			spkt->pkt_flow_ulen =  pkt->pkt_flow_ulen;
1898 			spkt->pkt_headroom = 0;
1899 			spkt->pkt_l2_len = 0;
1900 			spkt->pkt_seg_cnt = 1;
1901 
1902 			ASSERT(dbuf_array.dba_num_dbufs > 0);
1903 			bufcnt = dbuf_array.dba_num_dbufs;
1904 			sbuf = kern_packet_get_next_buflet(sph, NULL);
1905 			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1906 			    &sbuf);
1907 
1908 			KPKTQ_ENQUEUE(&super_pkts, spkt);
1909 			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
1910 			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1911 			spkt->pkt_policy_id = fe->fe_policy_id;
1912 			spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1913 			spkt->pkt_transport_protocol =
1914 			    fe->fe_transport_protocol;
1915 			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1916 		}
1917 next:
1918 		pkt_agg_log(pkt, kernproc, true);
1919 		prev_csum_ok = csum_ok;
1920 		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1921 	}
1922 
1923 	/* Free unused buflets */
1924 	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
1925 	while (bh_cnt > 0) {
		/* -fbounds-safety: buf_arr[iter] is a uint64_t, so forging it */
1927 		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
1928 		    buf_arr[iter]));
1929 		buf_arr[iter] = 0;
1930 		bh_cnt--;
1931 		iter++;
1932 	}
1933 	/* Finalize the last super packet */
1934 	if (sph != 0) {
1935 		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
1936 		    &spkts, bufcnt);
1937 	}
1938 	converge_aggregation_size(fe, largest_spkt);
1939 	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
1940 	if (__improbable(is_mbuf)) {
1941 		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
1942 	} else {
1943 		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
1944 	}
1945 	FLOW_STATS_IN_ADD(fe, spackets, spkts);
1946 
1947 	KPKTQ_FINI(rx_pkts);
1948 
1949 	if (KPKTQ_LEN(&super_pkts) > 0) {
1950 		fsw_ring_enqueue_tail_drop(fsw, ring, &super_pkts);
1951 	}
1952 	KPKTQ_FINI(&super_pkts);
1953 
1954 	pp_free_pktq(&disposed_pkts);
1955 }
1956 
1957 /* streamline a smbuf */
1958 static bool
_finalize_smbuf(struct mbuf * smbuf)1959 _finalize_smbuf(struct mbuf *smbuf)
1960 {
1961 	/* the 1st mbuf always contains something, so start with the 2nd one */
1962 	struct mbuf *m_chained = smbuf->m_next;
1963 	struct mbuf *prev_m = smbuf;
1964 	bool freed = false;
1965 
1966 	while (m_chained != NULL) {
1967 		if (m_chained->m_len != 0) {
1968 			prev_m = m_chained;
1969 			m_chained = m_chained->m_next;
1970 			continue;
1971 		}
1972 		prev_m->m_next = m_chained->m_next;
1973 		m_free(m_chained);
1974 		m_chained = prev_m->m_next;
1975 		freed = true;
1976 	}
1977 	return freed;
1978 }
1979 
/*
 * flow_rx_agg_host: aggregate a TCP flow's Rx packets destined for the
 * host (BSD) stack into large "super mbuf" (smbuf) chains and pass them
 * up via fsw_host_sendup().
 *
 * Consecutive segments that pass flow_agg_is_ok() have their payload
 * appended to the current smbuf (either packed into trailing space of
 * curr_m or copied into a freshly allocated mbuf linked via m_next) and
 * their headers merged by flow_agg_merge_hdr(); segments that cannot be
 * aggregated start a new smbuf, linked to the previous one through
 * m_nextpkt.  Source packets are removed from rx_pkts and freed through
 * disposed_pkts at the end.
 *
 * fsw:      flowswitch instance (provides fsw_ifp and fsw_stats).
 * fe:       flow entry for this TCP flow.
 * rx_pkts:  queue of classified Rx packets; emptied by this function.
 * rx_bytes: total TCP payload (ulen) bytes across rx_pkts.
 * is_mbuf:  true when packets carry compat-path mbufs rather than
 *           native buflets.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
{
/* Drop one source packet: charge drop stats, free it, reset agg state */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags)    do {   \
	drop_packets++;                                                   \
	drop_bytes += (_pkt)->pkt_length;                                 \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL;                                                    \
	FLOW_AGG_CLEAR(&fa);                                              \
	prev_csum_ok = false;                                             \
} while (0)
	struct flow_agg fa;             /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct  mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf * mhead = NULL;

	/* all packets in the queue share one L2 length (asserted per-pkt below) */
	uint16_t l2len = KPKTQ_FIRST(rx_pkts)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", rx_bytes);

	/*
	 * Native path: pre-allocate a batch of mbufs up front, sized from
	 * the flow's recent largest aggregation (fe_rx_largest_size) and
	 * the average packet size; on allocation failure retry with
	 * progressively smaller cluster sizes (see try_again below).
	 */
	if (__probable(!is_mbuf)) {
		/*
		 *  Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(rx_pkts);

			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    rx_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    rx_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    rx_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (rx_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
				    MCLBYTES, mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			/* downgrade cluster size step by step until alloc succeeds */
			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		/* prefetch the next packet's buffer while handling this one */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below.  This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *__single m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			/* compat path: reuse the mbuf already attached to pkt */
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* make L2+L3+L4 headers contiguous in the first mbuf */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else {                /* native */
			/*
			 * pad rounds l2len up to a 4-byte multiple; presumably
			 * this keeps the L3 header 4-byte aligned given an
			 * aligned mbuf data start — TODO confirm
			 */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			/* fast path: try to pack payload into curr_m's tail space */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try blocking allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
			} else {
				/* consume one mbuf from the pre-allocated batch */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				/* collect the destination mbuf segments */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				/* collect the destination mbuf segments */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room in front for the L2 header (copied below) */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 *  Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		/* compat path decides aggregability only after mbuf_csum() above */
		if (__improbable(is_mbuf)) {
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			/* append m to the current smbuf's m_next chain */
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4  hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* start a new super mbuf with m as its head */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m_mtod_current(m), l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) a smbuf only if it pre-allocated >1 segments,
				 * which only happens when mhead_bufsize > M16KCLBYTES
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * If the super packet is an mbuf which can't accommodate
			 * sizeof(struct ip_tcp_mask) or sizeof(struct ip6_tcp_mask)
			 * in a single buffer, then do the aggregation check in slow path.
			 * Note that on Intel platforms, an mbuf without cluster
			 * has only 80 bytes available for data. That means if a
			 * packet contains an Ethernet header, the mbuf won't be
			 * able to fully contain "struct ip_tcp_mask" or
			 * "struct ip6_tcp_mask" data in a single buffer, because
			 * sizeof(struct ip_tcp_mask) and sizeof(struct ip6_tcp_mask)
			 * are all 80 bytes as well.
			 */
			if (__improbable(smbuf->m_len <
			    ((m_mtod_current(smbuf) - (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + MASK_SIZE))) {
				fa.fa_sobj_is_short = true;
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(rx_pkts);

	/* Free any leftover mbufs, true only for native  */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES)
		 * but is not (smbuf_finalized < smbufs), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2524 
2525 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,uint32_t flags)2526 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
2527     struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
2528 {
2529 #pragma unused(flags)
2530 	struct pktq dropped_pkts;
2531 	bool is_mbuf;
2532 
2533 	if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) {
2534 		dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, FLOW_PROC_FLAG_FRAGMENTS);
2535 		return;
2536 	}
2537 
2538 	KPKTQ_INIT(&dropped_pkts);
2539 
2540 	if (!dp_flow_rx_route_process(fsw, fe)) {
2541 		SK_ERR("Rx route bad");
2542 		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
2543 		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2544 		    KPKTQ_LEN(&dropped_pkts));
2545 		pp_drop_pktq(&dropped_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
2546 		    DROP_REASON_FSW_FLOW_NONVIABLE, __func__, __LINE__);
2547 		return;
2548 	}
2549 
2550 	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(rx_pkts)));
2551 
2552 	if (fe->fe_nx_port == FSW_VP_HOST) {
2553 		boolean_t do_rx_agg;
2554 
2555 		/* BSD flow */
2556 		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2557 			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2558 			    SK_FSW_RX_AGG_TCP_HOST_ON);
2559 		} else {
2560 			do_rx_agg = !dlil_has_ip_filter() &&
2561 			    !dlil_has_if_filter(fsw->fsw_ifp);
2562 		}
2563 		if (__improbable(!do_rx_agg)) {
2564 			fsw_host_rx(fsw, rx_pkts);
2565 			return;
2566 		}
2567 		if (__improbable(pktap_total_tap_count != 0)) {
2568 			fsw_snoop(fsw, fe, rx_pkts, true);
2569 		}
2570 		flow_rx_agg_host(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
2571 	} else {
2572 		/* channel flow */
2573 		if (__improbable(pktap_total_tap_count != 0)) {
2574 			fsw_snoop(fsw, fe, rx_pkts, true);
2575 		}
2576 		flow_rx_agg_channel(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
2577 	}
2578 }
2579