1 /*
2 * Copyright (c) 2019-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40
/* Upper bound on destination buffers carried in a _dbuf_array (below) */
#define MAX_BUFLET_COUNT (64)
/*
 * TCP flags relevant to aggregation decisions; not used in this chunk —
 * presumably segments carrying any of these are not aggregated (confirm
 * against the callers further down the file).
 */
#define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* Packet's data is backed by an attached mbuf (compat netif path) */
#define PKT_IS_MBUF(_pkt) (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
/* Packet buflet holds only a truncated copy; full data lives in the mbuf */
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \
    (_pkt->pkt_pflags & PKT_F_TRUNCATED))
46
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation
 * states.  It is deliberately exactly 32 bytes: FLOW_AGG_CLEAR() zeroes
 * it with sk_zero_32(), and the anonymous 4 x uint64_t union arm both
 * documents and enforces that size.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *_fa_sobj;
				struct mbuf *_fa_smbuf;         /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr;     /* ptr to super IP header */
			bool _fa_sobj_is_pkt;  /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool _fa_sobj_is_short;
			uint32_t _fa_tcp_seq;  /* expected next sequence # */
			uint32_t _fa_ulen;     /* expected next ulen */
			uint32_t _fa_total;    /* total aggregated bytes */
		} __flow_agg;
		uint64_t __flow_agg_data[4];
	};
/* shorthand accessors for the nested anonymous-union fields */
#define fa_sobj __flow_agg._fa_sobj
#define fa_smbuf __flow_agg._fa_smbuf
#define fa_spkt __flow_agg._fa_spkt
#define fa_sptr __flow_agg._fa_sptr
#define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short __flow_agg._fa_sobj_is_short
#define fa_tcp_seq __flow_agg._fa_tcp_seq
#define fa_ulen __flow_agg._fa_ulen
#define fa_total __flow_agg._fa_total
};
81
/*
 * Reset all aggregation state with a single 32-byte zeroing store.
 * The _CASSERT pins the struct size so sk_zero_32() stays correct if
 * fields are ever added to struct flow_agg.
 */
#define FLOW_AGG_CLEAR(_fa) do { \
	_CASSERT(sizeof(struct flow_agg) == 32); \
	sk_zero_32(_fa); \
} while (0)
86
/*
 * Size in bytes of the masked header-compare window used by the
 * aggregation fast path; both mask structs below are padded to exactly
 * this size (see the _CASSERTs in can_agg_fastpath()).
 */
#define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */

/* IPv4 + TCP compare mask: 20 (ip) + 20 (tcp) + 40 (max options) = 80 */
struct ip_tcp_mask {
	struct ip ip_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
94
/*
 * Byte mask applied when comparing the super packet's IPv4+TCP header
 * against a candidate segment's header (ipv4_tcp_memcmp()): bytes whose
 * mask is all-ones must match exactly; zeroed bytes are don't-care.
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		/* Not checked; IP ID differs per datagram */
		.ip_id = 0,
		/* whole field (flags + fragment offset) must match */
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		/* Not checked; each header carries its own checksum */
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		/* Not checked here; th_seq advances with every segment */
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		/* all flags except PSH must match */
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		/* Not checked; each segment carries its own checksum */
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};
138
/* IPv6 + TCP compare mask: 40 (ip6) + 20 (tcp) + 20 (pad) = 80 bytes */
struct ip6_tcp_mask {
	struct ip6_hdr ip6_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};
144
145 static const struct ip6_tcp_mask ip6_tcp_mask
146 __sk_aligned(16) =
147 {
148 .ip6_m = {
149 .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
150 /* Not checked; aggregated packet's ip_len is increasing */
151 .ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
152 .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
153 .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
154 .ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
155 .ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
156 .ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
157 .ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
158 .ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
159 .ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
160 .ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
161 .ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
162 },
163 .tcp_m = {
164 .th_sport = 0xffff,
165 .th_dport = 0xffff,
166 .th_seq = 0,
167 .th_ack = 0xffffffff,
168 .th_x2 = 0xf,
169 .th_off = 0xf,
170 .th_flags = ~TH_PUSH,
171 .th_win = 0xffff,
172 .th_sum = 0,
173 .th_urp = 0xffff,
174 },
175 .tcp_option_m = {
176 /* Max 40 bytes of TCP options */
177 0xffffffff,
178 0xffffffff,
179 0xffffffff,
180 0, /* Filling up to MASK_SIZE */
181 0, /* Filling up to MASK_SIZE */
182 },
183 };
184
185
#if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/*
	 * Debug-log one packet involved in RX aggregation: its length,
	 * RX checksum metadata, and — for destination (super) packets —
	 * a hex dump of each buflet.  is_input selects the "s"/"d"
	 * (source/destination) prefix in the log lines.
	 */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	/* only destination packets may span multiple buflets here */
	if (!is_input) {
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
			    pkt->pkt_length, 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}

/* Gate on sk_verbose so the helper costs nothing when logging is quiet */
#define pkt_agg_log(_pkt, _p, _is_input) do { \
	if (__improbable(sk_verbose != 0)) { \
		_pkt_agg_log(_pkt, _p, _is_input); \
	} \
} while (0)
227
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/*
	 * Debug-log a destination mbuf: packet length, RX checksum
	 * metadata, and a hex dump of the first mbuf's data.
	 */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m->m_data != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
}

/* Gate on sk_verbose so the helper costs nothing when logging is quiet */
#define mbuf_agg_log(_m, _p, _is_mbuf) do { \
	if (__improbable(sk_verbose != 0)) { \
		_mbuf_agg_log(_m, _p, _is_mbuf); \
	} \
} while (0)
254
255 SK_LOG_ATTRIBUTE
256 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)257 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
258 {
259 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
260 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
261
262 while (m != NULL) {
263 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
264 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
265 m->m_pkthdr.len);
266
267 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
268 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
269 (uint32_t)m->m_pkthdr.csum_rx_val);
270
271 m = m->m_nextpkt;
272 }
273 }
274
275 #define mchain_agg_log(_m, _p, _is_mbuf) do { \
276 if (__improbable(sk_verbose != 0)) { \
277 _mchain_agg_log(_m, _p, _is_mbuf); \
278 } \
279 } while (0)
280 #else
281 #define pkt_agg_log(...)
282 #define mbuf_agg_log(...)
283 #define mchain_agg_log(...)
284 #endif /* SK_LOG */
285
/*
 * Verify/compute checksums for a packet whose data is backed by an mbuf.
 *
 * pkt:       flow-classified packet supplying header lengths/addresses.
 * m:         mbuf holding L2 + IP hdr + TCP hdr + payload
 *            (its total length must equal the classified lengths).
 * verify_l3: also software-verify the IPv4 header checksum.
 * data_csum: out — 16-bit folded checksum of the TCP payload only
 *            (used later when folding segments into a super object).
 *
 * Returns true when the checksums verify, false otherwise.  On the
 * software path, the mbuf's RX checksum metadata is rewritten to a
 * full pseudo-header checksum.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total packet length: L2 + IP hdr + TCP hdr + payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	/* L4 length (TCP header + payload) for the pseudo-header sum */
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	/* offset of the IP header inside the mbuf */
	uint16_t start = pkt->pkt_l2_len;
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/*
		 * HW already validated the full TCP checksum, but we still
		 * need the payload-only checksum.  Derive it from the wire
		 * th_sum by summing just the TCP header + pseudo-header and
		 * backing that out of th_sum via __packet_fix_sum().
		 */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* zero th_sum so it doesn't fold into the header sum */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;
		/* a valid full checksum folds to 0xffff */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		/* software-verify the IPv4 header checksum */
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	/* pseudo-header contribution: L4 length + protocol */
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	/* Set start to 0 for full checksum */
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
386
/*
 * structure to pass an array of data buffers — the destinations that
 * copy/checksum routines append into.  dba_is_buflet selects which arm
 * of the union is live.
 */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;	/* number of valid entries */
	bool dba_is_buflet;	/* true: buflets, false: mbufs */
} _dbuf_array_t;
396
/*
 * Copy 'plen' bytes of 'spkt' starting at source offset 'soff' into the
 * buffer array 'dbuf', appending after any data already present in the
 * first buffer and spilling into subsequent buffers as needed.  When
 * 'do_csum' is set, the copied bytes are folded into *partial_sum, with
 * *odd_start carrying odd-byte alignment state across calls.  The
 * destination lengths (buflet dlen, or mbuf m_len/pkthdr.len) are
 * advanced as data is appended.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint16_t buf_off = 0;		/* append offset in the first buffer */
	uint16_t buflet_dlim;
	uint16_t buflet_dlen;

	ASSERT(plen > 0);
	if (!dbuf->dba_is_buflet) {
		/*
		 * Assumption about a single mbuf is being asserted due to the
		 * reason that the current usage always passes one mbuf and the
		 * routine has not been tested with multiple mbufs.
		 */
		ASSERT(dbuf->dba_num_dbufs == 1);
		ASSERT((mbuf_maxlen(dbuf->dba_mbuf[0]) -
		    dbuf->dba_mbuf[0]->m_len) >= plen);
		buf_off = dbuf->dba_mbuf[0]->m_len;
	} else {
		/*
		 * NOTE(review): dlim of buflet[0] is reused for every buflet
		 * in the loop below — assumes all buflets in the array share
		 * the same data limit; confirm against callers.
		 */
		buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[0]);
		buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[0]);
		ASSERT(buflet_dlen < buflet_dlim);
		buf_off = buflet_dlen;
	}
	while (plen > 0) {
		uint16_t tmplen;
		uint16_t dbuf_lim;
		uint8_t *dbuf_addr;

		if (dbuf->dba_is_buflet) {
			ASSERT(i < dbuf->dba_num_dbufs);
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i])
			    == 0);
			dbuf_addr =
			    kern_buflet_get_data_address(dbuf->dba_buflet[i]);
			dbuf_lim = buflet_dlim - buf_off;
		} else {
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			dbuf_lim = mbuf_maxlen(dbuf->dba_mbuf[i]) - buf_off;
		}
		dbuf_addr += buf_off;
		tmplen = min(plen, dbuf_lim);
		/* truncated-mbuf packets are copied from the mbuf instead */
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		/* account the appended bytes in the destination */
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[i]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		/* subsequent buffers are filled from their start */
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
470
/*
 * Copy (fill) and checksum for packet, "packed" mode: only the TCP
 * payload is appended to the destination — the super object already
 * carries the IP and TCP headers.
 *
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet buflet.
 * dbuf: additional destination data buffer(s), used when current destination
 *       packet is out of space.
 * added: out — amount of data copied from spkt to the additional buffers.
 * data_csum: out — 16-bit folded partial checksum of the copied TCP payload.
 *
 * Returns true if the relevant checksums verify; on failure all
 * destination length changes are rolled back.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;		/* saved for rollback on failure */
	uint32_t curr_trailing;		/* free space left in currm/currp */
	char *curr_ptr;			/* append position in currm/currp */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	/* Snapshot the destination tail so we can append and roll back */
	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* L4 must be software-verified unless HW supplied a full csum */
	verify_l4 = ((spkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS) !=
	    PACKET_CSUM_RX_FULL_FLAGS);

	/* Copy & verify TCP checksum; headers themselves are not copied */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		/* mbuf offsets are relative to the IP header */
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		/* packet offsets include headroom + L2 */
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	/* commit the new destination length */
	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restored to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
662
/*
 * Copy and checksum an entire packet (IP hdr + TCP hdr + payload) into
 * 'dbuf', for plain packets or packets backed by an mbuf.
 * data_csum is only supported for bsd flows.
 *
 * pkt:  source flow-classified packet.
 * plen: bytes to copy (IP hdr + TCP hdr + payload).
 * dbuf: destination buffer array; data is appended after existing data.
 * data_csum: out — 16-bit folded checksum of the TCP payload only.
 * verify_l3: verify the IPv4 header checksum while copying.
 *
 * Returns true if the checksums verify, false otherwise.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;

	/* destination write pointer: append after any existing data */
	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		ASSERT(mbuf_maxlen(dbuf->dba_mbuf[0]) >= plen);
	}

	/* Some compat drivers compute full checksum */
	if (PKT_IS_MBUF(pkt) && ((pkt->pkt_mbuf->m_pkthdr.csum_flags &
	    CSUM_RX_FULL_FLAGS) == CSUM_RX_FULL_FLAGS)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    pkt->pkt_mbuf->m_pkthdr.csum_flags,
		    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		/* pkt metadata will be transferred to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* truncated-mbuf offsets are relative to the IP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* packet offsets additionally include headroom + L2 */
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* account the copied headers in the first destination buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	/* bad IPv4 header checksum: data was copied, but report failure */
	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	/* pseudo-header contribution: L4 length + protocol */
	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transferred to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
820
821 SK_INLINE_ATTRIBUTE
822 static void
flow_agg_init_common(struct flow_agg * fa,struct __kern_packet * pkt)823 flow_agg_init_common(struct flow_agg *fa, struct __kern_packet *pkt)
824 {
825 switch (pkt->pkt_flow_ip_ver) {
826 case IPVERSION:
827 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
828 return;
829 }
830 break;
831 case IPV6_VERSION:
832 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
833 return;
834 }
835 break;
836 default:
837 VERIFY(0);
838 /* NOTREACHED */
839 __builtin_unreachable();
840 }
841
842 fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
843 fa->fa_ulen = pkt->pkt_flow_ulen;
844 fa->fa_total = pkt->pkt_flow_ip_hlen +
845 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
846 }
847
848 static void
flow_agg_init_smbuf(struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)849 flow_agg_init_smbuf(struct flow_agg *fa, struct mbuf *smbuf,
850 struct __kern_packet *pkt)
851 {
852 FLOW_AGG_CLEAR(fa);
853
854 ASSERT(smbuf != NULL);
855 fa->fa_smbuf = smbuf;
856
857 fa->fa_sptr = mtod(smbuf, uint8_t *);
858 ASSERT(fa->fa_sptr != NULL);
859
860 /*
861 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
862 * contents of the flow structure which don't exist in 'smbuf'.
863 */
864 flow_agg_init_common(fa, pkt);
865 }
866
867 static void
flow_agg_init_spkt(struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)868 flow_agg_init_spkt(struct flow_agg *fa, struct __kern_packet *spkt,
869 struct __kern_packet *pkt)
870 {
871 FLOW_AGG_CLEAR(fa);
872
873 ASSERT(spkt != NULL);
874 fa->fa_spkt = spkt;
875 fa->fa_sobj_is_pkt = true;
876 VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
877
878 MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
879 ASSERT(fa->fa_sptr != NULL);
880
881 /*
882 * Note here we use 'pkt' instead of 'spkt', since we rely on the
883 * contents of the flow structure which don't exist in 'spkt'.
884 */
885 flow_agg_init_common(fa, pkt);
886 }
887
888 SK_INLINE_ATTRIBUTE
889 static bool
ipv4_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)890 ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
891 {
892 return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
893 }
894
895 SK_INLINE_ATTRIBUTE
896 static bool
ipv6_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)897 ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
898 {
899 return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
900 }
901
902 SK_INLINE_ATTRIBUTE
903 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)904 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
905 struct fsw_stats *fsws)
906 {
907 bool match;
908
909 ASSERT(fa->fa_sptr != NULL);
910 _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
911 _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
912
913 if (__improbable(pkt->pkt_length < MASK_SIZE)) {
914 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
915 goto slow_path;
916 }
917
918 if (__improbable(fa->fa_sobj_is_short)) {
919 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
920 goto slow_path;
921 }
922
923 if (__improbable(pkt->pkt_flow_tcp_hlen !=
924 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
925 goto slow_path;
926 }
927
928 switch (pkt->pkt_flow_ip_ver) {
929 case IPVERSION:
930 match = ipv4_tcp_memcmp(fa->fa_sptr,
931 (uint8_t *)pkt->pkt_flow_ip_hdr);
932 break;
933 case IPV6_VERSION:
934 match = ipv6_tcp_memcmp(fa->fa_sptr,
935 (uint8_t *)pkt->pkt_flow_ip_hdr);
936 break;
937 default:
938 VERIFY(0);
939 /* NOTREACHED */
940 __builtin_unreachable();
941 }
942
943 if (__improbable(!match)) {
944 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
945 goto slow_path;
946 }
947 if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
948 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
949 goto slow_path;
950 }
951
952 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
953 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
954 fa->fa_ulen = pkt->pkt_flow_ulen;
955 return true;
956
957 slow_path:
958 return false;
959 }
960
/*
 * Slow-path aggregation check: decide whether `pkt' can be appended to the
 * super packet currently tracked by `fa' by fully comparing the new packet's
 * IP and TCP headers against the stored headers (fa->fa_sptr).  Every
 * mismatch bumps a dedicated FSW_STATS_RX_AGG_NO_* counter and fires a
 * DTrace probe before returning false.  On success, advances the expected
 * TCP sequence number (fa_tcp_seq) and records the payload length (fa_ulen),
 * then returns true.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Stored (super packet) L3 header previously captured in fa */
	uint8_t *sl3_hdr = fa->fa_sptr;
	uint32_t sl3tlen = 0;	/* stored L3 total length (hdr + payload) */
	uint16_t sl3hlen = 0;	/* stored L3 header length */

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		/* Header lengths must be identical to merge */
		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match (RF compared as well) */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/*
		 * IPv4 options (bytes beyond the fixed header) must match
		 * byte-for-byte; the equal-hlen check above guarantees both
		 * packets carry the same number of option bytes.
		 */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;

	uint16_t sl4hlen = (stcp->th_off << 2);
	/*
	 * memcmp (rather than 32-bit loads) because only 16-bit alignment
	 * is guaranteed here (see asserts above).
	 * NOTE(review): this probe reuses the name aggr__fail9 (already used
	 * for the IPv6 hop-limit mismatch above), which makes the two exit
	 * paths indistinguishable under DTrace — consider a unique name.
	 */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* All TCP flags except PUSH must match (PUSH is merged later) */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/*
			 * Read the first 4 option bytes of both headers;
			 * use bcopy when 32-bit alignment cannot be assumed.
			 */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/*
			 * Both must carry the canonical NOP,NOP,TSTAMP
			 * prelude; anything else means a differing non-
			 * timestamp option, so don't aggregate.
			 */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* Headers compatible: advance expected seq and remember ulen */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1150
/*
 * Top-level aggregation predicate: returns true if `pkt' may be merged into
 * the super packet tracked by `fa'.  Performs cheap flow-level checks first
 * (flags, payload, sequence number, size limit, wake packet), then tries the
 * fast path (can_agg_fastpath) and finally the full header comparison
 * (can_agg_slowpath).  Sets pkt_flow_tcp_agg_fast when the fast path
 * succeeded, which lets flow_agg_merge_hdr skip re-comparing TCP options.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* Clear any stale fast-path marker before re-evaluating this pkt */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	/* fa_sobj == NULL means no super packet yet (case 3) */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	/* Out-of-order or retransmitted segment: not contiguous (case 4) */
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	/* Wake packets must head their own super packet (case 6) */
	if (__improbable((pkt->pkt_pflags & PKT_F_WAKE_PKT) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	/* Fast path missed; do the full header comparison */
	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1210
/*
 * Merge `pkt' into the super packet tracked by `fa', whose aggregability was
 * already established by flow_agg_is_ok().  Updates the stored IP and TCP
 * headers in place (lengths, PUSH flag, TCP timestamps) using incremental
 * checksum fix-ups (__packet_fix_sum / __packet_fix_hdr_sum), folds the new
 * payload checksum `data_csum' into th_sum, and bumps the length/segment
 * accounting on either the super __kern_packet or the super mbuf depending
 * on fa_sobj_is_pkt.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;	/* L3 total length before this merge */
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* Incrementally patch ip_sum for the grown ip_len */
		siph->ip_sum = __packet_fix_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		/*
		 * Slow-path merge with differing options: can_agg_slowpath
		 * only allows this for the NOP,NOP,TSTAMP layout, so the
		 * option bytes at +4/+8 are TSval/TSecr — copy the new
		 * timestamps over and patch th_sum incrementally.
		 */
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			__packet_fix_hdr_sum(sopt + 4, &stcp->th_sum, ntsval);
			__packet_fix_hdr_sum(sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			/*
			 * th_flags lives in the 16-bit word right after
			 * th_ack (together with th_off/th_x2); snapshot that
			 * word before and after to fix the checksum.
			 */
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = __packet_fix_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum (TCP length grew by ulen) */
	stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		/* Saturate rather than wrap the 8-bit segment counter */
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		/* Saturate rather than wrap the segment counter */
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1371
1372 /*
1373 * Copy metadata from source packet to destination packet
1374 */
1375 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1376 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1377 {
1378 /* Copy packet metadata */
1379 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1380 _PKT_COPY(spkt, dpkt);
1381 }
1382
1383 static void
pkt_finalize(kern_packet_t ph)1384 pkt_finalize(kern_packet_t ph)
1385 {
1386 int err = __packet_finalize(ph);
1387 VERIFY(err == 0);
1388 #if (DEVELOPMENT || DEBUG)
1389 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1390 uint8_t *buf;
1391 MD_BUFLET_ADDR_ABS(pkt, buf);
1392 buf += pkt->pkt_headroom + pkt->pkt_l2_len;
1393 DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
1394 uint8_t *, buf);
1395 #endif
1396 }
1397
1398 SK_INLINE_ATTRIBUTE
1399 static inline uint32_t
_estimate_buflet_cnt(struct flow_entry * fe,struct kern_pbufpool * pp)1400 _estimate_buflet_cnt(struct flow_entry *fe, struct kern_pbufpool *pp)
1401 {
1402 uint32_t cnt;
1403
1404 _CASSERT(MAX_BUFLET_COUNT <= UINT8_MAX);
1405 cnt = howmany(((fe->fe_rx_pktq_bytes + sizeof(struct ip6_hdr)) +
1406 sizeof(struct tcphdr)), pp->pp_buflet_size);
1407 cnt = MAX(KPKTQ_LEN(&fe->fe_rx_pktq), cnt);
1408 cnt = MIN(cnt, MAX_BUFLET_COUNT);
1409 return cnt;
1410 }
1411
1412 SK_INLINE_ATTRIBUTE
1413 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1414 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1415 _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1416 {
1417 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1418 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1419 VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1420 pbuf = buf;
1421 dbuf_array->dba_buflet[i] = NULL;
1422 }
1423 ASSERT(pbuf != NULL);
1424 dbuf_array->dba_num_dbufs = 0;
1425 *lbuf = pbuf;
1426 }
1427
1428 SK_INLINE_ATTRIBUTE
1429 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1430 _free_dbuf_array(struct kern_pbufpool *pp,
1431 _dbuf_array_t *dbuf_array)
1432 {
1433 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1434 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1435 pp_free_buflet(pp, buf);
1436 dbuf_array->dba_buflet[i] = NULL;
1437 }
1438 dbuf_array->dba_num_dbufs = 0;
1439 }
1440
/*
 * Aggregate the TCP packets queued on `fe' into multi-buflet super packets
 * allocated from the destination ring's pool, then enqueue them on the
 * flow's Rx channel ring.  Source packets whose flow tracking demands an
 * RST are diverted to the host flow entry; unusable ones go to
 * `dropped_pkts'.  `is_mbuf' indicates compat (mbuf-backed) source packets.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/* Drop a source packet and reset aggregation state (fa, prev_csum_ok) */
#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt) do {                  \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));                         \
	(_pkt) = NULL;                                               \
	FLOW_AGG_CLEAR(&fa);                                         \
	prev_csum_ok = false;                                        \
} while (0)
	struct flow_agg fa;	/* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq pkts;	/* dst super packets */
	struct pktq disposed_pkts;	/* done src packets */

	KPKTQ_INIT(&pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		/* no destination ring: drop the whole input queue */
		SK_ERR("Rx ring is NULL");
		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(dropped_pkts));
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	/* aggregation requires multi-buflet capable destination pool */
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t sbuf = NULL;	/* last buflet of current super pkt */
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/*
	 * State for buflet batch alloc: buf_arr[iter..] holds bh_cnt
	 * not-yet-consumed buflets.
	 */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint8_t iter = 0;
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));

	/* Pre-allocate the estimated buflet demand in one batch */
	bh_cnt_tmp = bh_cnt = _estimate_buflet_cnt(fe, dpp);
	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* warm the cache for the next packet's buffer */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}

		if (is_mbuf) {	/* compat */
			/* strip L2 header; super packets start at L3 */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
		}

		/*
		 * Try to pack into the current super packet's last buflet
		 * first; only valid if the previous packet's checksum was
		 * good (otherwise the merged headers would be stale).
		 */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			/* respect the pool's per-packet buflet limit */
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			/* does the payload fit in sbuf's remaining space? */
			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just
				 * append to curr_m.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, dpp->pp_buflet_size);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}
		/* replenish the buflet stash if it can't cover this pkt */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = _estimate_buflet_cnt(fe, dpp);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    (kern_buflet_t)(buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			/* aggregatable, but needs fresh buflets appended */
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						    dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in curr_m, thus we must
			 * have added to m->m_data.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			/* standalone copy: headers + payload into new bufs */
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				spkts++;
				if (bufcnt > 1) {
					spkt->pkt_aggr_type =
					    PKT_AGGR_SINGLE_IP;
				}
				pkt_finalize(sph);
				pkt_agg_log(spkt, kernproc, false);
				DTRACE_SKYWALK1(aggr__buflet__count, uint16_t,
				    bufcnt);
				sph = 0;
				spkt = NULL;
				FLOW_AGG_CLEAR(&fa);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&pkts, spkt);
			/* carry flow/policy identity onto the super pkt */
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			/* remember this pkt's headers for future merges */
			flow_agg_init_spkt(&fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	while (bh_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		spkts++;
		if (bufcnt > 1) {
			spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		pkt_finalize(sph);
		pkt_agg_log(spkt, kernproc, false);
		DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
		sph = 0;
		spkt = NULL;
		FLOW_AGG_CLEAR(&fa);
	}
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	/* replace the input queue with the super packets and deliver */
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
	KPKTQ_FINI(&pkts);

	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);

	/* consumed source packets can now be freed */
	pp_free_pktq(&disposed_pkts);
}
1765
1766 SK_NO_INLINE_ATTRIBUTE
1767 static void
flow_rx_agg_host(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * dropped_pkts,bool is_mbuf)1768 flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
1769 struct pktq *dropped_pkts, bool is_mbuf)
1770 {
1771 #define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt) do { \
1772 drop_packets++; \
1773 drop_bytes += (_pkt)->pkt_length; \
1774 KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \
1775 (_pkt) = NULL; \
1776 FLOW_AGG_CLEAR(&fa); \
1777 prev_csum_ok = false; \
1778 } while (0)
1779 struct flow_agg fa; /* states */
1780 FLOW_AGG_CLEAR(&fa);
1781
1782 struct pktq disposed_pkts; /* done src packets */
1783 KPKTQ_INIT(&disposed_pkts);
1784
1785 int alloced = 0;
1786 int factor;
1787
1788 struct __kern_packet *pkt, *tpkt;
1789 /* points to the first mbuf of chain */
1790 struct mbuf *m_chain = NULL;
1791 /* super mbuf, at the end it points to last mbuf packet */
1792 struct mbuf *smbuf = NULL, *curr_m = NULL;
1793 bool prev_csum_ok = false, csum_ok, agg_ok;
1794 uint16_t smbufs = 0;
1795 uint32_t bytes = 0, rcvd_ulen = 0;
1796 uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
1797 uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
1798 uint32_t largest_smbuf = 0;
1799 int err = 0;
1800
1801 struct fsw_stats *fsws = &fsw->fsw_stats;
1802 bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1803
1804 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1805
1806 /* state for mbuf batch alloc */
1807 uint32_t mhead_cnt;
1808 uint32_t mhead_bufsize;
1809 struct mbuf * mhead = NULL;
1810
1811 uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;
1812
1813 SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);
1814
1815 if (__probable(!is_mbuf)) {
1816 uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
1817
1818 /*
1819 * Batch mbuf alloc is based on
1820 * convert_native_pkt_to_mbuf_chain
1821 */
1822 if (__probable(fe->fe_rx_largest_msize != 0 &&
1823 max_ip_len > 0)) {
1824 unsigned int one;
1825 int wait;
1826
1827 if (fe->fe_rx_largest_msize <= MCLBYTES) {
1828 mhead_bufsize = MCLBYTES;
1829 } else if (fe->fe_rx_largest_msize <= MBIGCLBYTES) {
1830 mhead_bufsize = MBIGCLBYTES;
1831 } else {
1832 mhead_bufsize = M16KCLBYTES;
1833 }
1834
1835 try_again:
1836 if (fe->fe_rx_pktq_bytes != 0) {
1837 uint32_t aggregation_size =
1838 MAX(fe->fe_rx_largest_msize, MCLBYTES);
1839
1840 aggregation_size =
1841 MIN(aggregation_size, mhead_bufsize);
1842
1843 factor = (fe->fe_rx_pktq_bytes / max_ip_len) *
1844 (MAX(sizeof(struct ip),
1845 sizeof(struct ip6_hdr)) +
1846 sizeof(struct tcphdr));
1847
1848 mhead_cnt = MAX(((fe->fe_rx_pktq_bytes +
1849 factor) / aggregation_size) + 1, 1);
1850 } else {
1851 /* No payload, thus it's all small-sized ACKs/... */
1852 mhead_bufsize = MHLEN;
1853 mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
1854 }
1855
1856 one = 1;
1857
1858 if (mhead_bufsize >= MBIGCLBYTES) {
1859 wait = M_NOWAIT;
1860 } else {
1861 wait = M_WAITOK;
1862 }
1863
1864 mhead = m_allocpacket_internal(&mhead_cnt,
1865 mhead_bufsize, &one, wait, 1, 0);
1866
1867 if (mhead == NULL) {
1868 if (mhead_bufsize == M16KCLBYTES) {
1869 mhead_bufsize = MBIGCLBYTES;
1870 goto try_again;
1871 }
1872
1873 if (mhead_bufsize == MBIGCLBYTES) {
1874 mhead_bufsize = MCLBYTES;
1875 goto try_again;
1876 }
1877 }
1878 } else {
1879 mhead = NULL;
1880 mhead_bufsize = mhead_cnt = 0;
1881 }
1882 SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
1883 mhead_bufsize);
1884 }
1885
1886 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1887 if (tpkt != NULL) {
1888 void *baddr;
1889 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1890 SK_PREFETCH(baddr, 0);
1891 }
1892
1893 /* Validate l2 len, ip vers, is_mbuf */
1894 ASSERT(pkt->pkt_l2_len == l2len);
1895 ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1896 ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1897 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
1898 ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1899 ASSERT(!pkt->pkt_flow_ip_is_frag);
1900 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1901
1902 csum_ok = false;
1903 agg_ok = false;
1904 /*
1905 * As we only agg packets with same hdr length,
1906 * leverage the pkt metadata
1907 */
1908 uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1909 pkt->pkt_flow_tcp_hlen);
1910 uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1911
1912 /*
1913 * Rather than calling flow_pkt_track() for each
1914 * packet here, we accumulate received packet stats
1915 * for the call to flow_track_stats() below. This
1916 * is because flow tracking is a no-op for traffic
1917 * that belongs to the host stack.
1918 */
1919 rcvd_ulen += pkt->pkt_flow_ulen;
1920 rcvd_bytes += pkt->pkt_length;
1921 rcvd_packets++;
1922
1923 KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1924 fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1925
1926 /* packet is for BSD flow, create a mbuf chain */
1927 uint32_t len = (l2len + plen);
1928 uint16_t data_csum = 0;
1929 struct mbuf *m;
1930 if (__improbable(is_mbuf)) {
1931 m = pkt->pkt_mbuf;
1932 /* Detach mbuf from source pkt */
1933 KPKT_CLEAR_MBUF_DATA(pkt);
1934
1935 uint32_t trailer = (m_pktlen(m) - len);
1936 ASSERT((uint32_t)m_pktlen(m) >= plen);
1937 /* Remove the trailer */
1938 if (trailer > 0) {
1939 m_adj(m, -trailer);
1940 }
1941 /* attached mbuf is already allocated */
1942 csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
1943 } else { /* native */
1944 uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
1945 l2len;
1946 uint32_t tot_len = (len + pad);
1947 /* remember largest aggregated packet size */
1948 if (smbuf) {
1949 if (largest_smbuf < (uint32_t)m_pktlen(smbuf)) {
1950 largest_smbuf =
1951 (uint32_t)m_pktlen(smbuf);
1952 }
1953 }
1954
1955 if (prev_csum_ok && curr_m) {
1956 ASSERT(fa.fa_smbuf == smbuf);
1957 ASSERT(!fa.fa_sobj_is_pkt);
1958 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1959
1960 if (agg_ok &&
1961 M_TRAILINGSPACE(curr_m) >= plen - thlen) {
1962 /*
1963 * No need for a new mbuf,
1964 * just append to curr_m.
1965 */
1966 csum_ok = copy_pkt_csum_packed(pkt,
1967 plen, NULL, is_ipv4, curr_m, NULL,
1968 &data_csum, NULL);
1969
1970 if (!csum_ok) {
1971 STATS_INC(fsws,
1972 FSW_STATS_RX_AGG_BAD_CSUM);
1973 SK_ERR("Checksum for "
1974 "aggregation is wrong");
1975 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
1976 /*
1977 * Turns out, checksum is wrong!
1978 * Fallback to no-agg mode.
1979 */
1980 agg_ok = 0;
1981 } else {
1982 /*
1983 * We only added payload,
1984 * thus -thlen.
1985 */
1986 bytes += (plen - thlen);
1987 flow_agg_merge_hdr(&fa, pkt,
1988 data_csum, fsws);
1989 goto next;
1990 }
1991 }
1992 }
1993
1994 /*
1995 * If the batch allocation returned partial success,
1996 * we try blocking allocation here again
1997 */
1998 m = mhead;
1999 if (__improbable(m == NULL ||
2000 tot_len > mhead_bufsize)) {
2001 unsigned int one = 1;
2002
2003 ASSERT(mhead_cnt == 0 || mhead != NULL);
2004 err = mbuf_allocpacket(MBUF_WAITOK, tot_len,
2005 &one, &m);
2006 if (err != 0) {
2007 STATS_INC(fsws,
2008 FSW_STATS_RX_DROP_NOMEM_BUF);
2009 SK_ERR("mbuf alloc failed (err %d)",
2010 err);
2011 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2012 continue;
2013 }
2014 alloced++;
2015 } else {
2016 ASSERT(mhead_cnt > 0);
2017 mhead = m->m_nextpkt;
2018 m->m_nextpkt = NULL;
2019 mhead_cnt--;
2020 }
2021 m->m_data += pad;
2022 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2023
2024 /*
2025 * copy and checksum l3, l4 and payload
2026 * l2 header is copied later only if we
2027 * can't agg as an optimization
2028 */
2029 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
2030 _dbuf_array_t dbuf_array = {.dba_is_buflet = false};
2031 if (agg_ok) {
2032 int added = 0;
2033 dbuf_array.dba_mbuf[0] = m;
2034 dbuf_array.dba_num_dbufs = 1;
2035 csum_ok = copy_pkt_csum_packed(pkt, plen,
2036 &dbuf_array, is_ipv4, curr_m, NULL,
2037 &data_csum, &added);
2038
2039 if (!csum_ok) {
2040 STATS_INC(fsws,
2041 FSW_STATS_RX_AGG_BAD_CSUM);
2042 SK_ERR("Checksum for aggregation "
2043 "on new mbuf is wrong");
2044 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
2045 agg_ok = false;
2046 goto non_agg;
2047 }
2048
2049 /*
2050 * There was not enough space in curr_m,
2051 * thus we must have added to m->m_data.
2052 */
2053 VERIFY(added > 0);
2054 VERIFY(m->m_len == m->m_pkthdr.len &&
2055 (uint32_t)m->m_len <=
2056 (uint32_t)mbuf_maxlen(m));
2057
2058 /*
2059 * We account for whatever we added
2060 * to m later on, thus - added.
2061 */
2062 bytes += plen - thlen - added;
2063 } else {
2064 non_agg:
2065 dbuf_array.dba_mbuf[0] = m;
2066 dbuf_array.dba_num_dbufs = 1;
2067 m->m_len += l2len;
2068 m->m_pkthdr.len += l2len;
2069 csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
2070 &data_csum, is_ipv4);
2071 if (__improbable(!csum_ok)) {
2072 STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
2073 SK_ERR("%d incorrect csum", __LINE__);
2074 DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
2075 }
2076 VERIFY(m->m_len == m->m_pkthdr.len &&
2077 (uint32_t)m->m_len <=
2078 (uint32_t)mbuf_maxlen(m));
2079 }
2080
2081 STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
2082 STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);
2083
2084 m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
2085 m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
2086 /*
2087 * Note that these flags have same value,
2088 * except PACKET_CSUM_PARTIAL
2089 */
2090 m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
2091 PACKET_CSUM_RX_FLAGS);
2092
2093 /* Set the rcvif */
2094 m->m_pkthdr.rcvif = fsw->fsw_ifp;
2095 }
2096 ASSERT(m != NULL);
2097 ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
2098 ASSERT((m->m_flags & M_HASFCS) == 0);
2099 ASSERT(m->m_nextpkt == NULL);
2100
2101 if (__improbable(is_mbuf)) {
2102 if ((uint32_t) m->m_len < (l2len + thlen)) {
2103 m = m_pullup(m, (l2len + thlen));
2104 if (m == NULL) {
2105 STATS_INC(fsws,
2106 FSW_STATS_RX_DROP_NOMEM_BUF);
2107 SK_ERR("mbuf pullup failed (err %d)",
2108 err);
2109 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2110 continue;
2111 }
2112 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2113 }
2114 if (prev_csum_ok && csum_ok) {
2115 ASSERT(fa.fa_smbuf == smbuf);
2116 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2117 }
2118 }
2119
2120 if (agg_ok) {
2121 ASSERT(fa.fa_smbuf == smbuf);
2122 ASSERT(!fa.fa_sobj_is_pkt);
2123 if (__improbable(is_mbuf)) {
2124 bytes += (m_pktlen(m) - l2len);
2125 /* adjust mbuf by l2, l3 and l4 hdr */
2126 m_adj(m, l2len + thlen);
2127 } else {
2128 bytes += m_pktlen(m);
2129 }
2130
2131 m->m_flags &= ~M_PKTHDR;
2132 flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
2133 while (curr_m->m_next != NULL) {
2134 curr_m = curr_m->m_next;
2135 }
2136 curr_m->m_next = m;
2137 curr_m = m;
2138 m = NULL;
2139 } else {
2140 if ((uint32_t) m->m_len < l2len) {
2141 m = m_pullup(m, l2len);
2142 if (m == NULL) {
2143 STATS_INC(fsws,
2144 FSW_STATS_RX_DROP_NOMEM_BUF);
2145 SK_ERR("mbuf pullup failed (err %d)",
2146 err);
2147 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2148 continue;
2149 }
2150 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2151 }
2152
2153 /* copy l2 header for native */
2154 if (__probable(!is_mbuf)) {
2155 uint16_t llhoff = pkt->pkt_headroom;
2156 uint8_t *baddr;
2157 MD_BUFLET_ADDR_ABS(pkt, baddr);
2158 ASSERT(baddr != NULL);
2159 baddr += llhoff;
2160 pkt_copy(baddr, m->m_data, l2len);
2161 }
2162 /* adjust mbuf by l2 hdr */
2163 m_adj(m, l2len);
2164 bytes += m_pktlen(m);
2165
2166 /*
2167 * aggregated packets can be skipped by pktap because
2168 * the original pre-aggregated chain already passed through
2169 * pktap (see fsw_snoop()) before entering this function.
2170 */
2171 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
2172
2173 if (m_chain == NULL) {
2174 /* this is the start of the chain */
2175 m_chain = m;
2176 smbuf = m;
2177 curr_m = m;
2178 } else if (smbuf != NULL) {
2179 /*
2180 * set m to be next packet
2181 */
2182 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2183 smbuf->m_nextpkt = m;
2184 smbuf = m;
2185 curr_m = m;
2186 } else {
2187 VERIFY(0);
2188 }
2189
2190 smbufs++;
2191 m = NULL;
2192
2193 flow_agg_init_smbuf(&fa, smbuf, pkt);
2194 /*
			 * if the super packet is an mbuf which can't accommodate
			 * sizeof(struct ip6_tcp_mask) in a single buffer, then
2197 * do the aggregation check in slow path.
2198 * Note that an mbuf without cluster has only 80 bytes
2199 * available for data, sizeof(struct ip6_tcp_mask) is
2200 * also 80 bytes, so if the packet contains an
2201 * ethernet header, this mbuf won't be able to fully
2202 * contain "struct ip6_tcp_mask" data in a single
2203 * buffer.
2204 */
2205 if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
2206 if (__improbable(smbuf->m_len <
2207 ((smbuf->m_data -
2208 (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
2209 MASK_SIZE))) {
2210 fa.fa_sobj_is_short = true;
2211 }
2212 }
2213 }
2214 next:
2215 pkt_agg_log(pkt, kernproc, true);
2216 prev_csum_ok = csum_ok;
2217 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2218 }
2219
2220 KPKTQ_FINI(&fe->fe_rx_pktq);
2221
2222 /* Free any leftover mbufs, true only for native */
2223 if (__improbable(mhead != NULL)) {
2224 ASSERT(mhead_cnt != 0);
2225 (void) m_freem_list(mhead);
2226 mhead = NULL;
2227 mhead_cnt = 0;
2228 mhead_bufsize = 0;
2229 }
2230
2231 if (fe->fe_rx_largest_msize > largest_smbuf) {
2232 /*
2233 * Make it slowly move towards smbuf if we consistently get
2234 * non-aggregatable size.
2235 *
2236 * If we start at 16K, this makes us go to 4K within 6 rounds
2237 * and down to 2K within 12 rounds.
2238 */
2239 fe->fe_rx_largest_msize -=
2240 ((fe->fe_rx_largest_msize - largest_smbuf) >> 2);
2241 } else {
2242 fe->fe_rx_largest_msize +=
2243 ((largest_smbuf - fe->fe_rx_largest_msize) >> 2);
2244 }
2245
2246 if (smbufs > 0) {
2247 /* Last smbuf */
2248 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2249 SK_DF(logflags, "smbuf count %u", smbufs);
2250
2251 ASSERT(m_chain != NULL);
2252 ASSERT(smbuf != NULL);
2253 /*
2254 * Call fsw_host_sendup() with mbuf chain
2255 * directly.
2256 */
2257 mchain_agg_log(m_chain, kernproc, is_mbuf);
2258 fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);
2259
2260 if (__improbable(is_mbuf)) {
2261 STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
2262 } else {
2263 STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
2264 }
2265 FLOW_STATS_IN_ADD(fe, spackets, smbufs);
2266
2267 ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
2268 }
2269
2270 /* record (raw) number of packets and bytes */
2271 ASSERT((int)(rcvd_bytes - drop_bytes) > 0);
2272 ASSERT((int)(rcvd_packets - drop_packets) > 0);
2273 flow_track_stats(fe, (rcvd_bytes - drop_bytes),
2274 (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);
2275
2276 pp_free_pktq(&disposed_pkts);
2277 }
2278
2279 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2280 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2281 {
2282 struct pktq dropped_pkts;
2283 bool is_mbuf;
2284
2285 if (__improbable(fe->fe_rx_frag_count > 0)) {
2286 dp_flow_rx_process(fsw, fe);
2287 return;
2288 }
2289
2290 KPKTQ_INIT(&dropped_pkts);
2291
2292 if (!dp_flow_rx_route_process(fsw, fe)) {
2293 SK_ERR("Rx route bad");
2294 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2295 STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2296 KPKTQ_LEN(&dropped_pkts));
2297 goto done;
2298 }
2299
2300 is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2301
2302 if (fe->fe_nx_port == FSW_VP_HOST) {
2303 boolean_t do_rx_agg;
2304
2305 /* BSD flow */
2306 if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2307 do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2308 SK_FSW_RX_AGG_TCP_HOST_ON);
2309 } else {
2310 do_rx_agg = !dlil_has_ip_filter() &&
2311 !dlil_has_if_filter(fsw->fsw_ifp);
2312 }
2313 if (__improbable(!do_rx_agg)) {
2314 fsw_host_rx(fsw, fe);
2315 return;
2316 }
2317 if (__improbable(pktap_total_tap_count != 0)) {
2318 fsw_snoop(fsw, fe, true);
2319 }
2320 flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2321 } else {
2322 /* channel flow */
2323 if (__improbable(pktap_total_tap_count != 0)) {
2324 fsw_snoop(fsw, fe, true);
2325 }
2326 flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2327 }
2328
2329 done:
2330 pp_free_pktq(&dropped_pkts);
2331 }
2332