1 /*
2 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40
/* Upper bound on destination data buffers (buflets/mbufs) per super object */
#define MAX_BUFLET_COUNT (64)
/* TCP flags of interest to the aggregation logic (see users elsewhere) */
#define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* True if the packet's payload lives in an attached mbuf */
#define PKT_IS_MBUF(_pkt) (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
/* True if the attached mbuf holds data beyond what the packet copied */
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \
	(_pkt->pkt_pflags & PKT_F_TRUNCATED))
46
47 /*
48 * This structure holds per-super object (mbuf/packet) flow aggregation states.
49 */
50 struct flow_agg {
51 union {
52 struct {
53 union {
54 void * _fa_sobj;
55 struct mbuf * _fa_smbuf; /* super mbuf */
56 struct __kern_packet *_fa_spkt; /* super pkt */
57 };
58 uint8_t *_fa_sptr; /* ptr to super IP header */
59 bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
60 /*
61 * super obj is not large enough to hold the IP & TCP
62 * header in a contiguous buffer.
63 */
64 bool _fa_sobj_is_short;
65 uint32_t _fa_tcp_seq; /* expected next sequence # */
66 uint32_t _fa_ulen; /* expected next ulen */
67 uint32_t _fa_total; /* total aggregated bytes */
68 } __flow_agg;
69 uint64_t __flow_agg_data[4];
70 };
71 #define fa_sobj __flow_agg._fa_sobj
72 #define fa_smbuf __flow_agg._fa_smbuf
73 #define fa_spkt __flow_agg._fa_spkt
74 #define fa_sptr __flow_agg._fa_sptr
75 #define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt
76 #define fa_sobj_is_short __flow_agg._fa_sobj_is_short
77 #define fa_tcp_seq __flow_agg._fa_tcp_seq
78 #define fa_ulen __flow_agg._fa_ulen
79 #define fa_total __flow_agg._fa_total
80 };
81
82 #define FLOW_AGG_CLEAR(_fa) do { \
83 _CASSERT(sizeof(struct flow_agg) == 32); \
84 sk_zero_32(_fa); \
85 } while (0)
86
#define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask applied when comparing an IPv4+TCP header pair in the
 * aggregation fast path (see ipv4_tcp_memcmp): a set bit means the
 * corresponding header bit must match between super packet and candidate.
 * Fields whose mask is 0 (ip_len, ip_id, ip_sum, th_seq, th_sum) are
 * allowed to differ between consecutive segments of the same flow.
 */
struct ip_tcp_mask {
	struct ip ip_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};

static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH, /* PSH may differ between segments */
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};
138
139 struct ip6_tcp_mask {
140 struct ip6_hdr ip6_m;
141 struct tcphdr tcp_m;
142 uint32_t tcp_option_m[5]; /* 5 bytes to fill up to MASK_SIZE */
143 };
144
145 static const struct ip6_tcp_mask ip6_tcp_mask
146 __sk_aligned(16) =
147 {
148 .ip6_m = {
149 .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
150 /* Not checked; aggregated packet's ip_len is increasing */
151 .ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
152 .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
153 .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
154 .ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
155 .ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
156 .ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
157 .ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
158 .ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
159 .ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
160 .ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
161 .ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
162 },
163 .tcp_m = {
164 .th_sport = 0xffff,
165 .th_dport = 0xffff,
166 .th_seq = 0,
167 .th_ack = 0xffffffff,
168 .th_x2 = 0xf,
169 .th_off = 0xf,
170 .th_flags = ~TH_PUSH,
171 .th_win = 0xffff,
172 .th_sum = 0,
173 .th_urp = 0xffff,
174 },
175 .tcp_option_m = {
176 /* Max 40 bytes of TCP options */
177 0xffffffff,
178 0xffffffff,
179 0xffffffff,
180 0, /* Filling up to MASK_SIZE */
181 0, /* Filling up to MASK_SIZE */
182 },
183 };
184
185
186 #if SK_LOG
187 SK_LOG_ATTRIBUTE
188 static void
_pkt_agg_log(struct __kern_packet * pkt,struct proc * p,bool is_input)189 _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
190 {
191 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
192 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
193
194 kern_packet_t ph = SK_PKT2PH(pkt);
195 uint64_t bufcnt = 1;
196 if (!is_input) {
197 bufcnt = kern_packet_get_buflet_count(ph);
198 }
199
200 SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
201 sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
202 SK_KVA(pkt), pkt->pkt_length);
203
204 SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
205 is_input ? "s":"d", pkt->pkt_csum_flags,
206 (uint32_t)pkt->pkt_csum_rx_start_off,
207 (uint32_t)pkt->pkt_csum_rx_value);
208
209 if (!is_input) {
210 kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);
211
212 /* Individual buflets */
213 for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
214 SK_DF(logflags | SK_VERB_DUMP, "%s",
215 sk_dump("buf", kern_buflet_get_data_address(buf),
216 pkt->pkt_length, 128, NULL, 0));
217 buf = kern_packet_get_next_buflet(ph, buf);
218 }
219 }
220 }
221
222 #define pkt_agg_log(_pkt, _p, _is_input) do { \
223 if (__improbable(sk_verbose != 0)) { \
224 _pkt_agg_log(_pkt, _p, _is_input); \
225 } \
226 } while (0)
227
228 SK_LOG_ATTRIBUTE
229 static void
_mbuf_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)230 _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
231 {
232 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
233 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
234
235 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
236 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
237 m->m_pkthdr.len);
238
239 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
240 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
241 (uint32_t)m->m_pkthdr.csum_rx_val);
242
243 /* Dump the first mbuf */
244 ASSERT(m->m_data != NULL);
245 SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
246 (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
247 }
248
249 #define mbuf_agg_log(_m, _p, _is_mbuf) do { \
250 if (__improbable(sk_verbose != 0)) { \
251 _mbuf_agg_log(_m, _p, _is_mbuf); \
252 } \
253 } while (0)
254
255 SK_LOG_ATTRIBUTE
256 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)257 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
258 {
259 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
260 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
261
262 while (m != NULL) {
263 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
264 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
265 m->m_pkthdr.len);
266
267 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
268 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
269 (uint32_t)m->m_pkthdr.csum_rx_val);
270
271 m = m->m_nextpkt;
272 }
273 }
274
275 #define mchain_agg_log(_m, _p, _is_mbuf) do { \
276 if (__improbable(sk_verbose != 0)) { \
277 _mchain_agg_log(_m, _p, _is_mbuf); \
278 } \
279 } while (0)
280 #else
281 #define pkt_agg_log(...)
282 #define mbuf_agg_log(...)
283 #define mchain_agg_log(...)
284 #endif /* SK_LOG */
285
286 /*
287 * Checksum only for packet with mbuf.
288 */
289 static bool
mbuf_csum(struct __kern_packet * pkt,struct mbuf * m,bool verify_l3,uint16_t * data_csum)290 mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
291 uint16_t *data_csum)
292 {
293 ASSERT(data_csum != NULL);
294
295 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
296 uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
297 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
298 uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
299 uint16_t start = pkt->pkt_l2_len;
300 uint32_t partial = 0;
301 uint16_t csum = 0;
302
303 ASSERT(plen == m_pktlen(m));
304
305 /* Some compat drivers compute full checksum */
306 if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
307 CSUM_RX_FULL_FLAGS) {
308 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
309 m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
310 m->m_pkthdr.csum_rx_val);
311
312 /* Compute the data_csum */
313 struct tcphdr *tcp =
314 (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
315 pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
316 /* 16-bit alignment is sufficient */
317 ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));
318
319 uint16_t th_sum = tcp->th_sum;
320 tcp->th_sum = 0;
321
322 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
323 pkt->pkt_flow_tcp_hlen);
324 partial += htons(l4len + IPPROTO_TCP);
325 if (pkt->pkt_flow_ip_ver == IPVERSION) {
326 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
327 pkt->pkt_flow_ipv4_dst.s_addr, partial);
328 } else {
329 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
330 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
331 &pkt->pkt_flow_ipv6_dst, partial);
332 }
333 /* Restore the original checksum */
334 tcp->th_sum = th_sum;
335 th_sum = __packet_fix_sum(th_sum, csum, 0);
336 *data_csum = ~th_sum & 0xffff;
337 if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
338 return true;
339 } else {
340 return false;
341 }
342 }
343 /* Reset the csum RX flags */
344 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
345 if (verify_l3) {
346 csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
347 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
348 start, pkt->pkt_flow_ip_hlen, csum);
349 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
350 if ((csum ^ 0xffff) != 0) {
351 return false;
352 } else {
353 m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
354 }
355 }
356 /* Compute L4 header checksum */
357 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
358 pkt->pkt_flow_tcp_hlen);
359 /* Compute payload checksum */
360 start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
361 *data_csum = m_sum16(m, start, (plen - start));
362
363 /* Fold in the data checksum to TCP checksum */
364 partial += *data_csum;
365 partial += htons(l4len + IPPROTO_TCP);
366 if (pkt->pkt_flow_ip_ver == IPVERSION) {
367 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
368 pkt->pkt_flow_ipv4_dst.s_addr, partial);
369 } else {
370 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
371 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
372 &pkt->pkt_flow_ipv6_dst, partial);
373 }
374 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
375 start - pkt->pkt_flow_tcp_hlen, l4len, csum);
376 // Set start to 0 for full checksum
377 m->m_pkthdr.csum_rx_start = 0;
378 m->m_pkthdr.csum_rx_val = csum;
379 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
380 if ((csum ^ 0xffff) != 0) {
381 return false;
382 }
383
384 return true;
385 }
386
/* structure to pass an array of data buffers */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* number of valid entries in the array */
	bool dba_is_buflet;     /* selects buflet vs mbuf union member */
} _dbuf_array_t;
396
397 static inline void
_copy_data_sum_dbuf(struct __kern_packet * spkt,uint16_t soff,uint16_t plen,uint32_t * partial_sum,boolean_t * odd_start,_dbuf_array_t * dbuf,boolean_t do_csum)398 _copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
399 uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
400 boolean_t do_csum)
401 {
402 uint8_t i = 0;
403 uint16_t buf_off = 0;
404 uint16_t buflet_dlim;
405 uint16_t buflet_dlen;
406
407 ASSERT(plen > 0);
408 if (!dbuf->dba_is_buflet) {
409 /*
410 * Assumption about a single mbuf is being asserted due to the
411 * reason that the current usage always passes one mbuf and the
412 * routine has not been tested with multiple mbufs.
413 */
414 ASSERT(dbuf->dba_num_dbufs == 1);
415 ASSERT((mbuf_maxlen(dbuf->dba_mbuf[0]) -
416 dbuf->dba_mbuf[0]->m_len) >= plen);
417 buf_off = dbuf->dba_mbuf[0]->m_len;
418 } else {
419 buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[0]);
420 buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[0]);
421 ASSERT(buflet_dlen < buflet_dlim);
422 buf_off = buflet_dlen;
423 }
424 while (plen > 0) {
425 uint16_t tmplen;
426 uint16_t dbuf_lim;
427 uint8_t *dbuf_addr;
428
429 if (dbuf->dba_is_buflet) {
430 ASSERT(i < dbuf->dba_num_dbufs);
431 ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i])
432 == 0);
433 dbuf_addr =
434 kern_buflet_get_data_address(dbuf->dba_buflet[i]);
435 dbuf_lim = buflet_dlim - buf_off;
436 } else {
437 dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
438 dbuf_lim = mbuf_maxlen(dbuf->dba_mbuf[i]) - buf_off;
439 }
440 dbuf_addr += buf_off;
441 tmplen = min(plen, dbuf_lim);
442 if (PKT_IS_TRUNC_MBUF(spkt)) {
443 if (do_csum) {
444 *partial_sum = m_copydata_sum(spkt->pkt_mbuf,
445 soff, tmplen, dbuf_addr, *partial_sum,
446 odd_start);
447 } else {
448 m_copydata(spkt->pkt_mbuf, soff, tmplen,
449 dbuf_addr);
450 }
451 } else {
452 *partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
453 soff, dbuf_addr, tmplen, do_csum, *partial_sum,
454 odd_start);
455 }
456 if (dbuf->dba_is_buflet) {
457 VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
458 tmplen + buf_off) == 0);
459 } else {
460 dbuf->dba_mbuf[i]->m_len += tmplen;
461 dbuf->dba_mbuf[i]->m_pkthdr.len += tmplen;
462 }
463 soff += tmplen;
464 plen -= tmplen;
465 buf_off = 0;
466 i++;
467 }
468 ASSERT(plen == 0);
469 }
470
471 /*
472 * Copy (fill) and checksum for packet.
473 * spkt: source IP packet.
474 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
475 * verify_l3: verify IPv4 header checksum.
476 * currm: destination mbuf.
477 * currp: destination skywalk packet.
478 * dbuf: additional destination data buffer(s), used when current destination
479 * packet is out of space.
480 * added: amount of data copied from spkt to the additional buffer.
481 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
482 */
483 static bool
copy_pkt_csum_packed(struct __kern_packet * spkt,uint32_t plen,_dbuf_array_t * dbuf,bool verify_l3,struct mbuf * currm,struct __kern_buflet * currp,uint16_t * data_csum,int * added)484 copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
485 _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
486 struct __kern_buflet *currp, uint16_t *data_csum, int *added)
487 {
488 ASSERT(data_csum != NULL);
489
490 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
491 SK_VERB_COPY));
492
493 uint16_t start = 0, csum = 0;
494 uint32_t len = 0;
495 uint32_t l4len;
496 /* soff is only used for packets */
497 uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
498 uint32_t data_partial = 0, partial = 0;
499 int32_t curr_oldlen;
500 uint32_t curr_trailing;
501 char *curr_ptr;
502 int32_t curr_len;
503 uint16_t data_off;
504 uint32_t tmplen;
505 boolean_t odd_start = FALSE;
506
507 /* One of them must be != NULL, but they can't be both set */
508 VERIFY((currm != NULL || currp != NULL) &&
509 ((currm != NULL) != (currp != NULL)));
510
511 if (currm != NULL) {
512 curr_oldlen = currm->m_len;
513 curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
514 curr_ptr = mtod(currm, char *) + currm->m_len;
515 curr_len = currm->m_len;
516 } else {
517 curr_oldlen = currp->buf_dlen;
518 curr_trailing = currp->buf_dlim - currp->buf_doff -
519 currp->buf_dlen;
520 curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
521 currp->buf_dlen);
522 curr_len = currp->buf_dlen;
523 }
524
525 /* Reset the checksum flags in source packet */
526 spkt->pkt_csum_flags &= ~PACKET_CSUM_RX_FLAGS;
527
528 /* Verify checksum only for IPv4 */
529 len = spkt->pkt_flow_ip_hlen;
530 if (verify_l3) {
531 if (PKT_IS_TRUNC_MBUF(spkt)) {
532 partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
533 len, 0, 0);
534 } else {
535 partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
536 }
537
538 csum = __packet_fold_sum(partial);
539 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
540 len, csum);
541 spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
542 if ((csum ^ 0xffff) != 0) {
543 /* No need to copy & checkum TCP+payload */
544 return false;
545 } else {
546 spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
547 }
548 }
549
550 /* Copy & verify TCP checksum */
551 start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
552 l4len = plen - spkt->pkt_flow_ip_hlen;
553 len = plen - start;
554 if (PKT_IS_TRUNC_MBUF(spkt)) {
555 tmplen = min(len, curr_trailing);
556 odd_start = FALSE;
557
558 /* First, simple checksum on the TCP header */
559 partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
560 spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
561
562 /* Now, copy & sum the payload */
563 if (tmplen > 0) {
564 data_partial = m_copydata_sum(spkt->pkt_mbuf,
565 start, tmplen, curr_ptr, 0, &odd_start);
566 curr_len += tmplen;
567 }
568 data_off = start + tmplen;
569 } else {
570 tmplen = min(len, curr_trailing);
571 odd_start = FALSE;
572
573 /* First, simple checksum on the TCP header */
574 partial = pkt_sum(SK_PKT2PH(spkt),
575 (soff + spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
576
577 /* Now, copy & sum the payload */
578 if (tmplen > 0) {
579 data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
580 (soff + start), (uint8_t *)curr_ptr, tmplen,
581 true, 0, &odd_start);
582 curr_len += tmplen;
583 }
584 data_off = soff + start + tmplen;
585 }
586
587 /* copy & sum remaining payload in additional buffers */
588 if ((len - tmplen) > 0) {
589 ASSERT(dbuf != NULL);
590 _copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
591 &data_partial, &odd_start, dbuf, true);
592 *added = (len - tmplen);
593 }
594
595 /* Fold data checksum to 16 bit */
596 *data_csum = __packet_fold_sum(data_partial);
597
598 /* Fold in the data checksum to TCP checksum */
599 partial += *data_csum;
600
601 if (currm != NULL) {
602 currm->m_len = curr_len;
603 } else {
604 currp->buf_dlen = curr_len;
605 }
606
607 partial += htons(l4len + IPPROTO_TCP);
608 if (spkt->pkt_flow_ip_ver == IPVERSION) {
609 csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
610 spkt->pkt_flow_ipv4_dst.s_addr, partial);
611 } else {
612 ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
613 csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
614 &spkt->pkt_flow_ipv6_dst, partial);
615 }
616 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
617 start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));
618 __packet_set_inet_checksum(SK_PKT2PH(spkt), spkt->pkt_csum_flags |
619 PACKET_CSUM_DATA_VALID | PACKET_CSUM_PSEUDO_HDR, 0,
620 csum, false);
621
622 if ((csum ^ 0xffff) != 0) {
623 /*
624 * Revert whatever we did here!
625 * currm/currp should be restored to previous value.
626 * dbuf (for additional payload) should be restore to 0.
627 */
628 if (currm != NULL) {
629 currm->m_len = curr_oldlen;
630 } else {
631 currp->buf_dlen = curr_oldlen;
632 }
633 if (dbuf != NULL) {
634 for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
635 if (dbuf->dba_is_buflet) {
636 struct __kern_buflet *b = dbuf->dba_buflet[i];
637 kern_buflet_set_data_length(b, 0);
638 kern_buflet_set_data_offset(b, 0);
639 } else {
640 struct mbuf *m = dbuf->dba_mbuf[i];
641 m->m_len = m->m_pkthdr.len = 0;
642 }
643 }
644 }
645
646 return false;
647 }
648
649 return true;
650 }
651
652 /*
653 * Copy and checksum for packet or packet with mbuf
654 * data_csum is only supported for bsd flows
655 */
656 static bool
copy_pkt_csum(struct __kern_packet * pkt,uint32_t plen,_dbuf_array_t * dbuf,uint16_t * data_csum,bool verify_l3)657 copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
658 uint16_t *data_csum, bool verify_l3)
659 {
660 /*
661 * To keep this routine simple and optimal, we are asserting on the
662 * assumption that the smallest flowswitch packet pool buffer should
663 * be large enough to hold the IP and TCP headers in the first buflet.
664 */
665 _CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);
666
667 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
668 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
669
670 uint16_t start = 0, csum = 0;
671 uint32_t len = 0;
672 /* soff is only used for packets */
673 uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
674 uint32_t data_partial = 0, partial = 0;
675 boolean_t odd_start = false;
676 uint32_t data_len;
677 uint16_t dbuf_off;
678 uint16_t copied_len = 0;
679 bool l3_csum_ok = !verify_l3;
680 uint8_t *daddr;
681
682 if (dbuf->dba_is_buflet) {
683 daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
684 daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
685 } else {
686 daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
687 daddr += dbuf->dba_mbuf[0]->m_len;
688 ASSERT(mbuf_maxlen(dbuf->dba_mbuf[0]) >= plen);
689 }
690
691 /* Reset the checksum flags in source packet */
692 pkt->pkt_csum_flags &= ~PACKET_CSUM_RX_FLAGS;
693
694 /* Some compat drivers compute full checksum */
695 if (PKT_IS_MBUF(pkt) && ((pkt->pkt_mbuf->m_pkthdr.csum_flags &
696 CSUM_RX_FULL_FLAGS) == CSUM_RX_FULL_FLAGS)) {
697 /* copy only */
698 _copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
699 plen, &partial, &odd_start, dbuf, false);
700 csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
701 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
702 pkt->pkt_mbuf->m_pkthdr.csum_flags,
703 pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
704 /* pkt and mbuf flags are same for full csum */
705 __packet_set_inet_checksum(SK_PKT2PH(pkt), CSUM_RX_FULL_FLAGS,
706 0, csum, false);
707 if ((csum ^ 0xffff) == 0) {
708 return true;
709 } else {
710 return false;
711 }
712 }
713 /* Copy l3 & verify checksum only for IPv4 */
714 start = 0;
715 len = pkt->pkt_flow_ip_hlen;
716 if (PKT_IS_TRUNC_MBUF(pkt)) {
717 partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
718 (daddr + start), 0, NULL);
719 } else {
720 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
721 (daddr + start), len, true, 0, NULL);
722 }
723 if (verify_l3) {
724 csum = __packet_fold_sum(partial);
725 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
726 start, len, csum);
727 pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
728 if ((csum ^ 0xffff) != 0) {
729 /* proceed to copy the rest of packet */
730 } else {
731 pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
732 l3_csum_ok = true;
733 }
734 }
735 copied_len += pkt->pkt_flow_ip_hlen;
736
737 /* Copy & verify TCP checksum */
738 start = pkt->pkt_flow_ip_hlen;
739 len = plen - start;
740
741 if (PKT_IS_TRUNC_MBUF(pkt)) {
742 /* First, copy and sum TCP header */
743 partial = m_copydata_sum(pkt->pkt_mbuf, start,
744 pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);
745
746 data_len = len - pkt->pkt_flow_tcp_hlen;
747 start += pkt->pkt_flow_tcp_hlen;
748 dbuf_off = start;
749 /* Next, copy and sum payload (if any) */
750 } else {
751 /* First, copy and sum TCP header */
752 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
753 (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);
754
755 data_len = len - pkt->pkt_flow_tcp_hlen;
756 start += pkt->pkt_flow_tcp_hlen;
757 dbuf_off = start;
758 start += soff;
759 }
760 copied_len += pkt->pkt_flow_tcp_hlen;
761
762 if (dbuf->dba_is_buflet) {
763 VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
764 kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
765 copied_len) == 0);
766 } else {
767 dbuf->dba_mbuf[0]->m_len += copied_len;
768 dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
769 }
770
771 /* copy and sum payload (if any) */
772 if (data_len > 0) {
773 odd_start = false;
774 _copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
775 &odd_start, dbuf, l3_csum_ok);
776 }
777
778 if (__improbable(!l3_csum_ok)) {
779 return false;
780 }
781
782 /* Fold data sum to 16 bit and then into the partial */
783 *data_csum = __packet_fold_sum(data_partial);
784
785 /* Fold in the data checksum to TCP checksum */
786 partial += *data_csum;
787
788 partial += htons(len + IPPROTO_TCP);
789 if (pkt->pkt_flow_ip_ver == IPVERSION) {
790 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
791 pkt->pkt_flow_ipv4_dst.s_addr, partial);
792 } else {
793 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
794 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
795 &pkt->pkt_flow_ipv6_dst, partial);
796 }
797 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
798 pkt->pkt_flow_ip_hlen, len, csum);
799 __packet_set_inet_checksum(SK_PKT2PH(pkt), pkt->pkt_csum_flags |
800 PACKET_CSUM_DATA_VALID | PACKET_CSUM_PSEUDO_HDR, 0,
801 csum, false);
802 if ((csum ^ 0xffff) != 0) {
803 return false;
804 }
805
806 return true;
807 }
808
809 SK_INLINE_ATTRIBUTE
810 static void
flow_agg_init_common(struct flow_agg * fa,struct __kern_packet * pkt)811 flow_agg_init_common(struct flow_agg *fa, struct __kern_packet *pkt)
812 {
813 switch (pkt->pkt_flow_ip_ver) {
814 case IPVERSION:
815 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
816 return;
817 }
818 break;
819 case IPV6_VERSION:
820 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
821 return;
822 }
823 break;
824 default:
825 VERIFY(0);
826 /* NOTREACHED */
827 __builtin_unreachable();
828 }
829
830 fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
831 fa->fa_ulen = pkt->pkt_flow_ulen;
832 fa->fa_total = pkt->pkt_flow_ip_hlen +
833 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
834 }
835
836 static void
flow_agg_init_smbuf(struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)837 flow_agg_init_smbuf(struct flow_agg *fa, struct mbuf *smbuf,
838 struct __kern_packet *pkt)
839 {
840 FLOW_AGG_CLEAR(fa);
841
842 ASSERT(smbuf != NULL);
843 fa->fa_smbuf = smbuf;
844
845 fa->fa_sptr = mtod(smbuf, uint8_t *);
846 ASSERT(fa->fa_sptr != NULL);
847
848 /*
849 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
850 * contents of the flow structure which don't exist in 'smbuf'.
851 */
852 flow_agg_init_common(fa, pkt);
853 }
854
855 static void
flow_agg_init_spkt(struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)856 flow_agg_init_spkt(struct flow_agg *fa, struct __kern_packet *spkt,
857 struct __kern_packet *pkt)
858 {
859 FLOW_AGG_CLEAR(fa);
860
861 ASSERT(spkt != NULL);
862 fa->fa_spkt = spkt;
863 fa->fa_sobj_is_pkt = true;
864 VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
865
866 MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
867 ASSERT(fa->fa_sptr != NULL);
868
869 /*
870 * Note here we use 'pkt' instead of 'spkt', since we rely on the
871 * contents of the flow structure which don't exist in 'spkt'.
872 */
873 flow_agg_init_common(fa, pkt);
874 }
875
876 SK_INLINE_ATTRIBUTE
877 static bool
ipv4_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)878 ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
879 {
880 return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
881 }
882
883 SK_INLINE_ATTRIBUTE
884 static bool
ipv6_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)885 ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
886 {
887 return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
888 }
889
890 SK_INLINE_ATTRIBUTE
891 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)892 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
893 struct fsw_stats *fsws)
894 {
895 bool match;
896
897 ASSERT(fa->fa_sptr != NULL);
898 _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
899 _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
900
901 if (__improbable(pkt->pkt_length < MASK_SIZE)) {
902 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
903 goto slow_path;
904 }
905
906 if (__improbable(fa->fa_sobj_is_short)) {
907 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
908 goto slow_path;
909 }
910
911 if (__improbable(pkt->pkt_flow_tcp_hlen !=
912 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
913 goto slow_path;
914 }
915
916 switch (pkt->pkt_flow_ip_ver) {
917 case IPVERSION:
918 match = ipv4_tcp_memcmp(fa->fa_sptr,
919 (uint8_t *)pkt->pkt_flow_ip_hdr);
920 break;
921 case IPV6_VERSION:
922 match = ipv6_tcp_memcmp(fa->fa_sptr,
923 (uint8_t *)pkt->pkt_flow_ip_hdr);
924 break;
925 default:
926 VERIFY(0);
927 /* NOTREACHED */
928 __builtin_unreachable();
929 }
930
931 if (__improbable(!match)) {
932 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
933 goto slow_path;
934 }
935 if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
936 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
937 goto slow_path;
938 }
939
940 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
941 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
942 fa->fa_ulen = pkt->pkt_flow_ulen;
943 return true;
944
945 slow_path:
946 return false;
947 }
948
/*
 * Slow-path aggregation eligibility check, used when the fast-path header
 * prediction fails.  Compares the super packet's stored L3/L4 headers
 * (fa->fa_sptr) field-by-field against the candidate packet `pkt':
 * IP header length/TOS/TTL/frag bits/options, then TCP ACK/window/flags/
 * header length/options.  On success, advances the expected TCP sequence
 * number and records the candidate's payload length in `fa'.
 *
 * Returns true iff `pkt' may be merged into the current super packet;
 * every failure path bumps a per-reason flowswitch statistic.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;	/* super packet's L3 header */
	uint32_t sl3tlen = 0;		/* super packet's total L3 length */
	uint16_t sl3hlen = 0;		/* super packet's L3 header length */

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* IPv4 options (if any) must match byte-for-byte */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;

	uint16_t sl4hlen = (stcp->th_off << 2);
	/* memcmp (not ==) so unaligned 32/16-bit fields are safe to read */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		/*
		 * NOTE(review): probe name aggr__fail9 duplicates the IPv6
		 * hlim check above; consider renaming for unambiguous tracing.
		 */
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* All TCP flags except PUSH must match (PUSH is merged later) */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/*
			 * Read the first 32-bit word of the options area from
			 * both packets; use bcopy when unaligned to avoid a
			 * misaligned load.
			 */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* Both must carry the canonical NOP-NOP-TS prefix */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* Accept: advance expected sequence and note this segment's length */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1138
/*
 * Top-level aggregation gate for an incoming TCP segment.  First applies
 * cheap flow-level checks (flags, payload, sequence continuity, size limit,
 * wake flag), then tries the fast-path header prediction and finally the
 * slow-path field-by-field comparison.  Sets pkt_flow_tcp_agg_fast when the
 * fast path succeeded, so the merge step can skip re-comparing options.
 *
 * Returns true iff `pkt' may be merged into fa's current super packet.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* Clear any stale fast-path marker before re-evaluating */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	/* Keep a wake packet as the head of its own super packet */
	if (__improbable((pkt->pkt_pflags & PKT_F_WAKE_PKT) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	/* Fast path declined; fall back to full header comparison */
	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1198
/*
 * Merge an eligible packet `pkt' into fa's current super packet by patching
 * the super packet's headers in place:
 *  - grows the IP total length (IPv4 ip_len / IPv6 ip6_plen) by the new
 *    payload, incrementally fixing the IPv4 header checksum;
 *  - refreshes the TCP timestamp option and ORs in TH_PUSH when needed,
 *    with incremental TCP checksum fixups;
 *  - folds `data_csum' (the new payload's checksum, already computed by the
 *    copy routine) into the TCP checksum, byte-swapped when the existing
 *    L3 length is odd (RFC 1071 section 2 byte-order property);
 *  - updates the super object's (packet or mbuf) lengths and segment count.
 *
 * Caller must have validated `pkt' via flow_agg_is_ok() and already copied
 * the payload into the super object's buffers.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);

	/* Track total aggregated payload for the size-limit check */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* Incrementally patch ip_sum for the ip_len delta */
		siph->ip_sum = __packet_fix_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			/*
			 * Options differ only in the timestamp values
			 * (guaranteed by can_agg_slowpath): copy the new
			 * TSval/TSecr in and patch the TCP checksum.
			 */
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			__packet_fix_hdr_sum(sopt + 4, &stcp->th_sum, ntsval);
			__packet_fix_hdr_sum(sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			/* 16-bit word following th_ack holds th_off/th_flags */
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = __packet_fix_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = __packet_fix_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		/* Super object is a native packet: bump lengths/seg count */
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* Saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		/* Super object is an mbuf (host path) */
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1359
1360 /*
1361 * Copy metadata from source packet to destination packet
1362 */
1363 static void
pkt_copy_metadata(struct __kern_packet * spkt,struct __kern_packet * dpkt)1364 pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1365 {
1366 /* Copy packet metadata */
1367 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1368 _PKT_COPY(spkt, dpkt);
1369 }
1370
/*
 * Finalize a super packet before enqueueing it downstream; __packet_finalize
 * must succeed here (VERIFY).  On DEVELOPMENT/DEBUG builds, also fires a
 * DTrace probe with the packet's post-L2 data pointer for tracing.
 */
static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	/* Point past headroom and L2 header, i.e. at the L3 header */
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1385
1386 SK_INLINE_ATTRIBUTE
1387 static inline uint32_t
_estimate_buflet_cnt(struct flow_entry * fe,struct kern_pbufpool * pp)1388 _estimate_buflet_cnt(struct flow_entry *fe, struct kern_pbufpool *pp)
1389 {
1390 uint32_t cnt;
1391
1392 _CASSERT(MAX_BUFLET_COUNT <= UINT8_MAX);
1393 cnt = howmany(((fe->fe_rx_pktq_bytes + sizeof(struct ip6_hdr)) +
1394 sizeof(struct tcphdr)), pp->pp_buflet_size);
1395 cnt = MAX(KPKTQ_LEN(&fe->fe_rx_pktq), cnt);
1396 cnt = MIN(cnt, MAX_BUFLET_COUNT);
1397 return cnt;
1398 }
1399
1400 SK_INLINE_ATTRIBUTE
1401 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1402 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1403 _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1404 {
1405 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1406 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1407 VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1408 pbuf = buf;
1409 dbuf_array->dba_buflet[i] = NULL;
1410 }
1411 ASSERT(pbuf != NULL);
1412 dbuf_array->dba_num_dbufs = 0;
1413 *lbuf = pbuf;
1414 }
1415
1416 SK_INLINE_ATTRIBUTE
1417 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1418 _free_dbuf_array(struct kern_pbufpool *pp,
1419 _dbuf_array_t *dbuf_array)
1420 {
1421 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1422 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1423 pp_free_buflet(pp, buf);
1424 dbuf_array->dba_buflet[i] = NULL;
1425 }
1426 dbuf_array->dba_num_dbufs = 0;
1427 }
1428
/*
 * Rx TCP aggregation for a channel (user-pipe) flow: drains fe->fe_rx_pktq,
 * coalescing consecutive eligible TCP segments into multi-buflet "super"
 * packets allocated from the Rx ring's pool, then enqueues the resulting
 * super packets onto the ring.  Ineligible or failed packets fall back to
 * becoming the head of a new super packet; unrecoverable ones are moved to
 * `dropped_pkts'.  Source packets are disposed of at the end.
 *
 * fsw          flowswitch instance (stats, flow manager)
 * fe           flow entry whose Rx queue is drained
 * dropped_pkts out: packets that could not be delivered
 * is_mbuf      true when source packets carry compat mbuf data
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
	struct flow_agg fa;	/* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq pkts;	/* dst super packets */
	struct pktq disposed_pkts;	/* done src packets */

	KPKTQ_INIT(&pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		/* No destination ring: drop the entire input queue */
		SK_ERR("Rx ring is NULL");
		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(dropped_pkts));
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	/* Aggregation requires multi-buflet packets in the destination pool */
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *spkt = NULL;
	kern_packet_t sph = 0;		/* handle of current super packet */
	kern_buflet_t sbuf = NULL;	/* tail buflet of current super packet */
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;	/* super pkt count / buflets in current */
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;	/* available / needed buflets */
	uint8_t iter = 0;		/* consumption cursor into buf_arr */
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));

	/* Pre-allocate an estimated batch of destination buflets */
	bh_cnt_tmp = bh_cnt = _estimate_buflet_cnt(fe, dpp);
	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP);
	if (__improbable(bh_cnt == 0)) {
		/* Not fatal: per-packet allocation below will retry */
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* Prefetch next packet's buffer while working on this one */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		/* Run TCP state tracking before delivering */
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
			KPKTQ_ENQUEUE(dropped_pkts, pkt);
			continue;
		}

		if (is_mbuf) {	/* compat */
			/* Strip L2 header; the super packet starts at L3 */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
		}

		/*
		 * First try to append payload in place into the current
		 * super packet's tail buflet when the previous packet's
		 * checksum verified and there is room.
		 */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			/* Also bounded by the pool's max buflets per packet */
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just
				 * append to curr_m.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, dpp->pp_buflet_size);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			KPKTQ_ENQUEUE(dropped_pkts, pkt);
			continue;
		}
		/* Top up the pre-allocated batch when it runs low */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = _estimate_buflet_cnt(fe, dpp);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				KPKTQ_ENQUEUE(dropped_pkts, pkt);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    (kern_buflet_t)(buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in curr_m, thus we must
			 * have added to m->m_data.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			/* Non-aggregating copy: headers + payload together */
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			/* Merge into the existing super packet */
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				spkts++;
				if (bufcnt > 1) {
					spkt->pkt_aggr_type =
					    PKT_AGGR_SINGLE_IP;
				}
				pkt_finalize(sph);
				pkt_agg_log(spkt, kernproc, false);
				DTRACE_SKYWALK1(aggr__buflet__count, uint16_t,
				    bufcnt);
				sph = 0;
				spkt = NULL;
				FLOW_AGG_CLEAR(&fa);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				KPKTQ_ENQUEUE(dropped_pkts, pkt);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&pkts, spkt);
			/* Stamp flow/policy identity onto the super packet */
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(&fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	while (bh_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		spkts++;
		if (bufcnt > 1) {
			spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		pkt_finalize(sph);
		pkt_agg_log(spkt, kernproc, false);
		DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
		sph = 0;
		spkt = NULL;
		FLOW_AGG_CLEAR(&fa);
	}
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	/* Replace the flow's Rx queue with the super packets and deliver */
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
	KPKTQ_FINI(&pkts);

	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);

	pp_free_pktq(&disposed_pkts);
}
1747
1748 SK_NO_INLINE_ATTRIBUTE
1749 static void
flow_rx_agg_host(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * dropped_pkts,bool is_mbuf)1750 flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
1751 struct pktq *dropped_pkts, bool is_mbuf)
1752 {
1753 struct flow_agg fa; /* states */
1754 FLOW_AGG_CLEAR(&fa);
1755
1756 struct pktq disposed_pkts; /* done src packets */
1757 KPKTQ_INIT(&disposed_pkts);
1758
1759 int alloced = 0;
1760 int factor;
1761
1762 struct __kern_packet *pkt, *tpkt;
1763 /* points to the first mbuf of chain */
1764 struct mbuf *m_chain = NULL;
1765 /* super mbuf, at the end it points to last mbuf packet */
1766 struct mbuf *smbuf = NULL, *curr_m = NULL;
1767 bool prev_csum_ok = false, csum_ok, agg_ok;
1768 uint16_t smbufs = 0;
1769 uint32_t bytes = 0, rcvd_ulen = 0;
1770 uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
1771 uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
1772 uint32_t largest_smbuf = 0;
1773 int err = 0;
1774
1775 struct fsw_stats *fsws = &fsw->fsw_stats;
1776 bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1777
1778 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1779
1780 /* state for mbuf batch alloc */
1781 uint32_t mhead_cnt;
1782 uint32_t mhead_bufsize;
1783 struct mbuf * mhead = NULL;
1784
1785 uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;
1786
1787 SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);
1788
1789 if (__probable(!is_mbuf)) {
1790 uint32_t max_ip_len = MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET);
1791
1792 /*
1793 * Batch mbuf alloc is based on
1794 * convert_native_pkt_to_mbuf_chain
1795 */
1796 if (__probable(fe->fe_rx_largest_msize != 0 &&
1797 max_ip_len > 0)) {
1798 unsigned int one;
1799 int wait;
1800
1801 if (fe->fe_rx_largest_msize <= MCLBYTES) {
1802 mhead_bufsize = MCLBYTES;
1803 } else if (fe->fe_rx_largest_msize <= MBIGCLBYTES) {
1804 mhead_bufsize = MBIGCLBYTES;
1805 } else {
1806 mhead_bufsize = M16KCLBYTES;
1807 }
1808
1809 try_again:
1810 if (fe->fe_rx_pktq_bytes != 0) {
1811 uint32_t aggregation_size =
1812 MAX(fe->fe_rx_largest_msize, MCLBYTES);
1813
1814 aggregation_size =
1815 MIN(aggregation_size, mhead_bufsize);
1816
1817 factor = (fe->fe_rx_pktq_bytes / max_ip_len) *
1818 (MAX(sizeof(struct ip),
1819 sizeof(struct ip6_hdr)) +
1820 sizeof(struct tcphdr));
1821
1822 mhead_cnt = MAX(((fe->fe_rx_pktq_bytes +
1823 factor) / aggregation_size) + 1, 1);
1824 } else {
1825 /* No payload, thus it's all small-sized ACKs/... */
1826 mhead_bufsize = MHLEN;
1827 mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
1828 }
1829
1830 one = 1;
1831
1832 if (mhead_bufsize >= MBIGCLBYTES) {
1833 wait = M_NOWAIT;
1834 } else {
1835 wait = M_WAITOK;
1836 }
1837
1838 mhead = m_allocpacket_internal(&mhead_cnt,
1839 mhead_bufsize, &one, wait, 1, 0);
1840
1841 if (mhead == NULL) {
1842 if (mhead_bufsize == M16KCLBYTES) {
1843 mhead_bufsize = MBIGCLBYTES;
1844 goto try_again;
1845 }
1846
1847 if (mhead_bufsize == MBIGCLBYTES) {
1848 mhead_bufsize = MCLBYTES;
1849 goto try_again;
1850 }
1851 }
1852 } else {
1853 mhead = NULL;
1854 mhead_bufsize = mhead_cnt = 0;
1855 }
1856 SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
1857 mhead_bufsize);
1858 }
1859
1860 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1861 if (tpkt != NULL) {
1862 void *baddr;
1863 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1864 SK_PREFETCH(baddr, 0);
1865 }
1866
1867 /* Validate l2 len, ip vers, is_mbuf */
1868 ASSERT(pkt->pkt_l2_len == l2len);
1869 ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1870 ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1871 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
1872 ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1873 ASSERT(!pkt->pkt_flow_ip_is_frag);
1874 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1875
1876 csum_ok = false;
1877 agg_ok = false;
1878 /*
1879 * As we only agg packets with same hdr length,
1880 * leverage the pkt metadata
1881 */
1882 uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1883 pkt->pkt_flow_tcp_hlen);
1884 uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1885
1886 /*
1887 * Rather than calling flow_pkt_track() for each
1888 * packet here, we accumulate received packet stats
1889 * for the call to flow_track_stats() below. This
1890 * is because flow tracking is a no-op for traffic
1891 * that belongs to the host stack.
1892 */
1893 rcvd_ulen += pkt->pkt_flow_ulen;
1894 rcvd_bytes += pkt->pkt_length;
1895 rcvd_packets++;
1896
1897 KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1898 fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1899
1900 /* packet is for BSD flow, create a mbuf chain */
1901 uint32_t len = (l2len + plen);
1902 uint16_t data_csum = 0;
1903 struct mbuf *m;
1904 if (__improbable(is_mbuf)) {
1905 m = pkt->pkt_mbuf;
1906 /* Detach mbuf from source pkt */
1907 KPKT_CLEAR_MBUF_DATA(pkt);
1908
1909 uint32_t trailer = (m_pktlen(m) - len);
1910 ASSERT((uint32_t)m_pktlen(m) >= plen);
1911 /* Remove the trailer */
1912 if (trailer > 0) {
1913 m_adj(m, -trailer);
1914 }
1915 /* attached mbuf is already allocated */
1916 csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
1917 } else { /* native */
1918 uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
1919 l2len;
1920 uint32_t tot_len = (len + pad);
1921 /* remember largest aggregated packet size */
1922 if (smbuf) {
1923 if (largest_smbuf < (uint32_t)m_pktlen(smbuf)) {
1924 largest_smbuf =
1925 (uint32_t)m_pktlen(smbuf);
1926 }
1927 }
1928
1929 if (prev_csum_ok && curr_m) {
1930 ASSERT(fa.fa_smbuf == smbuf);
1931 ASSERT(!fa.fa_sobj_is_pkt);
1932 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1933
1934 if (agg_ok &&
1935 M_TRAILINGSPACE(curr_m) >= plen - thlen) {
1936 /*
1937 * No need for a new mbuf,
1938 * just append to curr_m.
1939 */
1940 csum_ok = copy_pkt_csum_packed(pkt,
1941 plen, NULL, is_ipv4, curr_m, NULL,
1942 &data_csum, NULL);
1943
1944 if (!csum_ok) {
1945 STATS_INC(fsws,
1946 FSW_STATS_RX_AGG_BAD_CSUM);
1947 SK_ERR("Checksum for "
1948 "aggregation is wrong");
1949 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
1950 /*
1951 * Turns out, checksum is wrong!
1952 * Fallback to no-agg mode.
1953 */
1954 agg_ok = 0;
1955 } else {
1956 /*
1957 * We only added payload,
1958 * thus -thlen.
1959 */
1960 bytes += (plen - thlen);
1961 flow_agg_merge_hdr(&fa, pkt,
1962 data_csum, fsws);
1963 goto next;
1964 }
1965 }
1966 }
1967
1968 /*
1969 * If the batch allocation returned partial success,
1970 * we try blocking allocation here again
1971 */
1972 m = mhead;
1973 if (__improbable(m == NULL ||
1974 tot_len > mhead_bufsize)) {
1975 unsigned int one = 1;
1976
1977 ASSERT(mhead_cnt == 0 || mhead != NULL);
1978 err = mbuf_allocpacket(MBUF_WAITOK, tot_len,
1979 &one, &m);
1980 if (err != 0) {
1981 STATS_INC(fsws,
1982 FSW_STATS_RX_DROP_NOMEM_BUF);
1983 SK_ERR("mbuf alloc failed (err %d)",
1984 err);
1985 KPKTQ_ENQUEUE(dropped_pkts, pkt);
1986 drop_packets++;
1987 drop_bytes += pkt->pkt_length;
1988 continue;
1989 }
1990 alloced++;
1991 } else {
1992 ASSERT(mhead_cnt > 0);
1993 mhead = m->m_nextpkt;
1994 m->m_nextpkt = NULL;
1995 mhead_cnt--;
1996 }
1997 m->m_data += pad;
1998 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
1999
2000 /*
2001 * copy and checksum l3, l4 and payload
2002 * l2 header is copied later only if we
2003 * can't agg as an optimization
2004 */
2005 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
2006 _dbuf_array_t dbuf_array = {.dba_is_buflet = false};
2007 if (agg_ok) {
2008 int added = 0;
2009 dbuf_array.dba_mbuf[0] = m;
2010 dbuf_array.dba_num_dbufs = 1;
2011 csum_ok = copy_pkt_csum_packed(pkt, plen,
2012 &dbuf_array, is_ipv4, curr_m, NULL,
2013 &data_csum, &added);
2014
2015 if (!csum_ok) {
2016 STATS_INC(fsws,
2017 FSW_STATS_RX_AGG_BAD_CSUM);
2018 SK_ERR("Checksum for aggregation "
2019 "on new mbuf is wrong");
2020 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
2021 agg_ok = false;
2022 goto non_agg;
2023 }
2024
2025 /*
2026 * There was not enough space in curr_m,
2027 * thus we must have added to m->m_data.
2028 */
2029 VERIFY(added > 0);
2030 VERIFY(m->m_len == m->m_pkthdr.len &&
2031 (uint32_t)m->m_len <=
2032 (uint32_t)mbuf_maxlen(m));
2033
2034 /*
2035 * We account for whatever we added
2036 * to m later on, thus - added.
2037 */
2038 bytes += plen - thlen - added;
2039 } else {
2040 non_agg:
2041 dbuf_array.dba_mbuf[0] = m;
2042 dbuf_array.dba_num_dbufs = 1;
2043 m->m_len += l2len;
2044 m->m_pkthdr.len += l2len;
2045 csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
2046 &data_csum, is_ipv4);
2047 if (__improbable(!csum_ok)) {
2048 STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
2049 SK_ERR("%d incorrect csum", __LINE__);
2050 DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
2051 }
2052 VERIFY(m->m_len == m->m_pkthdr.len &&
2053 (uint32_t)m->m_len <=
2054 (uint32_t)mbuf_maxlen(m));
2055 }
2056
2057 STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
2058 STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);
2059
2060 m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
2061 m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
2062 /*
2063 * Note that these flags have same value,
2064 * except PACKET_CSUM_PARTIAL
2065 */
2066 m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
2067 PACKET_CSUM_RX_FLAGS);
2068
2069 /* Set the rcvif */
2070 m->m_pkthdr.rcvif = fsw->fsw_ifp;
2071 }
2072 ASSERT(m != NULL);
2073 ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
2074 ASSERT((m->m_flags & M_HASFCS) == 0);
2075 ASSERT(m->m_nextpkt == NULL);
2076
2077 if (__improbable(is_mbuf)) {
2078 if ((uint32_t) m->m_len < (l2len + thlen)) {
2079 m = m_pullup(m, (l2len + thlen));
2080 if (m == NULL) {
2081 STATS_INC(fsws,
2082 FSW_STATS_RX_DROP_NOMEM_BUF);
2083 SK_ERR("mbuf pullup failed (err %d)",
2084 err);
2085 KPKTQ_ENQUEUE(dropped_pkts, pkt);
2086 drop_packets++;
2087 drop_bytes += pkt->pkt_length;
2088 continue;
2089 }
2090 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2091 }
2092 if (prev_csum_ok && csum_ok) {
2093 ASSERT(fa.fa_smbuf == smbuf);
2094 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2095 }
2096 }
2097
2098 if (agg_ok) {
2099 ASSERT(fa.fa_smbuf == smbuf);
2100 ASSERT(!fa.fa_sobj_is_pkt);
2101 if (__improbable(is_mbuf)) {
2102 bytes += (m_pktlen(m) - l2len);
2103 /* adjust mbuf by l2, l3 and l4 hdr */
2104 m_adj(m, l2len + thlen);
2105 } else {
2106 bytes += m_pktlen(m);
2107 }
2108
2109 m->m_flags &= ~M_PKTHDR;
2110 flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
2111 while (curr_m->m_next != NULL) {
2112 curr_m = curr_m->m_next;
2113 }
2114 curr_m->m_next = m;
2115 curr_m = m;
2116 m = NULL;
2117 } else {
2118 if ((uint32_t) m->m_len < l2len) {
2119 m = m_pullup(m, l2len);
2120 if (m == NULL) {
2121 STATS_INC(fsws,
2122 FSW_STATS_RX_DROP_NOMEM_BUF);
2123 SK_ERR("mbuf pullup failed (err %d)",
2124 err);
2125 KPKTQ_ENQUEUE(dropped_pkts, pkt);
2126 drop_packets++;
2127 drop_bytes += pkt->pkt_length;
2128 continue;
2129 }
2130 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2131 }
2132
2133 /* copy l2 header for native */
2134 if (__probable(!is_mbuf)) {
2135 uint16_t llhoff = pkt->pkt_headroom;
2136 uint8_t *baddr;
2137 MD_BUFLET_ADDR_ABS(pkt, baddr);
2138 ASSERT(baddr != NULL);
2139 baddr += llhoff;
2140 pkt_copy(baddr, m->m_data, l2len);
2141 }
2142 /* adjust mbuf by l2 hdr */
2143 m_adj(m, l2len);
2144 bytes += m_pktlen(m);
2145
2146 /*
2147 * aggregated packets can be skipped by pktap because
2148 * the original pre-aggregated chain already passed through
2149 * pktap (see fsw_snoop()) before entering this function.
2150 */
2151 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
2152
2153 if (m_chain == NULL) {
2154 /* this is the start of the chain */
2155 m_chain = m;
2156 smbuf = m;
2157 curr_m = m;
2158 } else if (smbuf != NULL) {
2159 /*
2160 * set m to be next packet
2161 */
2162 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2163 smbuf->m_nextpkt = m;
2164 smbuf = m;
2165 curr_m = m;
2166 } else {
2167 VERIFY(0);
2168 }
2169
2170 smbufs++;
2171 m = NULL;
2172
2173 flow_agg_init_smbuf(&fa, smbuf, pkt);
2174 /*
2175 * if the super packet is an mbuf which can't accomodate
2176 * (sizeof(struct ip6_tcp_mask) in a single buffer then
2177 * do the aggregation check in slow path.
2178 * Note that an mbuf without cluster has only 80 bytes
2179 * available for data, sizeof(struct ip6_tcp_mask) is
2180 * also 80 bytes, so if the packet contains an
2181 * ethernet header, this mbuf won't be able to fully
2182 * contain "struct ip6_tcp_mask" data in a single
2183 * buffer.
2184 */
2185 if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
2186 if (__improbable(smbuf->m_len <
2187 ((smbuf->m_data -
2188 (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
2189 MASK_SIZE))) {
2190 fa.fa_sobj_is_short = true;
2191 }
2192 }
2193 }
2194 next:
2195 pkt_agg_log(pkt, kernproc, true);
2196 prev_csum_ok = csum_ok;
2197 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2198 }
2199
2200 KPKTQ_FINI(&fe->fe_rx_pktq);
2201
2202 /* Free any leftover mbufs, true only for native */
2203 if (__improbable(mhead != NULL)) {
2204 ASSERT(mhead_cnt != 0);
2205 (void) m_freem_list(mhead);
2206 mhead = NULL;
2207 mhead_cnt = 0;
2208 mhead_bufsize = 0;
2209 }
2210
2211 if (fe->fe_rx_largest_msize > largest_smbuf) {
2212 /*
2213 * Make it slowly move towards smbuf if we consistently get
2214 * non-aggregatable size.
2215 *
2216 * If we start at 16K, this makes us go to 4K within 6 rounds
2217 * and down to 2K within 12 rounds.
2218 */
2219 fe->fe_rx_largest_msize -=
2220 ((fe->fe_rx_largest_msize - largest_smbuf) >> 2);
2221 } else {
2222 fe->fe_rx_largest_msize +=
2223 ((largest_smbuf - fe->fe_rx_largest_msize) >> 2);
2224 }
2225
2226 if (smbufs > 0) {
2227 /* Last smbuf */
2228 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2229 SK_DF(logflags, "smbuf count %u", smbufs);
2230
2231 ASSERT(m_chain != NULL);
2232 ASSERT(smbuf != NULL);
2233 /*
2234 * Call fsw_host_sendup() with mbuf chain
2235 * directly.
2236 */
2237 mchain_agg_log(m_chain, kernproc, is_mbuf);
2238 fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);
2239
2240 if (__improbable(is_mbuf)) {
2241 STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
2242 } else {
2243 STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
2244 }
2245 FLOW_STATS_IN_ADD(fe, spackets, smbufs);
2246
2247 ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
2248 }
2249
2250 /* record (raw) number of packets and bytes */
2251 ASSERT((int)(rcvd_bytes - drop_bytes) > 0);
2252 ASSERT((int)(rcvd_packets - drop_packets) > 0);
2253 flow_track_stats(fe, (rcvd_bytes - drop_bytes),
2254 (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);
2255
2256 pp_free_pktq(&disposed_pkts);
2257 }
2258
2259 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2260 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2261 {
2262 struct pktq dropped_pkts;
2263 bool is_mbuf;
2264
2265 if (__improbable(fe->fe_rx_frag_count > 0)) {
2266 dp_flow_rx_process(fsw, fe);
2267 return;
2268 }
2269
2270 KPKTQ_INIT(&dropped_pkts);
2271
2272 if (!dp_flow_rx_route_process(fsw, fe)) {
2273 SK_ERR("Rx route bad");
2274 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2275 STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2276 KPKTQ_LEN(&dropped_pkts));
2277 goto done;
2278 }
2279
2280 is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2281
2282 if (fe->fe_nx_port == FSW_VP_HOST) {
2283 boolean_t do_rx_agg;
2284
2285 /* BSD flow */
2286 if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2287 do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2288 SK_FSW_RX_AGG_TCP_HOST_ON);
2289 } else {
2290 do_rx_agg = !dlil_has_ip_filter() &&
2291 !dlil_has_if_filter(fsw->fsw_ifp);
2292 }
2293 if (__improbable(!do_rx_agg)) {
2294 fsw_host_rx(fsw, fe);
2295 return;
2296 }
2297 if (__improbable(pktap_total_tap_count != 0)) {
2298 fsw_snoop(fsw, fe, true);
2299 }
2300 flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2301 } else {
2302 /* channel flow */
2303 if (__improbable(pktap_total_tap_count != 0)) {
2304 fsw_snoop(fsw, fe, true);
2305 }
2306 flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2307 }
2308
2309 done:
2310 pp_free_pktq(&dropped_pkts);
2311 }
2312