1 /*
2 * Copyright (c) 2019-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40
#define MAX_AGG_IP_LEN() MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT (32)
#define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG)
/* _pkt is a struct __kern_packet *; arguments parenthesized for safety */
#define PKT_IS_MBUF(_pkt) ((_pkt)->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_pflags & PKT_F_TRUNCATED))
/*
 * BUGFIX: the mbuf arm previously read "pkt->pkt_mbuf", silently
 * capturing whatever variable named "pkt" exists at the call site
 * instead of the macro argument; reference the _pkt parameter so the
 * macro works regardless of the caller's variable name.
 */
#define PKT_IS_WAKE_PKT(_pkt) ((PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	(!PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_pflags & PKT_F_WAKE_PKT)))
51
52
/*
 * Routine used to patch a 16-bit one's-complement checksum when a
 * 16-bit word in the summed data changes from `old' to `new'.
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Incremental checksum fix-up (used when the host owns header csums) */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant, selected when hardware LRO owns the header checksums */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60
/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 * It is unioned with a 5 x 64-bit array so FLOW_AGG_CLEAR() can wipe it
 * with fixed-size stores; the _CASSERTs in FLOW_AGG_CLEAR pin the exact
 * size (40 bytes) and the offset of the trailing function pointer (32).
 */
struct flow_agg {
	union {
		struct {
			union {
				void * _fa_sobj;
				struct mbuf * _fa_smbuf; /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *_fa_sptr; /* ptr to super IP header */
			bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool _fa_sobj_is_short;
			uint32_t _fa_tcp_seq; /* expected next sequence # */
			uint32_t _fa_ulen; /* expected next ulen */
			uint32_t _fa_total; /* total aggregated bytes */
			/* function that fix packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		uint64_t __flow_agg_data[5];
	};
/* accessor shorthands for the anonymous-union members above */
#define fa_sobj __flow_agg._fa_sobj
#define fa_smbuf __flow_agg._fa_smbuf
#define fa_spkt __flow_agg._fa_spkt
#define fa_sptr __flow_agg._fa_sptr
#define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short __flow_agg._fa_sobj_is_short
#define fa_tcp_seq __flow_agg._fa_tcp_seq
#define fa_ulen __flow_agg._fa_ulen
#define fa_total __flow_agg._fa_total
#define fa_fix_pkt_sum __flow_agg._fa_fix_pkt_sum
};
98
/*
 * Reset a flow_agg to its initial state.  sk_zero_32() clears the first
 * 32 bytes (everything up to fa_fix_pkt_sum, per the _CASSERTs); the
 * trailing function pointer occupies the last 8 bytes and is cleared
 * separately.
 */
#define FLOW_AGG_CLEAR(_fa) do { \
	_CASSERT(sizeof(struct flow_agg) == 40); \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \
	sk_zero_32(_fa); \
	(_fa)->fa_fix_pkt_sum = 0; \
} while (0)
105
#define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */

/*
 * Layout for the IPv4 fast-path masked header compare: IPv4 header,
 * TCP header, then room for up to MAX_TCPOPTLEN (40) bytes of options.
 * 20 + 20 + 40 = MASK_SIZE bytes.
 */
struct ip_tcp_mask {
	struct ip ip_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
113
/*
 * Mask applied when comparing the super packet's IPv4+TCP headers to a
 * candidate packet's (see can_agg_fastpath / ipv4_tcp_memcmp).  All-ones
 * fields must match exactly between the two headers; zeroed fields
 * (lengths, IP id, checksums, TCP sequence number) are allowed to
 * differ.  Only the first 12 bytes of TCP options take part in the
 * compare — the fast path requires a 12-byte option area
 * (TCPOLEN_TSTAMP_APPA).
 */
static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};
157
/*
 * Layout for the IPv6 fast-path masked header compare:
 * 40 (IPv6 hdr) + 20 (TCP hdr) + 20 = MASK_SIZE bytes.
 */
struct ip6_tcp_mask {
	struct ip6_hdr ip6_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[5]; /* 5 32-bit words (20 bytes) to fill up to MASK_SIZE */
};
163
164 static const struct ip6_tcp_mask ip6_tcp_mask
165 __sk_aligned(16) =
166 {
167 .ip6_m = {
168 .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
169 /* Not checked; aggregated packet's ip_len is increasing */
170 .ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
171 .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
172 .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
173 .ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
174 .ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
175 .ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
176 .ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
177 .ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
178 .ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
179 .ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
180 .ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
181 },
182 .tcp_m = {
183 .th_sport = 0xffff,
184 .th_dport = 0xffff,
185 .th_seq = 0,
186 .th_ack = 0xffffffff,
187 .th_x2 = 0xf,
188 .th_off = 0xf,
189 .th_flags = ~TH_PUSH,
190 .th_win = 0xffff,
191 .th_sum = 0,
192 .th_urp = 0xffff,
193 },
194 .tcp_option_m = {
195 /* Max 40 bytes of TCP options */
196 0xffffffff,
197 0xffffffff,
198 0xffffffff,
199 0, /* Filling up to MASK_SIZE */
200 0, /* Filling up to MASK_SIZE */
201 },
202 };
203
204 #if SK_LOG
SK_LOG_ATTRIBUTE
static void
_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
{
	/* Debug-only: dump a packet's length, csum metadata and buflets */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	kern_packet_t ph = SK_PKT2PH(pkt);
	uint64_t bufcnt = 1;
	if (!is_input) {
		/* output (super) packets may span multiple buflets */
		bufcnt = kern_packet_get_buflet_count(ph);
	}

	SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
	    SK_KVA(pkt), pkt->pkt_length);

	SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    is_input ? "s":"d", pkt->pkt_csum_flags,
	    (uint32_t)pkt->pkt_csum_rx_start_off,
	    (uint32_t)pkt->pkt_csum_rx_value);

	if (!is_input) {
		kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);

		/* Individual buflets */
		for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
			    pkt->pkt_length, 128, NULL, 0));
			buf = kern_packet_get_next_buflet(ph, buf);
		}
	}
}

/* Cheap wrapper: skip the function call unless verbose logging is on */
#define pkt_agg_log(_pkt, _p, _is_input) do { \
	if (__improbable(sk_verbose != 0)) { \
		_pkt_agg_log(_pkt, _p, _is_input); \
	} \
} while (0)
245
SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	/* Debug-only: dump a destination mbuf's length and csum metadata */
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m->m_data != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m->m_data, m->m_len, 128, NULL, 0));
}

/* Cheap wrapper: skip the function call unless verbose logging is on */
#define mbuf_agg_log(_m, _p, _is_mbuf) do { \
	if (__improbable(sk_verbose != 0)) { \
		_mbuf_agg_log(_m, _p, _is_mbuf); \
	} \
} while (0)
272
273 SK_LOG_ATTRIBUTE
274 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)275 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
276 {
277 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
278 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
279
280 while (m != NULL) {
281 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
282 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
283 m->m_pkthdr.len);
284
285 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
286 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
287 (uint32_t)m->m_pkthdr.csum_rx_val);
288
289 m = m->m_nextpkt;
290 }
291 }
292
293 #define mchain_agg_log(_m, _p, _is_mbuf) do { \
294 if (__improbable(sk_verbose != 0)) { \
295 _mchain_agg_log(_m, _p, _is_mbuf); \
296 } \
297 } while (0)
298 #else
299 #define pkt_agg_log(...)
300 #define mbuf_agg_log(...)
301 #define mchain_agg_log(...)
302 #endif /* SK_LOG */
303
/*
 * Checksum only for packet with mbuf.
 *
 * Verifies (or, if the driver did not supply a full RX checksum,
 * computes) the TCP checksum of an mbuf-backed packet, and returns in
 * *data_csum the 16-bit folded sum of just the TCP payload, which the
 * caller later splices into the aggregated super packet's checksum.
 *
 * pkt:       flowswitch packet; flow metadata (hdr lengths, addresses)
 *            is read from here.
 * m:         backing mbuf holding L2 + IP + TCP header + payload.
 * verify_l3: also validate the IPv4 header checksum.
 * data_csum: out param, payload-only 16-bit sum (always set on the
 *            paths that reach it).
 *
 * Returns true iff the TCP checksum verifies.  Side effects: updates
 * the mbuf's csum_flags/csum_rx_* fields and stores full-checksum
 * metadata on the packet (transferred to the super packet later).
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	/* total length: L2 + IP header + TCP header + TCP payload */
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len; /* mbuf offset of the IP header */
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/*
		 * Compute the data_csum: sum the TCP header plus the
		 * pseudo-header, then back that contribution out of the
		 * wire checksum, leaving the payload-only sum.
		 */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		/* temporarily zero th_sum so it doesn't fold into the sum */
		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		/* remove the hdr+pseudo-hdr sum (csum) from the wire csum */
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
		    0, m->m_pkthdr.csum_rx_val, false);

		/* a valid full checksum folds to all-ones */
		if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		/* verify the IPv4 header checksum in place */
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	// Set start to 0 for full checksum
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
414
/*
 * structure to pass an array of data buffers: either skywalk buflets or
 * mbufs (discriminated by dba_is_buflet), up to MAX_BUFLET_COUNT of
 * them, with dba_num_dbufs giving the number actually populated.
 */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;  /* valid entries in the array above */
	bool dba_is_buflet;     /* true: dba_buflet[], false: dba_mbuf[] */
} _dbuf_array_t;
424
/*
 * Copy plen bytes of source data (starting at offset soff within spkt,
 * or within its attached truncated mbuf) into the destination buffer
 * array, appending after each destination buffer's current length and
 * updating those lengths.  When do_csum is set, the copied bytes are
 * folded into *partial_sum, with *odd_start tracking byte parity so the
 * one's-complement sum stays correct across buffer boundaries.
 * The caller must provide enough destination space for plen bytes.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;          /* index of the destination buffer in use */
	uint16_t buf_off = 0;   /* current fill level of that buffer */
	uint16_t buflet_dlim;
	uint16_t buflet_dlen;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint16_t tmplen;
		uint16_t dbuf_lim;    /* space remaining in this buffer */
		uint8_t *dbuf_addr;   /* write position in this buffer */

		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			dbuf_addr = kern_buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}
		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			/* source bytes live in the attached mbuf chain */
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			/* source bytes live in the packet's buflets */
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			/* m_pkthdr.len lives on the head mbuf only */
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
486
/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet (buflet).
 * dbuf: additional destination data buffer(s), used when current destination
 *       packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
 *
 * Only the TCP payload is copied into the destination ("packed" onto an
 * existing super object); the IP/TCP headers are summed in place for
 * verification.  On checksum failure every destination length touched
 * here is rolled back.  Returns true iff the checksum(s) verify.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;     /* saved dest length, for rollback */
	uint32_t curr_trailing;  /* free space in currm/currp */
	char *curr_ptr;          /* write position in currm/currp */
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* One of them must be != NULL, but they can't be both set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
		    currp->buf_dlen);
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checkum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	/* skip L4 verification when the driver provided a full checksum */
	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;	/* TCP payload length */
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum remaining payload in additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold data checksum to 16 bit */
	*data_csum = __packet_fold_sum(data_partial);

	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold in the data checksum to TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab csum value from offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to previous value.
		 * dbuf (for additional payload) should be restore to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}
677
/*
 * Copy and checksum for packet or packet with mbuf.
 * data_csum is only supported for bsd flows.
 *
 * Unlike copy_pkt_csum_packed(), this copies the full L3 packet
 * (IP hdr + TCP hdr + payload, plen bytes total) into the destination
 * buffer array, verifying/computing checksums along the way.  Returns
 * true iff the checksum(s) verify; the computed full checksum is stored
 * on the packet metadata (later transferred to the super packet).
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	_CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;     /* TCP payload length */
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;        /* write position in the first dest buffer */

	if (dbuf->dba_is_buflet) {
		daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * available space check for payload is done later
		 * in _copy_data_sum_dbuf
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	/* Driver already provided a full checksum: copy only, no summing */
	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transfer to super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		if ((csum ^ 0xffff) == 0) {
			return true;
		} else {
			return false;
		}
	}

	/* Copy l3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;	/* TCP header + payload (l4len) */

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum payload (if any) */
	} else {
		/* First, copy and sum TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;	/* payload offset within the source pkt */
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	/* account for the headers just written into the first buffer */
	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold data sum to 16 bit and then into the partial */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold in the data checksum to TCP checksum */
	partial += *data_csum;

	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transfer to super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
847
848 SK_INLINE_ATTRIBUTE
849 static void
flow_agg_init_common(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * pkt)850 flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
851 struct __kern_packet *pkt)
852 {
853 struct ifnet *ifp;
854
855 switch (pkt->pkt_flow_ip_ver) {
856 case IPVERSION:
857 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
858 return;
859 }
860 break;
861 case IPV6_VERSION:
862 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
863 return;
864 }
865 break;
866 default:
867 VERIFY(0);
868 /* NOTREACHED */
869 __builtin_unreachable();
870 }
871
872 fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
873 fa->fa_ulen = pkt->pkt_flow_ulen;
874 fa->fa_total = pkt->pkt_flow_ip_hlen +
875 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
876
877 ifp = fsw->fsw_ifp;
878 ASSERT(ifp != NULL);
879 if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
880 /* in case hardware supports LRO, don't fix checksum in the header */
881 fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
882 } else {
883 fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
884 }
885 }
886
887 static void
flow_agg_init_smbuf(struct nx_flowswitch * fsw,struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)888 flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
889 struct mbuf *smbuf, struct __kern_packet *pkt)
890 {
891 FLOW_AGG_CLEAR(fa);
892
893 ASSERT(smbuf != NULL);
894 fa->fa_smbuf = smbuf;
895
896 fa->fa_sptr = mtod(smbuf, uint8_t *);
897 ASSERT(fa->fa_sptr != NULL);
898
899 /*
900 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
901 * contents of the flow structure which don't exist in 'smbuf'.
902 */
903 flow_agg_init_common(fsw, fa, pkt);
904 }
905
906 static void
flow_agg_init_spkt(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)907 flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
908 struct __kern_packet *spkt, struct __kern_packet *pkt)
909 {
910 FLOW_AGG_CLEAR(fa);
911
912 ASSERT(spkt != NULL);
913 fa->fa_spkt = spkt;
914 fa->fa_sobj_is_pkt = true;
915 VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
916
917 MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
918 ASSERT(fa->fa_sptr != NULL);
919
920 /*
921 * Note here we use 'pkt' instead of 'spkt', since we rely on the
922 * contents of the flow structure which don't exist in 'spkt'.
923 */
924 flow_agg_init_common(fsw, fa, pkt);
925 }
926
927 SK_INLINE_ATTRIBUTE
928 static bool
ipv4_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)929 ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
930 {
931 return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
932 }
933
934 SK_INLINE_ATTRIBUTE
935 static bool
ipv6_tcp_memcmp(const uint8_t * h1,const uint8_t * h2)936 ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
937 {
938 return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
939 }
940
941 SK_INLINE_ATTRIBUTE
942 static bool
can_agg_fastpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)943 can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
944 struct fsw_stats *fsws)
945 {
946 bool match;
947
948 ASSERT(fa->fa_sptr != NULL);
949 _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
950 _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
951
952 if (__improbable(pkt->pkt_length < MASK_SIZE)) {
953 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
954 goto slow_path;
955 }
956
957 if (__improbable(fa->fa_sobj_is_short)) {
958 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
959 goto slow_path;
960 }
961
962 if (__improbable(pkt->pkt_flow_tcp_hlen !=
963 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
964 goto slow_path;
965 }
966
967 switch (pkt->pkt_flow_ip_ver) {
968 case IPVERSION:
969 match = ipv4_tcp_memcmp(fa->fa_sptr,
970 (uint8_t *)pkt->pkt_flow_ip_hdr);
971 break;
972 case IPV6_VERSION:
973 match = ipv6_tcp_memcmp(fa->fa_sptr,
974 (uint8_t *)pkt->pkt_flow_ip_hdr);
975 break;
976 default:
977 VERIFY(0);
978 /* NOTREACHED */
979 __builtin_unreachable();
980 }
981
982 if (__improbable(!match)) {
983 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
984 goto slow_path;
985 }
986 if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
987 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
988 goto slow_path;
989 }
990
991 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
992 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
993 fa->fa_ulen = pkt->pkt_flow_ulen;
994 return true;
995
996 slow_path:
997 return false;
998 }
999
1000 SK_NO_INLINE_ATTRIBUTE
1001 static bool
can_agg_slowpath(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)1002 can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
1003 struct fsw_stats *fsws)
1004 {
1005 uint8_t *sl3_hdr = fa->fa_sptr;
1006 uint32_t sl3tlen = 0;
1007 uint16_t sl3hlen = 0;
1008
1009 DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
1010 uint8_t *, sl3_hdr);
1011
1012 ASSERT(sl3_hdr != NULL);
1013
1014 /*
1015 * Compare IP header length, TOS, frag flags and IP options
1016 * For IPv4, the options should match exactly
1017 * For IPv6, if options are present, bail out
1018 */
1019 if (pkt->pkt_flow_ip_ver == IPVERSION) {
1020 struct ip *siph = (struct ip *)(void *)sl3_hdr;
1021 struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;
1022
1023 ASSERT(siph->ip_v == IPVERSION);
1024 /* 16-bit alignment is sufficient (handles mbuf case) */
1025 ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
1026 ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));
1027
1028 sl3hlen = (siph->ip_hl << 2);
1029 if (sl3hlen != pkt->pkt_flow_ip_hlen) {
1030 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
1031 DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
1032 pkt->pkt_flow_ip_hlen);
1033 return false;
1034 }
1035
1036 if (siph->ip_ttl != iph->ip_ttl) {
1037 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
1038 DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
1039 uint8_t, iph->ip_ttl);
1040 return false;
1041 }
1042
1043 if (siph->ip_tos != iph->ip_tos) {
1044 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
1045 DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
1046 uint8_t, iph->ip_tos);
1047 return false;
1048 }
1049 /* For IPv4, DF bit should match */
1050 if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
1051 (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
1052 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
1053 DTRACE_SKYWALK2(aggr__fail5, uint16_t,
1054 ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
1055 return false;
1056 }
1057
1058 uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
1059 sizeof(struct ip);
1060 if (ip_opts_len > 0 &&
1061 memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
1062 ip_opts_len) != 0) {
1063 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
1064 DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
1065 uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
1066 (uint8_t *)(iph + 1));
1067 return false;
1068 }
1069 sl3tlen = ntohs(siph->ip_len);
1070 } else {
1071 struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
1072 struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;
1073
1074 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
1075 ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
1076 /* 16-bit alignment is sufficient (handles mbuf case) */
1077 ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
1078
1079 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
1080 /*
1081 * Don't aggregate if extension header is present in
1082 * packet. N.B. currently flow switch only classifies
1083 * frag header
1084 */
1085 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
1086 DTRACE_SKYWALK1(aggr__fail7, uint8_t,
1087 pkt->pkt_flow_ip_hlen);
1088 return false;
1089 }
1090
1091 sl3hlen = sizeof(struct ip6_hdr);
1092 /* For IPv6, flow info mask covers TOS and flow label */
1093 if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
1094 sizeof(sip6->ip6_flow)) != 0) {
1095 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
1096 DTRACE_SKYWALK2(aggr__fail8, uint32_t,
1097 ntohl(sip6->ip6_flow), uint32_t,
1098 ntohl(ip6->ip6_flow));
1099 return false;
1100 }
1101
1102 if (sip6->ip6_hlim != ip6->ip6_hlim) {
1103 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
1104 DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
1105 uint8_t, ip6->ip6_hlim);
1106 return false;
1107 }
1108
1109 sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
1110 }
1111
1112 /*
1113 * For TCP header, compare ACK number and window size
1114 * Compare TCP flags
1115 * Compare TCP header length and TCP options
1116 */
1117 struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
1118 struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
1119
1120 uint16_t sl4hlen = (stcp->th_off << 2);
1121 if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
1122 memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
1123 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
1124 DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
1125 uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
1126 uint16_t, ntohs(tcp->th_win));
1127 return false;
1128 }
1129
1130 if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
1131 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
1132 DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
1133 uint8_t, tcp->th_flags);
1134 return false;
1135 }
1136
1137 if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
1138 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
1139 DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
1140 uint8_t, pkt->pkt_flow_tcp_hlen);
1141 return false;
1142 }
1143
1144 uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
1145 /*
1146 * We know that the TCP-option lengthes are the same thanks to the above
1147 * sl4hlen check
1148 */
1149 if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
1150 (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
1151 /*
1152 * Fast-path header prediction:
1153 *
1154 * TCP Timestamp option is usually put after two NOP-headers,
1155 * and thus total TCP-option length is 12. If that's the case,
1156 * we can aggregate as only the TCP time-stamp option differs.
1157 */
1158 if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
1159 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
1160 DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
1161 return false;
1162 } else {
1163 uint32_t sts_hdr, ts_hdr;
1164 if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
1165 sts_hdr = *((uint32_t *)(stcp + 1));
1166 } else {
1167 bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
1168 }
1169 if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
1170 ts_hdr = *((uint32_t *)(tcp + 1));
1171 } else {
1172 bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
1173 }
1174
1175 if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
1176 ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
1177 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
1178 DTRACE_SKYWALK2(aggr__fail14, uint32_t,
1179 sts_hdr, uint32_t, ts_hdr);
1180 return false;
1181 }
1182 }
1183 }
1184 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
1185 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
1186 fa->fa_ulen = pkt->pkt_flow_ulen;
1187 return true;
1188 }
1189
1190 static bool
flow_agg_is_ok(struct flow_agg * fa,struct __kern_packet * pkt,struct fsw_stats * fsws)1191 flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
1192 struct fsw_stats *fsws)
1193 {
1194 /* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
1195 const uint32_t max_ip_len = MAX_AGG_IP_LEN();
1196 bool can_agg = false;
1197
1198 DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
1199 struct __kern_packet *, pkt);
1200
1201 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1202 if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
1203 pkt->pkt_flow_tcp_agg_fast = 0;
1204 }
1205 /*
1206 * Don't aggregate if any of the following is true:
1207 * 1. TCP flag is other than TH_{ACK,PUSH}
1208 * 2. Payload length is 0 (pure ACK)
1209 * 3. This is the first packet
1210 * 4. TCP sequence number is not expected
1211 * 5. We would've exceeded the maximum aggregated size
1212 * 6. It's not the first packet and the wake flag is set
1213 */
1214 if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
1215 pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
1216 DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
1217 goto done;
1218 }
1219 if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
1220 DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
1221 ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
1222 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
1223 goto done;
1224 }
1225 if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
1226 DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
1227 uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
1228 /* We've reached aggregation limit */
1229 STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
1230 goto done;
1231 }
1232 if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
1233 DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
1234 goto done;
1235 }
1236
1237 can_agg = can_agg_fastpath(fa, pkt, fsws);
1238 if (can_agg) {
1239 pkt->pkt_flow_tcp_agg_fast = 1;
1240 goto done;
1241 }
1242
1243 can_agg = can_agg_slowpath(fa, pkt, fsws);
1244 ASSERT(!pkt->pkt_flow_tcp_agg_fast);
1245
1246 done:
1247 return can_agg;
1248 }
1249
/*
 * Incrementally patch a 16-bit one's-complement checksum for a
 * halfword that changed from `old` to `new` (RFC 1624 style update,
 * delegated to __packet_fix_sum).
 */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	uint16_t fixed;

	fixed = __packet_fix_sum(csum, old, new);
	return fixed;
}
1255
/*
 * Checksum-fixup stub that performs no folding and always returns 0.
 * NOTE(review): presumably installed as fa->fa_fix_pkt_sum when header
 * checksum updates are unnecessary; the installer is not visible in
 * this part of the file -- confirm against flow_agg initialization.
 */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
    uint16_t __unused new)
{
	return 0;
}
1262
1263 static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg * fa,uint8_t * field,uint16_t * csum,uint32_t new)1264 flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa, uint8_t *field, uint16_t *csum,
1265 uint32_t new)
1266 {
1267 uint32_t old;
1268 memcpy(&old, field, sizeof(old));
1269 memcpy(field, &new, sizeof(uint32_t));
1270 *csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1271 (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1272 (uint16_t)(old & 0xffff),
1273 (uint16_t)(new & 0xffff));
1274 }
1275
/*
 * Merge pkt into the current super object (packet or mbuf) at the
 * header level: grow the IP total length, fold the new payload and
 * pseudo-header bytes into the TCP checksum, refresh the TCP timestamp
 * option when only it differed (slow path), propagate TH_PUSH, and
 * update the super object's length/segment-count metadata.  The
 * payload bytes themselves are copied by the caller.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	/* A super packet carries no L2 header or headroom. */
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	/* Account the merged payload in the running aggregate total. */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		/* Remember the pre-merge length for odd-offset handling. */
		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* Fold the length increase into the IPv4 header checksum. */
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		/*
		 * Slow-path merge with differing option bytes: per the
		 * slow-path check this can only be the timestamp option,
		 * so rewrite TSval/TSecr and patch the TCP checksum.
		 */
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			/* TSval at option offset 4, TSecr at offset 8. */
			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			/*
			 * Patch the checksum over the 16-bit word holding
			 * data-offset + flags (the bytes following th_ack).
			 */
			uint16_t old, new;
			old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		/* Saturate rather than wrap the 8-bit segment counter. */
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		/* Saturate rather than wrap the segment counter. */
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1445
/*
 * Copy metadata from source packet to destination packet.
 * Payload buffers are not touched; only the quantum-level and
 * packet-level metadata are duplicated.
 */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}
1456
/*
 * Finalize a packet handle before it is handed off; finalization
 * failure is treated as a programming error (VERIFY panics).
 */
static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	/* Debug builds: probe with a pointer to the packet's L3 start. */
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1471
1472 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t min_bufsize,uint32_t agg_bufsize)1473 estimate_buf_cnt(struct flow_entry *fe, uint32_t min_bufsize,
1474 uint32_t agg_bufsize)
1475 {
1476 uint32_t max_ip_len = MAX_AGG_IP_LEN();
1477 uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1478 uint32_t hdr_overhead;
1479
1480 agg_size = MIN(agg_size, agg_bufsize);
1481
1482 hdr_overhead = (fe->fe_rx_pktq_bytes / max_ip_len) *
1483 (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1484 sizeof(struct tcphdr));
1485
1486 return ((fe->fe_rx_pktq_bytes + hdr_overhead) / agg_size) + 1;
1487 }
1488
1489 SK_INLINE_ATTRIBUTE
1490 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1491 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1492 _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1493 {
1494 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1495 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1496 VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1497 pbuf = buf;
1498 dbuf_array->dba_buflet[i] = NULL;
1499 }
1500 ASSERT(pbuf != NULL);
1501 dbuf_array->dba_num_dbufs = 0;
1502 *lbuf = pbuf;
1503 }
1504
1505 SK_INLINE_ATTRIBUTE
1506 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1507 _free_dbuf_array(struct kern_pbufpool *pp,
1508 _dbuf_array_t *dbuf_array)
1509 {
1510 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1511 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1512 pp_free_buflet(pp, buf);
1513 dbuf_array->dba_buflet[i] = NULL;
1514 }
1515 dbuf_array->dba_num_dbufs = 0;
1516 }
1517
/*
 * Close out the in-progress super packet: mark it as an aggregate when
 * it spans multiple buflets, finalize it, record its size for the
 * convergence heuristic, and reset the caller's handles plus the
 * aggregation state so the next source packet starts fresh.
 */
static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	/* One more super packet produced in this batch. */
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	/* Track the largest aggregate seen, for converge_aggregation_size. */
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	/* Clear the caller's handles and the aggregation state. */
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}
1537
1538 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1539 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1540 {
1541 if (fe->fe_rx_largest_size > largest_agg_size) {
1542 /*
1543 * Make it slowly move towards largest_agg_size if we
1544 * consistently get non-aggregatable size.
1545 *
1546 * If we start at 16K, this makes us go to 4K within 6 rounds
1547 * and down to 2K within 12 rounds.
1548 */
1549 fe->fe_rx_largest_size -=
1550 ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1551 } else {
1552 fe->fe_rx_largest_size +=
1553 ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1554 }
1555 }
1556
/*
 * Aggregate the flow entry's Rx queue of TCP segments into super
 * packets destined for a channel ring.  Source packets (native or
 * mbuf-backed compat) are copied/checksummed into freshly allocated
 * destination buflets; consecutive in-order segments with compatible
 * headers are merged into a single super packet.  Unmergeable or
 * failing packets are moved to dropped_pkts; consumed sources are
 * freed at the end.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/*
 * Drop the current source packet and reset all aggregation state, so
 * the next source packet starts a brand-new super packet.
 */
#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt) do {			\
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt));				\
	(_pkt) = NULL;							\
	FLOW_AGG_CLEAR(&fa);						\
	prev_csum_ok = false;						\
} while (0)
	struct flow_agg fa;	/* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq pkts;	/* dst super packets */
	struct pktq disposed_pkts;	/* done src packets */

	KPKTQ_INIT(&pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		/* No destination ring: drop the whole input queue. */
		SK_ERR("Rx ring is NULL");
		KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(dropped_pkts));
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	/* Aggregation relies on multi-buflet packets. */
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t sbuf = NULL;	/* tail buflet of current super pkt */
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
	uint32_t largest_spkt = 0; /* largest aggregated packet size */
	uint32_t agg_bufsize;
	uint8_t iter = 0;	/* consumption cursor into buf_arr */
	uint32_t bft_alloc_flags = PP_ALLOC_BFT_ATTACH_BUFFER;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));

	/*
	 * Size the buflet batch: use large buflets when the flow has been
	 * producing aggregates bigger than the default buffer and the pool
	 * offers a large size; otherwise default-sized buflets, one per
	 * input packet when aggregation is off or all-ACK traffic.
	 */
	if (__probable(fe->fe_rx_largest_size != 0 &&
	    NX_FSW_TCP_RX_AGG_ENABLED())) {
		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
		    PP_BUF_SIZE_LARGE(dpp) == 0) {
			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		} else {
			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
			bft_alloc_flags |= PP_ALLOC_BFT_LARGE;
		}
		bh_cnt = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
		    agg_bufsize);
		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
		bh_cnt_tmp = bh_cnt;
	} else {
		/*
		 * No payload, thus it's all small-sized ACKs/...
		 * OR aggregation is disabled.
		 */
		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(&fe->fe_rx_pktq), MAX_BUFLET_COUNT);
		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
	}

	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
	    bft_alloc_flags);
	if (__improbable(bh_cnt == 0)) {
		/* Not fatal here; per-packet refill below may still work. */
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		/* Prefetch the next packet's buffer while we work. */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		/* Run TCP state tracking; may demand an RST on failure. */
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}

		if (is_mbuf) { /* compat */
			/* Strip L2 and carry over mbuf-side metadata. */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
			}
		}

		/*
		 * Packed-append attempt: if the previous packet verified
		 * clean and the current tail buflet has room for the
		 * payload, merge without consuming new buflets.
		 */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new buflet, just append the
				 * payload to the current tail buflet (sbuf).
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, agg_bufsize);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
			continue;
		}
		/* Refill the buflet batch if it cannot cover this packet. */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
			    agg_bufsize);
			tmp = MIN(tmp, MAX_BUFLET_COUNT);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP, bft_alloc_flags);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    (kern_buflet_t)(buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			/*
			 * Mergeable but the tail buflet was full: copy into
			 * the freshly staged buflets, still packed against
			 * the existing super packet.
			 */
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in the tail buflet, thus
			 * we must have added bytes to the staged buflets.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			/* Standalone copy: headers + payload, full csum. */
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				finalize_super_packet(&spkt, &sph, &fa,
				    &largest_spkt, &spkts, bufcnt);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&pkts, spkt);
			/* Stamp flow identity/policy onto the super packet. */
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		/* Source packet fully consumed; free it in a batch later. */
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	while (bh_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
		    &spkts, bufcnt);
	}
	converge_aggregation_size(fe, largest_spkt);
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	/* Replace the input queue's contents with the super packets. */
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
	KPKTQ_FINI(&pkts);

	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);

	pp_free_pktq(&disposed_pkts);
}
1893
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *dropped_pkts, bool is_mbuf)
{
/*
 * Account a source packet as dropped, move it to the caller-owned
 * dropped_pkts queue, and reset the aggregation state so the next
 * packet starts a fresh super buffer instead of being appended to
 * one built from packets that preceded the drop.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt) do { \
	drop_packets++; \
	drop_bytes += (_pkt)->pkt_length; \
	KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \
	(_pkt) = NULL; \
	FLOW_AGG_CLEAR(&fa); \
	prev_csum_ok = false; \
} while (0)
	/*
	 * Aggregate the TCP packets queued on fe->fe_rx_pktq into large
	 * "super" mbufs (LRO-style) and hand the resulting mbuf chain to
	 * the host (BSD) stack via fsw_host_sendup().  Source packets may
	 * be native skywalk packets (copied into freshly allocated mbufs)
	 * or mbuf-backed compat packets (is_mbuf; their mbufs are reused).
	 * Packets that cannot be processed are appended to dropped_pkts,
	 * which the caller frees.  Consumed source packets are collected
	 * on disposed_pkts and freed here.
	 */
	struct flow_agg fa;	/* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;	/* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0;	/* number of super mbufs on m_chain */
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;	/* feeds converge_aggregation_size() */
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc (native path only) */
	uint32_t mhead_cnt;
	uint32_t mhead_bufsize;
	struct mbuf *mhead = NULL;

	/* all packets of this flow batch share the same L2 header length */
	uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 * Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;

			/*
			 * Pick the smallest cluster size that fits the
			 * largest aggregated packet seen on this flow so
			 * far; beyond 16K, use a two-segment mbuf.
			 */
			if (fe->fe_rx_largest_size <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (fe->fe_rx_pktq_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, MCLBYTES,
				    mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = KPKTQ_LEN(&fe->fe_rx_pktq);
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			/*
			 * On allocation failure, walk down the cluster-size
			 * ladder (2x16K -> 16K -> 4K -> 2K) and retry; if
			 * even MCLBYTES fails, mhead stays NULL and the
			 * per-packet fallback allocation below is used.
			 */
			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			/* aggregation disabled or no size history yet */
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		if (tpkt != NULL) {
			/* warm the cache with the next packet's buffer */
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);	/* L3 + L4 header length */
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);	/* L3..payload */

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below. This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);	/* full frame length */
		uint16_t data_csum = 0;
		struct mbuf *m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			/* compat path: reuse the attached mbuf in place */
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			/*
			 * NOTE(review): trailer is derived from len
			 * (l2len + plen) but the assert below only
			 * guarantees m_pktlen(m) >= plen; if m_pktlen(m)
			 * were ever < len this unsigned subtraction would
			 * wrap — presumably upstream classification
			 * guarantees m_pktlen(m) >= len; confirm.
			 */
			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else { /* native */
			/* pad so the L3 header lands 4-byte aligned */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			/*
			 * Fast path: previous packet checksummed OK and
			 * this one is mergeable — try to pack its payload
			 * directly into the trailing space of curr_m.
			 */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try blocking allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				/* 0 lets the allocator pick the chunk count */
				if (tot_len > M16KCLBYTES) {
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
			} else {
				/* pop the head of the preallocated chain */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
				/* 16K+ of slack means the size guess was off */
				if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
			}
			/* skip pad bytes so L3 header is 4-byte aligned */
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				/*
				 * Collect up to MAX_BUFLET_COUNT mbufs of
				 * the new chain as copy destinations.
				 */
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				/* stand-alone copy: same destination walk */
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room for the L2 header, copied below */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			/* propagate Rx checksum metadata to the mbuf */
			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 * Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		if (__improbable(is_mbuf)) {
			/* headers must be contiguous for the agg check */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			/*
			 * Merge this packet into the current super mbuf:
			 * strip its headers and link the payload mbuf
			 * onto the end of curr_m's chain.
			 * NOTE(review): the assert below implies wake
			 * packets never pass flow_agg_is_ok(); confirm.
			 */
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4 hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* start a new super mbuf with this packet */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m->m_data, l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * if the super packet is an mbuf which can't accomodate
			 * (sizeof(struct ip6_tcp_mask) in a single buffer then
			 * do the aggregation check in slow path.
			 * Note that an mbuf without cluster has only 80 bytes
			 * available for data, sizeof(struct ip6_tcp_mask) is
			 * also 80 bytes, so if the packet contains an
			 * ethernet header, this mbuf won't be able to fully
			 * contain "struct ip6_tcp_mask" data in a single
			 * buffer.
			 */
			if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
				if (__improbable(smbuf->m_len <
				    ((smbuf->m_data -
				    (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
				    MASK_SIZE))) {
					fa.fa_sobj_is_short = true;
				}
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);

	/* Free any leftover mbufs, true only for native */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
		mhead_bufsize = 0;
	}

	/* feed the observed max back into the size estimator */
	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);
		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2419
2420 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe)2421 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe)
2422 {
2423 struct pktq dropped_pkts;
2424 bool is_mbuf;
2425
2426 if (__improbable(fe->fe_rx_frag_count > 0)) {
2427 dp_flow_rx_process(fsw, fe);
2428 return;
2429 }
2430
2431 KPKTQ_INIT(&dropped_pkts);
2432
2433 if (!dp_flow_rx_route_process(fsw, fe)) {
2434 SK_ERR("Rx route bad");
2435 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2436 STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2437 KPKTQ_LEN(&dropped_pkts));
2438 goto done;
2439 }
2440
2441 is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2442
2443 if (fe->fe_nx_port == FSW_VP_HOST) {
2444 boolean_t do_rx_agg;
2445
2446 /* BSD flow */
2447 if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2448 do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2449 SK_FSW_RX_AGG_TCP_HOST_ON);
2450 } else {
2451 do_rx_agg = !dlil_has_ip_filter() &&
2452 !dlil_has_if_filter(fsw->fsw_ifp);
2453 }
2454 if (__improbable(!do_rx_agg)) {
2455 fsw_host_rx(fsw, &fe->fe_rx_pktq);
2456 return;
2457 }
2458 if (__improbable(pktap_total_tap_count != 0)) {
2459 fsw_snoop(fsw, fe, true);
2460 }
2461 flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2462 } else {
2463 /* channel flow */
2464 if (__improbable(pktap_total_tap_count != 0)) {
2465 fsw_snoop(fsw, fe, true);
2466 }
2467 flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2468 }
2469
2470 done:
2471 pp_free_pktq(&dropped_pkts);
2472 }
2473