1 /*
2 * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/netif/nx_netif_compat.h>
35 #include <netinet/tcp.h>
36 #include <netinet/ip.h>
37 #include <netinet/ip6.h>
38 #include <net/pktap.h>
39 #include <sys/sdt.h>
40
#define MAX_AGG_IP_LEN() MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT (32)
/* TCP flags that disqualify a segment from aggregation */
#define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt) ((_pkt)->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_pflags & PKT_F_TRUNCATED))
/*
 * Fix: the mbuf leg previously read "pkt->pkt_mbuf" instead of the
 * macro argument "_pkt", silently capturing whatever variable named
 * "pkt" existed at the expansion site.  All argument uses are also
 * parenthesized for macro hygiene.
 */
#define PKT_IS_WAKE_PKT(_pkt) ((PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
	(!PKT_IS_MBUF(_pkt) && \
	((_pkt)->pkt_pflags & PKT_F_WAKE_PKT)))
51
52
/*
 * Signature of the helper used to patch a 16-bit packet checksum when a
 * header field changes (presumably incremental ones'-complement update
 * of (csum, old, new) — see the definitions below; TODO confirm).
 */
typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

/* Incremental checksum fix-up (defined later in this file) */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

/* No-op variant, selected when the NIC does LRO (see flow_agg_init_common) */
static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60
61 /*
62 * This structure holds per-super object (mbuf/packet) flow aggregation.
63 */
64 struct flow_agg {
65 union {
66 struct {
67 union {
68 void * _fa_sobj;
69 struct mbuf * _fa_smbuf; /* super mbuf */
70 struct __kern_packet *_fa_spkt; /* super pkt */
71 };
72 uint8_t *__indexable _fa_sptr; /* ptr to super IP header */
73 bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
74 /*
75 * super obj is not large enough to hold the IP & TCP
76 * header in a contiguous buffer.
77 */
78 bool _fa_sobj_is_short;
79 uint32_t _fa_tcp_seq; /* expected next sequence # */
80 uint32_t _fa_ulen; /* expected next ulen */
81 uint32_t _fa_total; /* total aggregated bytes */
82 /* function that fix packet checksum */
83 flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
84 } __flow_agg;
85 uint64_t __flow_agg_data[5];
86 };
87 #define fa_sobj __flow_agg._fa_sobj
88 #define fa_smbuf __flow_agg._fa_smbuf
89 #define fa_spkt __flow_agg._fa_spkt
90 #define fa_sptr __flow_agg._fa_sptr
91 #define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt
92 #define fa_sobj_is_short __flow_agg._fa_sobj_is_short
93 #define fa_tcp_seq __flow_agg._fa_tcp_seq
94 #define fa_ulen __flow_agg._fa_ulen
95 #define fa_total __flow_agg._fa_total
96 #define fa_fix_pkt_sum __flow_agg._fa_fix_pkt_sum
97 };
98
/*
 * Reset a flow_agg to its initial all-zero state.  With -fbounds-safety
 * (__has_ptrcheck) the wide __indexable pointer grows the struct to 48
 * bytes; otherwise it is 40.  The _CASSERTs pin the layout assumed by
 * the sk_zero_* fast-bzero helpers.  In the non-ptrcheck case sk_zero_32
 * only covers the first 32 bytes, so the trailing function pointer at
 * offset 32 is cleared separately; the same assignment is kept in the
 * ptrcheck case for symmetry.
 */
#if __has_ptrcheck
#define FLOW_AGG_CLEAR(_fa) do { \
	_CASSERT(sizeof(struct flow_agg) == 48); \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40); \
	sk_zero_48(_fa); \
	(_fa)->fa_fix_pkt_sum = 0; \
} while (0)
#else
#define FLOW_AGG_CLEAR(_fa) do { \
	_CASSERT(sizeof(struct flow_agg) == 40); \
	_CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \
	sk_zero_32(_fa); \
	(_fa)->fa_fix_pkt_sum = 0; \
} while (0)
#endif
114
#define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */

/*
 * Byte mask applied when comparing two IPv4+TCP headers for
 * aggregation eligibility: set bits mean "must match".  Option mask is
 * sized to MAX_TCPOPTLEN so the struct pads out to MASK_SIZE, keeping
 * the IPv4 and IPv6 compares operating on same-sized operands.
 */
struct ip_tcp_mask {
	struct ip ip_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};
122
/*
 * IPv4 compare mask.  Fields that legitimately differ between
 * consecutive segments of one flow are excluded (mask 0): ip_len,
 * ip_id, ip_sum, th_seq, th_sum and the PSH flag.  Everything else —
 * version/hlen, TOS, frag bits, TTL, protocol, addresses, ports, ack,
 * window, urgent pointer and the first 12 option bytes — must match.
 */
static const struct ip_tcp_mask ip_tcp_mask
    __sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};
166
/*
 * IPv6 counterpart of ip_tcp_mask: ip6_hdr (40) + tcphdr (20) +
 * 20 bytes of option mask = MASK_SIZE (80).
 */
struct ip6_tcp_mask {
	struct ip6_hdr ip6_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[5]; /* 5 32-bit words (20 bytes) to fill up to MASK_SIZE */
};
172
173 static const struct ip6_tcp_mask ip6_tcp_mask
174 __sk_aligned(16) =
175 {
176 .ip6_m = {
177 .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
178 /* Not checked; aggregated packet's ip_len is increasing */
179 .ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
180 .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
181 .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
182 .ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
183 .ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
184 .ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
185 .ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
186 .ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
187 .ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
188 .ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
189 .ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
190 },
191 .tcp_m = {
192 .th_sport = 0xffff,
193 .th_dport = 0xffff,
194 .th_seq = 0,
195 .th_ack = 0xffffffff,
196 .th_x2 = 0xf,
197 .th_off = 0xf,
198 .th_flags = ~TH_PUSH,
199 .th_win = 0xffff,
200 .th_sum = 0,
201 .th_urp = 0xffff,
202 },
203 .tcp_option_m = {
204 /* Max 40 bytes of TCP options */
205 0xffffffff,
206 0xffffffff,
207 0xffffffff,
208 0, /* Filling up to MASK_SIZE */
209 0, /* Filling up to MASK_SIZE */
210 },
211 };
212
213 #if SK_LOG
214 SK_LOG_ATTRIBUTE
215 static void
_pkt_agg_log(struct __kern_packet * pkt,struct proc * p,bool is_input)216 _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
217 {
218 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
219 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
220
221 kern_packet_t ph = SK_PKT2PH(pkt);
222 uint64_t bufcnt = 1;
223 if (!is_input) {
224 bufcnt = kern_packet_get_buflet_count(ph);
225 }
226
227 SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
228 sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
229 SK_KVA(pkt), pkt->pkt_length);
230
231 SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
232 is_input ? "s":"d", pkt->pkt_csum_flags,
233 (uint32_t)pkt->pkt_csum_rx_start_off,
234 (uint32_t)pkt->pkt_csum_rx_value);
235
236 if (!is_input) {
237 kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);
238
239 /* Individual buflets */
240 for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
241 SK_DF(logflags | SK_VERB_DUMP, "%s",
242 sk_dump("buf", __buflet_get_data_address(buf),
243 __buflet_get_data_length(buf), 128, NULL, 0));
244 buf = kern_packet_get_next_buflet(ph, buf);
245 }
246 }
247 }
248
/* Verbosity-gated wrapper: only calls the logger when sk_verbose is set */
#define pkt_agg_log(_pkt, _p, _is_input) do { \
	if (__improbable(sk_verbose != 0)) { \
		_pkt_agg_log(_pkt, _p, _is_input); \
	} \
} while (0)
254
255 SK_LOG_ATTRIBUTE
256 static void
_mbuf_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)257 _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
258 {
259 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
260 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
261
262 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
263 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
264 m->m_pkthdr.len);
265
266 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
267 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
268 (uint32_t)m->m_pkthdr.csum_rx_val);
269
270 /* Dump the first mbuf */
271 ASSERT(m_mtod_current(m) != NULL);
272 SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
273 (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0));
274 }
275
/* Verbosity-gated wrapper: only calls the logger when sk_verbose is set */
#define mbuf_agg_log(_m, _p, _is_mbuf) do { \
	if (__improbable(sk_verbose != 0)) { \
		_mbuf_agg_log(_m, _p, _is_mbuf); \
	} \
} while (0)
281
282 SK_LOG_ATTRIBUTE
283 static void
_mchain_agg_log(struct mbuf * m,struct proc * p,bool is_mbuf)284 _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
285 {
286 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
287 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
288
289 while (m != NULL) {
290 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
291 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
292 m->m_pkthdr.len);
293
294 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
295 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
296 (uint32_t)m->m_pkthdr.csum_rx_val);
297
298 m = m->m_nextpkt;
299 }
300 }
301
/* Verbosity-gated wrapper: only calls the logger when sk_verbose is set */
#define mchain_agg_log(_m, _p, _is_mbuf) do { \
	if (__improbable(sk_verbose != 0)) { \
		_mchain_agg_log(_m, _p, _is_mbuf); \
	} \
} while (0)
307 #else
308 #define pkt_agg_log(...)
309 #define mbuf_agg_log(...)
310 #define mchain_agg_log(...)
311 #endif /* SK_LOG */
312
313 /*
314 * Checksum only for packet with mbuf.
315 */
316 static bool
mbuf_csum(struct __kern_packet * pkt,struct mbuf * m,bool verify_l3,uint16_t * data_csum)317 mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
318 uint16_t *data_csum)
319 {
320 ASSERT(data_csum != NULL);
321
322 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
323 uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
324 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
325 uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
326 uint16_t start = pkt->pkt_l2_len;
327 uint32_t partial = 0;
328 uint16_t csum = 0;
329
330 ASSERT(plen == m_pktlen(m));
331
332 /* Some compat drivers compute full checksum */
333 if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
334 CSUM_RX_FULL_FLAGS) {
335 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
336 m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
337 m->m_pkthdr.csum_rx_val);
338
339 /* Compute the data_csum */
340 struct tcphdr *tcp =
341 (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
342 pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
343 /* 16-bit alignment is sufficient */
344 ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));
345
346 uint16_t th_sum = tcp->th_sum;
347 tcp->th_sum = 0;
348
349 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
350 pkt->pkt_flow_tcp_hlen);
351 partial += htons(l4len + IPPROTO_TCP);
352 if (pkt->pkt_flow_ip_ver == IPVERSION) {
353 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
354 pkt->pkt_flow_ipv4_dst.s_addr, partial);
355 } else {
356 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
357 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
358 &pkt->pkt_flow_ipv6_dst, partial);
359 }
360 /* Restore the original checksum */
361 tcp->th_sum = th_sum;
362 th_sum = __packet_fix_sum(th_sum, csum, 0);
363 *data_csum = ~th_sum & 0xffff;
364
365 /* pkt metadata will be transfer to super packet */
366 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
367 0, m->m_pkthdr.csum_rx_val, false);
368
369 if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
370 return true;
371 } else {
372 return false;
373 }
374 }
375 /* Reset the csum RX flags */
376 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
377 if (verify_l3) {
378 csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
379 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
380 start, pkt->pkt_flow_ip_hlen, csum);
381 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
382 if ((csum ^ 0xffff) != 0) {
383 return false;
384 } else {
385 m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
386 }
387 }
388 /* Compute L4 header checksum */
389 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
390 pkt->pkt_flow_tcp_hlen);
391 /* Compute payload checksum */
392 start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
393 *data_csum = m_sum16(m, start, (plen - start));
394
395 /* Fold in the data checksum to TCP checksum */
396 partial += *data_csum;
397 partial += htons(l4len + IPPROTO_TCP);
398 if (pkt->pkt_flow_ip_ver == IPVERSION) {
399 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
400 pkt->pkt_flow_ipv4_dst.s_addr, partial);
401 } else {
402 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
403 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
404 &pkt->pkt_flow_ipv6_dst, partial);
405 }
406 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
407 start - pkt->pkt_flow_tcp_hlen, l4len, csum);
408 // Set start to 0 for full checksum
409 m->m_pkthdr.csum_rx_start = 0;
410 m->m_pkthdr.csum_rx_val = csum;
411 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
412
413 /* pkt metadata will be transfer to super packet */
414 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
415 0, csum, false);
416
417 if ((csum ^ 0xffff) != 0) {
418 return false;
419 }
420
421 return true;
422 }
423
/* structure to pass an array of data buffers (buflets or mbufs) */
typedef struct _dbuf_array {
	union {
		/* valid when dba_is_buflet is true */
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		/* valid when dba_is_buflet is false */
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;	/* number of valid entries */
	bool dba_is_buflet;	/* selects which union member is active */
} _dbuf_array_t;
433
/*
 * Copy 'plen' bytes of 'spkt' starting at offset 'soff' into the
 * destination buffer array 'dbuf', appending to each entry and spilling
 * into the next as each fills.  When 'do_csum' is set, the partial
 * ones'-complement sum of the copied bytes is accumulated into
 * '*partial_sum' ('*odd_start' carries byte parity across segments).
 * Destination lengths are updated as data lands in each buflet/mbuf.
 */
static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		/* locate the append position and remaining space in entry i */
		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			/* XXX -fbounds-safety: use the inline variant to return an __indexable */
			dbuf_addr = __buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}

		tmplen = min(plen, dbuf_lim);
		/* source may be a truncated mbuf or a native packet */
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			/* packet-header length lives on the head mbuf */
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
494
495 /*
496 * Copy (fill) and checksum for packet.
497 * spkt: source IP packet.
498 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
499 * verify_l3: verify IPv4 header checksum.
500 * currm: destination mbuf.
501 * currp: destination skywalk packet.
502 * dbuf: additional destination data buffer(s), used when current destination
503 * packet is out of space.
504 * added: amount of data copied from spkt to the additional buffer.
505 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
506 */
507 static bool
copy_pkt_csum_packed(struct __kern_packet * spkt,uint32_t plen,_dbuf_array_t * dbuf,bool verify_l3,struct mbuf * currm,struct __kern_buflet * currp,uint16_t * data_csum,int * added)508 copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
509 _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
510 struct __kern_buflet *currp, uint16_t *data_csum, int *added)
511 {
512 ASSERT(data_csum != NULL);
513
514 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
515 SK_VERB_COPY));
516
517 uint16_t start = 0, csum = 0;
518 uint32_t len = 0;
519 uint32_t l4len;
520 /* soff is only used for packets */
521 uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
522 uint32_t data_partial = 0, partial = 0;
523 int32_t curr_oldlen;
524 uint32_t curr_trailing;
525 char *curr_ptr;
526 int32_t curr_len;
527 uint16_t data_off;
528 uint32_t tmplen;
529 boolean_t odd_start = FALSE;
530 bool verify_l4;
531
532 /* One of them must be != NULL, but they can't be both set */
533 VERIFY((currm != NULL || currp != NULL) &&
534 ((currm != NULL) != (currp != NULL)));
535
536 if (currm != NULL) {
537 curr_oldlen = currm->m_len;
538 curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
539 curr_ptr = mtod(currm, char *) + currm->m_len;
540 curr_len = currm->m_len;
541 } else {
542 curr_oldlen = currp->buf_dlen;
543 curr_trailing = currp->buf_dlim - currp->buf_doff -
544 currp->buf_dlen;
545 /* XXX -fbounds-safety: use the inline variant to return an __indexable */
546 curr_ptr = (char *)__buflet_get_data_address(currp) + currp->buf_doff +
547 currp->buf_dlen;
548 curr_len = currp->buf_dlen;
549 }
550
551 /* Verify checksum only for IPv4 */
552 len = spkt->pkt_flow_ip_hlen;
553 verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
554 if (verify_l3) {
555 if (PKT_IS_TRUNC_MBUF(spkt)) {
556 partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
557 len, 0, 0);
558 } else {
559 partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
560 }
561
562 csum = __packet_fold_sum(partial);
563 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
564 len, csum);
565 spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
566 if ((csum ^ 0xffff) != 0) {
567 /* No need to copy & checkum TCP+payload */
568 return false;
569 } else {
570 spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
571 }
572 }
573
574 verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);
575
576 /* Copy & verify TCP checksum */
577 start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
578 l4len = plen - spkt->pkt_flow_ip_hlen;
579 len = plen - start;
580 if (PKT_IS_TRUNC_MBUF(spkt)) {
581 tmplen = min(len, curr_trailing);
582 odd_start = FALSE;
583
584 /* First, simple checksum on the TCP header */
585 if (verify_l4) {
586 partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
587 spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
588 }
589
590 /* Now, copy & sum the payload */
591 if (tmplen > 0) {
592 data_partial = m_copydata_sum(spkt->pkt_mbuf,
593 start, tmplen, curr_ptr, 0, &odd_start);
594 curr_len += tmplen;
595 }
596 data_off = start + tmplen;
597 } else {
598 tmplen = min(len, curr_trailing);
599 odd_start = FALSE;
600
601 /* First, simple checksum on the TCP header */
602 if (verify_l4) {
603 partial = pkt_sum(SK_PKT2PH(spkt), (soff +
604 spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
605 }
606
607 /* Now, copy & sum the payload */
608 if (tmplen > 0) {
609 data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
610 (soff + start), (uint8_t *)curr_ptr, tmplen,
611 true, 0, &odd_start);
612 curr_len += tmplen;
613 }
614 data_off = soff + start + tmplen;
615 }
616
617 /* copy & sum remaining payload in additional buffers */
618 if ((len - tmplen) > 0) {
619 ASSERT(dbuf != NULL);
620 _copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
621 &data_partial, &odd_start, dbuf, true);
622 *added = (len - tmplen);
623 }
624
625 /* Fold data checksum to 16 bit */
626 *data_csum = __packet_fold_sum(data_partial);
627
628 if (currm != NULL) {
629 currm->m_len = curr_len;
630 } else {
631 currp->buf_dlen = curr_len;
632 }
633
634 if (verify_l4) {
635 /* Fold in the data checksum to TCP checksum */
636 partial += *data_csum;
637 partial += htons(l4len + IPPROTO_TCP);
638 if (spkt->pkt_flow_ip_ver == IPVERSION) {
639 csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
640 spkt->pkt_flow_ipv4_dst.s_addr, partial);
641 } else {
642 ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
643 csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
644 &spkt->pkt_flow_ipv6_dst, partial);
645 }
646 /* pkt metadata will be transfer to super packet */
647 __packet_set_inet_checksum(SK_PKT2PH(spkt),
648 PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
649 } else {
650 /* grab csum value from offload */
651 csum = spkt->pkt_csum_rx_value;
652 }
653
654 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
655 start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));
656
657 if ((csum ^ 0xffff) != 0) {
658 /*
659 * Revert whatever we did here!
660 * currm/currp should be restored to previous value.
661 * dbuf (for additional payload) should be restore to 0.
662 */
663 if (currm != NULL) {
664 currm->m_len = curr_oldlen;
665 } else {
666 currp->buf_dlen = curr_oldlen;
667 }
668 if (dbuf != NULL) {
669 for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
670 if (dbuf->dba_is_buflet) {
671 struct __kern_buflet *b = dbuf->dba_buflet[i];
672 kern_buflet_set_data_length(b, 0);
673 kern_buflet_set_data_offset(b, 0);
674 } else {
675 struct mbuf *m = dbuf->dba_mbuf[i];
676 m->m_len = m->m_pkthdr.len = 0;
677 }
678 }
679 }
680
681 return false;
682 }
683
684 return true;
685 }
686
687 /*
688 * Copy and checksum for packet or packet with mbuf
689 * data_csum is only supported for bsd flows
690 */
691 static bool
copy_pkt_csum(struct __kern_packet * pkt,uint32_t plen,_dbuf_array_t * dbuf,uint16_t * data_csum,bool verify_l3)692 copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
693 uint16_t *data_csum, bool verify_l3)
694 {
695 /*
696 * To keep this routine simple and optimal, we are asserting on the
697 * assumption that the smallest flowswitch packet pool buffer should
698 * be large enough to hold the IP and TCP headers in the first buflet.
699 */
700 _CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);
701
702 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
703 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
704
705 uint16_t start = 0, csum = 0;
706 uint32_t len = 0;
707 /* soff is only used for packets */
708 uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
709 uint32_t data_partial = 0, partial = 0;
710 boolean_t odd_start = false;
711 uint32_t data_len;
712 uint16_t dbuf_off;
713 uint16_t copied_len = 0;
714 bool l3_csum_ok;
715 uint8_t *daddr;
716
717 if (dbuf->dba_is_buflet) {
718 /* XXX -fbounds-safety: use the inline variant to return an __indexable */
719 daddr = __buflet_get_data_address(dbuf->dba_buflet[0]);
720 daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
721 } else {
722 daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
723 daddr += dbuf->dba_mbuf[0]->m_len;
724 /*
725 * available space check for payload is done later
726 * in _copy_data_sum_dbuf
727 */
728 ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
729 pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
730 }
731
732 if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
733 /* copy only */
734 _copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
735 plen, &partial, &odd_start, dbuf, false);
736 if (PKT_IS_MBUF(pkt)) {
737 csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
738 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
739 pkt->pkt_mbuf->m_pkthdr.csum_flags,
740 pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
741 } else {
742 csum = pkt->pkt_csum_rx_value;
743 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
744 pkt->pkt_csum_flags,
745 pkt->pkt_csum_rx_start_off, csum);
746 }
747
748 /* pkt metadata will be transfer to super packet */
749 __packet_set_inet_checksum(SK_PKT2PH(pkt),
750 PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
751 if ((csum ^ 0xffff) == 0) {
752 return true;
753 } else {
754 return false;
755 }
756 }
757
758 /* Copy l3 & verify checksum only for IPv4 */
759 start = 0;
760 len = pkt->pkt_flow_ip_hlen;
761 if (PKT_IS_TRUNC_MBUF(pkt)) {
762 partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
763 (daddr + start), 0, NULL);
764 } else {
765 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
766 (daddr + start), len, true, 0, NULL);
767 }
768 verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
769 l3_csum_ok = !verify_l3;
770 if (verify_l3) {
771 csum = __packet_fold_sum(partial);
772 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
773 start, len, csum);
774 pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
775 if ((csum ^ 0xffff) != 0) {
776 /* proceed to copy the rest of packet */
777 } else {
778 pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
779 l3_csum_ok = true;
780 }
781 }
782 copied_len += pkt->pkt_flow_ip_hlen;
783
784 /* Copy & verify TCP checksum */
785 start = pkt->pkt_flow_ip_hlen;
786 len = plen - start;
787
788 if (PKT_IS_TRUNC_MBUF(pkt)) {
789 /* First, copy and sum TCP header */
790 partial = m_copydata_sum(pkt->pkt_mbuf, start,
791 pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);
792
793 data_len = len - pkt->pkt_flow_tcp_hlen;
794 start += pkt->pkt_flow_tcp_hlen;
795 dbuf_off = start;
796 /* Next, copy and sum payload (if any) */
797 } else {
798 /* First, copy and sum TCP header */
799 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
800 (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);
801
802 data_len = len - pkt->pkt_flow_tcp_hlen;
803 start += pkt->pkt_flow_tcp_hlen;
804 dbuf_off = start;
805 start += soff;
806 }
807 copied_len += pkt->pkt_flow_tcp_hlen;
808
809 if (dbuf->dba_is_buflet) {
810 VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
811 kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
812 copied_len) == 0);
813 } else {
814 dbuf->dba_mbuf[0]->m_len += copied_len;
815 dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
816 }
817
818 /* copy and sum payload (if any) */
819 if (data_len > 0) {
820 odd_start = false;
821 _copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
822 &odd_start, dbuf, l3_csum_ok);
823 }
824
825 if (__improbable(!l3_csum_ok)) {
826 return false;
827 }
828
829 /* Fold data sum to 16 bit and then into the partial */
830 *data_csum = __packet_fold_sum(data_partial);
831
832 /* Fold in the data checksum to TCP checksum */
833 partial += *data_csum;
834
835 partial += htons(len + IPPROTO_TCP);
836 if (pkt->pkt_flow_ip_ver == IPVERSION) {
837 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
838 pkt->pkt_flow_ipv4_dst.s_addr, partial);
839 } else {
840 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
841 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
842 &pkt->pkt_flow_ipv6_dst, partial);
843 }
844
845 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
846 pkt->pkt_flow_ip_hlen, len, csum);
847
848 /* pkt metadata will be transfer to super packet */
849 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
850 0, csum, false);
851 if ((csum ^ 0xffff) != 0) {
852 return false;
853 }
854
855 return true;
856 }
857
/*
 * Shared initialization of a flow_agg from the first packet of a run.
 * Leaves 'fa' in its cleared state (aggregation disabled) when the IP
 * header carries options/extension headers, i.e. when its length is not
 * the plain fixed header size.
 */
SK_INLINE_ATTRIBUTE
static void
flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp;

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
			return;
		}
		break;
	case IPV6_VERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			return;
		}
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* sequence number the next in-order segment must carry */
	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	/* bytes aggregated so far: IP hdr + TCP hdr + payload */
	fa->fa_total = pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;

	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
		/* in case hardware supports LRO, don't fix checksum in the header */
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
	} else {
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
	}
}
896
897 static void
flow_agg_init_smbuf(struct nx_flowswitch * fsw,struct flow_agg * fa,struct mbuf * smbuf,struct __kern_packet * pkt)898 flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
899 struct mbuf *smbuf, struct __kern_packet *pkt)
900 {
901 FLOW_AGG_CLEAR(fa);
902
903 ASSERT(smbuf != NULL);
904 fa->fa_smbuf = smbuf;
905
906 fa->fa_sptr = mtod(smbuf, uint8_t *);
907 ASSERT(fa->fa_sptr != NULL);
908
909 /*
910 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
911 * contents of the flow structure which don't exist in 'smbuf'.
912 */
913 flow_agg_init_common(fsw, fa, pkt);
914 }
915
916 static void
flow_agg_init_spkt(struct nx_flowswitch * fsw,struct flow_agg * fa,struct __kern_packet * spkt,struct __kern_packet * pkt)917 flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
918 struct __kern_packet *spkt, struct __kern_packet *pkt)
919 {
920 FLOW_AGG_CLEAR(fa);
921
922 ASSERT(spkt != NULL);
923 fa->fa_spkt = spkt;
924 fa->fa_sobj_is_pkt = true;
925 VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
926
927 MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
928 ASSERT(fa->fa_sptr != NULL);
929
930 /*
931 * Note here we use 'pkt' instead of 'spkt', since we rely on the
932 * contents of the flow structure which don't exist in 'spkt'.
933 */
934 flow_agg_init_common(fsw, fa, pkt);
935 }
936
937 /*
938 * -fbounds-safety: The reason hardcoded values 64 (and 80) are used here is
939 * because this function calls the 64-byte version of sk memcmp function (same
940 * thing for the 80-byte version). In can_agg_fastpath, there is a check being
941 * done for TCP header length with options: sizeof(struct tcphdr) +
942 * TCPOLEN_TSTAMP_APPA , which is 20 + 12 = 32 bytes. In case of IPv4, adding IP
943 * header size of 20 to it makes it 52 bytes. From the sk_memcmp_* variants, the
944 * closest one is the 64B option.
945 */
946 SK_INLINE_ATTRIBUTE
947 static bool
948 ipv4_tcp_memcmp(const uint8_t *__counted_by(64)h1, const uint8_t *__counted_by(64)h2)
949 {
950 return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
951 }
952
953 SK_INLINE_ATTRIBUTE
954 static bool
955 ipv6_tcp_memcmp(const uint8_t *__counted_by(80)h1, const uint8_t *__counted_by(80)h2)
956 {
957 return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
958 }
959
/*
 * Fast-path aggregation eligibility check: a single masked compare of
 * the super object's combined IP+TCP header against the candidate
 * packet's.  The masks exclude fields that legitimately change per
 * segment (ip_len/plen, ip_id, checksums, th_seq, PSH); the TCP header
 * must be exactly base header + timestamp option.  Falls back to the
 * slow path (returns false) on any mismatch, updating the relevant
 * "why not" statistic.
 *
 * NOTE(review): th_seq is masked out of the compare and not validated
 * here even though fa_tcp_seq is advanced below — confirm sequence
 * continuity is checked by the caller or the slow path.
 */
SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;
	uint8_t *ip_hdr;

	ASSERT(fa->fa_sptr != NULL);
	/* both masks must cover exactly MASK_SIZE header bytes */
	_CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	_CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	/* packet too short for a whole-mask compare */
	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	/* super object's headers are not contiguous; can't compare in one shot */
	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	/* only base header + 12-byte timestamp option qualifies */
	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		/*
		 * -fbounds-safety: pkt->pkt_flow_ip_hdr is a mach_vm_address_t,
		 * so we forge it here. The reason the constant values 64 and 80
		 * are used is because ipv4_tcp_memcmp takes a __counted_by(64)
		 * and __counted_by(80), respectively.
		 */
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 64);
		match = ipv4_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	case IPV6_VERSION:
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 80);
		match = ipv6_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	/* eligible: advance the expected sequence number by this payload */
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}
1027
/*
 * Slow-path check: field-by-field comparison of the super packet's
 * IP and TCP headers (fa->fa_sptr) against the candidate packet's
 * headers.  Differences that would change the merged packet's
 * semantics (header length, TTL/hop limit, TOS/flow label, DF bit,
 * IP options, TCP ACK/window/flags, non-timestamp TCP options) reject
 * the packet with a per-reason stat; TCP timestamp option values are
 * allowed to differ.  On success the expected TCP sequence number is
 * advanced by the candidate's payload length.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;
	/* -fbounds-safety: pkt_flow_ip_hdr is a mach_vm_address_t */
	uint8_t *l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_ip_hdr, pkt->pkt_flow_ip_hlen);
	/* NOTE(review): sl3tlen is assigned below but never read here */
	uint32_t sl3tlen = 0;
	uint16_t sl3hlen = 0;

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options
	 * For IPv4, the options should match exactly
	 * For IPv6, if options are present, bail out
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)(void *)l3_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, DF bit should match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		/* IPv4 options (header bytes past the fixed 20) must match exactly */
		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if extension header is present in
			 * packet. N.B. currently flow switch only classifies
			 * frag header
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, flow info mask covers TOS and flow label */
		if (memcmp((uint8_t *)&sip6->ip6_flow, (uint8_t *)&ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For TCP header, compare ACK number and window size
	 * Compare TCP flags
	 * Compare TCP header length and TCP options
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	struct tcphdr *tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	uint16_t sl4hlen = (stcp->th_off << 2);
	/* memcmp (not ==) keeps the comparison alignment-safe */
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		/* NOTE(review): probe name duplicates aggr__fail9 above */
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	/* all flags except PUSH must match (PUSH is merged later) */
	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengthes are the same thanks to the above
	 * sl4hlen check
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * TCP Timestamp option is usually put after two NOP-headers,
		 * and thus total TCP-option length is 12. If that's the case,
		 * we can aggregate as only the TCP time-stamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			/* read the first 4 option bytes alignment-safely */
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			/* both must carry NOP,NOP,TSTAMP — otherwise reject */
			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	/* advance the next expected TCP sequence number past this payload */
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}
1221
/*
 * Top-level per-packet aggregation decision: apply the cheap
 * disqualifiers (TCP flags, zero payload, no super packet yet,
 * unexpected sequence number, size limit, wake packet), then try the
 * masked fast path and finally the field-by-field slow path.  Sets
 * pkt_flow_tcp_agg_fast when the fast path accepted the packet.
 */
static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	/* clear any stale fast-path marker from a previous pass */
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flag is other than TH_{ACK,PUSH}
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. TCP sequence number is not expected
	 * 5. We would've exceeded the maximum aggregated size
	 * 6. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	/* a wake packet must not be buried inside an aggregate */
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}
1281
/*
 * Checksum-fixup callback installed in fa_fix_pkt_sum when fixups are
 * enabled; delegates to __packet_fix_sum() to patch 'csum' for a 16-bit
 * word changing from 'old' to 'new'.
 */
static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	uint16_t fixed = __packet_fix_sum(csum, old, new);

	return fixed;
}
1287
1288 static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum,uint16_t __unused old,uint16_t __unused new)1289 flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
1290 uint16_t __unused new)
1291 {
1292 return 0;
1293 }
1294
1295 static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg * fa,uint8_t * __sized_by (sizeof (uint32_t))field,uint16_t * csum,uint32_t new)1296 flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa,
1297 uint8_t *__sized_by(sizeof(uint32_t))field, uint16_t *csum,
1298 uint32_t new)
1299 {
1300 uint32_t old;
1301 memcpy((uint8_t *)&old, field, sizeof(old));
1302 memcpy(field, (uint8_t *)&new, sizeof(uint32_t));
1303 *csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1304 (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1305 (uint16_t)(old & 0xffff),
1306 (uint16_t)(new & 0xffff));
1307 }
1308
/*
 * Merge 'pkt' (whose payload has already been copied into the super
 * packet's buffers) into the aggregate tracked by 'fa': grow the super
 * packet's IP length, patch the TCP checksum for the added payload and
 * any timestamp-option / PUSH-flag changes, and bump the length and
 * segment-count accounting on the super object (native packet or mbuf).
 *
 * NOTE(review): 'data_csum' is annotated __unused but is read below in
 * the data-checksum update — confirm the annotation is intentional.
 */
static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	/* running payload total, checked against MAX_AGG_IP_LEN elsewhere */
	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the ttl to the lowest of the two
	 * 3. Increment the IP length by the payload length of new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any
	 * Nothing to update for TCP header for now
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		/* patch ip_sum for the length growth (old contribution 0) */
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    (struct tcphdr *)pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP-options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		/* slow-path merge: refresh differing timestamp option values */
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			/* TSval at option offset 4, TSecr at offset 8 */
			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			/*
			 * Patch th_sum for the flags change: th_flags lives in
			 * the 16-bit word immediately after th_ack.
			 */
			uint16_t old, new;
			tcp_seq *th_ack = &stcp->th_ack;
			/*
			 * -fbounds-safety: C-style cast (uint16_t *)(th_ack+1)
			 * doesn't work here, because th_ack's bound is a single
			 * uint32_t, so trying to go one address above, and then
			 * later dereferncing it would lead to a panic.
			 */
			uint16_t *next = __unsafe_forge_single(uint16_t *,
			    th_ack + 1);
			old = *next;
			/* If the new segment has a PUSH-flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			next = __unsafe_forge_single(uint16_t *, th_ack + 1);
			new = *next;
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update pseudo header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order, refer to rfc 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		/* native super packet: bump length and segment accounting */
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * Super packet length includes L3 and L4
		 * header length for first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		/* saturate rather than wrap the 8-bit segment count */
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		/* mbuf-backed super packet: mirror the same accounting */
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.seg_cnt = 1;
		}
		_CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
			smbuf->m_pkthdr.seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
1490
/*
 * Copy metadata from source packet to destination packet
 * (used to seed a freshly allocated super packet from the first
 * source packet it aggregates).
 */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Copy packet metadata: quantum-level fields, then packet-level fields */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}
1501
/*
 * Finalize a kernel packet; finalization must succeed here, so the
 * result is VERIFYed.  On DEVELOPMENT/DEBUG builds, also fire a DTrace
 * probe with the packet and its L3 start.
 */
static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	/* advance past headroom and L2 header to the L3 start */
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}
1516
1517 static inline uint32_t
estimate_buf_cnt(struct flow_entry * fe,uint32_t total_bytes,uint32_t total_pkts,uint32_t min_bufsize,uint32_t agg_bufsize)1518 estimate_buf_cnt(struct flow_entry *fe, uint32_t total_bytes, uint32_t total_pkts,
1519 uint32_t min_bufsize, uint32_t agg_bufsize)
1520 {
1521 uint32_t max_ip_len = MAX_AGG_IP_LEN();
1522 uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1523 uint32_t hdr_overhead;
1524
1525 if (__improbable(sk_fsw_rx_agg_tcp == 0)) {
1526 return MIN(total_pkts, MAX_BUFLET_COUNT);
1527 }
1528
1529 agg_size = MIN(agg_size, agg_bufsize);
1530
1531 hdr_overhead = (total_bytes / max_ip_len) *
1532 (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1533 sizeof(struct tcphdr));
1534
1535 return ((total_bytes + hdr_overhead) / agg_size) + 1;
1536 }
1537
1538 SK_INLINE_ATTRIBUTE
1539 static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph,kern_buflet_t pbuf,_dbuf_array_t * dbuf_array,kern_buflet_t * lbuf)1540 _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1541 _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1542 {
1543 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1544 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1545 VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1546 pbuf = buf;
1547 dbuf_array->dba_buflet[i] = NULL;
1548 }
1549 ASSERT(pbuf != NULL);
1550 dbuf_array->dba_num_dbufs = 0;
1551 *lbuf = pbuf;
1552 }
1553
1554 SK_INLINE_ATTRIBUTE
1555 static inline void
_free_dbuf_array(struct kern_pbufpool * pp,_dbuf_array_t * dbuf_array)1556 _free_dbuf_array(struct kern_pbufpool *pp,
1557 _dbuf_array_t *dbuf_array)
1558 {
1559 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1560 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1561 pp_free_buflet(pp, buf);
1562 dbuf_array->dba_buflet[i] = NULL;
1563 }
1564 dbuf_array->dba_num_dbufs = 0;
1565 }
1566
/*
 * Close out the in-flight super packet: count it, tag multi-buflet
 * aggregates, finalize the kernel packet, record the largest aggregate
 * seen this batch, then null out the caller's packet references and
 * reset the aggregation state.
 */
static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	/* track the largest aggregate for size convergence later */
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	/* the caller must not touch this packet again */
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}
1586
1587 static inline void
converge_aggregation_size(struct flow_entry * fe,uint32_t largest_agg_size)1588 converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1589 {
1590 if (fe->fe_rx_largest_size > largest_agg_size) {
1591 /*
1592 * Make it slowly move towards largest_agg_size if we
1593 * consistently get non-aggregatable size.
1594 *
1595 * If we start at 16K, this makes us go to 4K within 6 rounds
1596 * and down to 2K within 12 rounds.
1597 */
1598 fe->fe_rx_largest_size -=
1599 ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1600 } else {
1601 fe->fe_rx_largest_size +=
1602 ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1603 }
1604 }
1605
/*
 * Aggregate a batch of Rx TCP packets for flow 'fe' into larger "super
 * packets" and enqueue them onto the flow's Rx channel ring.  Source
 * packets (mbuf-backed when 'is_mbuf') are copied and checksummed into
 * buflets batch-allocated from the ring's pool; packets that pass
 * flow_agg_is_ok() are merged into the current super packet, others
 * start a new one.  All source packets are disposed of before return.
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
{
/* drop one source packet and reset the aggregation state */
#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt, _reason, _flags) do { \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL; \
	FLOW_AGG_CLEAR(&fa); \
	prev_csum_ok = false; \
} while (0)
	struct flow_agg fa; /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq super_pkts; /* dst super packets */
	struct pktq disposed_pkts; /* done src packets */

	KPKTQ_INIT(&super_pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		/* no destination ring: account and drop the whole input queue */
		SK_ERR("Rx ring is NULL");
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(rx_pkts));
		pp_drop_pktq(rx_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
		    DROP_REASON_FSW_DST_NXPORT_INVALID, __func__, __LINE__);
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *__single spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t __single sbuf = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
	uint32_t largest_spkt = 0; /* largest aggregated packet size */
	uint32_t agg_bufsize;
	uint8_t iter = 0;
	bool large_buffer = false;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(rx_pkts));

	/* pre-size and batch-allocate the buflets for this input batch */
	if (__probable(fe->fe_rx_largest_size != 0 &&
	    NX_FSW_TCP_RX_AGG_ENABLED())) {
		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
		    PP_BUF_SIZE_LARGE(dpp) == 0) {
			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		} else {
			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
			large_buffer = true;
		}
		bh_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
		    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
		bh_cnt_tmp = bh_cnt;
	} else {
		/*
		 * No payload, thus it's all small-sized ACKs/...
		 * OR aggregation is disabled.
		 */
		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(rx_pkts), MAX_BUFLET_COUNT);
		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
	}

	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
	    large_buffer);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		/* prefetch the next packet's buffer while we work on this one */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_FLOW_TRACK_ERR, 0);
			continue;
		}

		if (is_mbuf) { /* compat */
			/* strip the L2 header and carry over mbuf metadata */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
			}
		}

		/* try appending in place when the previous packet checksummed OK */
		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just
				 * append to curr_m.
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, checksum is wrong!
					 * Fallback to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, agg_bufsize);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
			continue;
		}
		/* refill the buflet cache when it can't cover this packet */
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
			    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
			tmp = MIN(tmp, MAX_BUFLET_COUNT);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP, large_buffer);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			/*
			 * -fbounds-safety: buf_arr[iter] is a uint64_t, so
			 * forging it
			 */
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    __unsafe_forge_single(kern_buflet_t, buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "mbuf is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						    dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in curr_m, thus we must
			 * have added to m->m_data.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				finalize_super_packet(&spkt, &sph, &fa,
				    &largest_spkt, &spkts, bufcnt);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, 0);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&super_pkts, spkt);
			/* stamp flow identity and policy info on the new super packet */
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
	while (bh_cnt > 0) {
		/* -fbounds-saftey: buf_arr[iter] is a uint64_t, so forging it */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
		    &spkts, bufcnt);
	}
	converge_aggregation_size(fe, largest_spkt);
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	KPKTQ_FINI(rx_pkts);

	/* hand the completed super packets to the Rx ring */
	if (KPKTQ_LEN(&super_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, ring, &super_pkts);
	}
	KPKTQ_FINI(&super_pkts);

	pp_free_pktq(&disposed_pkts);
}
1956
1957 /* streamline a smbuf */
1958 static bool
_finalize_smbuf(struct mbuf * smbuf)1959 _finalize_smbuf(struct mbuf *smbuf)
1960 {
1961 /* the 1st mbuf always contains something, so start with the 2nd one */
1962 struct mbuf *m_chained = smbuf->m_next;
1963 struct mbuf *prev_m = smbuf;
1964 bool freed = false;
1965
1966 while (m_chained != NULL) {
1967 if (m_chained->m_len != 0) {
1968 prev_m = m_chained;
1969 m_chained = m_chained->m_next;
1970 continue;
1971 }
1972 prev_m->m_next = m_chained->m_next;
1973 m_free(m_chained);
1974 m_chained = prev_m->m_next;
1975 freed = true;
1976 }
1977 return freed;
1978 }
1979
/*
 * flow_rx_agg_host: aggregate a batch of received TCP packets that belong
 * to a host (BSD stack) flow into "super" mbuf chains, then hand the chain
 * to the host stack via fsw_host_sendup().
 *
 * Two source forms are handled:
 *   - is_mbuf == true:  each source packet already carries an attached mbuf
 *     (compat path); the mbuf is detached and reused.
 *   - is_mbuf == false: native skywalk packets; payload is copied into
 *     batch-allocated mbufs.
 *
 * Aggregation appends the TCP payload of a packet onto the current super
 * mbuf (smbuf) when flow_agg_is_ok() approves and the previous packet's
 * checksum verified; otherwise the packet starts a new smbuf in the chain.
 * Received/drop byte and packet counts are accumulated locally and folded
 * into flow stats once at the end (flow tracking is a no-op for host flows).
 */
SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
{
/*
 * Drop helper; note it captures the enclosing locals (drop_packets,
 * drop_bytes, fa, prev_csum_ok) and resets the aggregation state so the
 * next packet cannot be merged across a dropped one.
 */
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags) do { \
	drop_packets++; \
	drop_bytes += (_pkt)->pkt_length; \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL; \
	FLOW_AGG_CLEAR(&fa); \
	prev_csum_ok = false; \
} while (0)
	struct flow_agg fa;     /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf, at the end it points to last mbuf packet */
	struct mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf * mhead = NULL;

	/* all packets in this batch share the same L2 header length */
	uint16_t l2len = KPKTQ_FIRST(rx_pkts)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", rx_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 * Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(rx_pkts);

			/*
			 * Pick the smallest cluster size that fits both the
			 * historically largest aggregated packet and the
			 * average packet size of this batch; fall through to
			 * 2x16K (two segments) otherwise.
			 */
			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    rx_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    rx_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    rx_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}

try_again:
			if (rx_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
				    MCLBYTES, mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			/*
			 * On allocation failure, retry with progressively
			 * smaller cluster sizes before giving up on the
			 * batch (per-packet fallback alloc happens later).
			 */
			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		/* prefetch the next packet's buffer while working on this one */
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * As we only agg packets with same hdr length,
		 * leverage the pkt metadata
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below. This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;

		/* packet is for BSD flow, create a mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *__single m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			/* compat path: reuse the mbuf already attached */
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			/* ensure L2+L3+L4 headers are contiguous */
			if ((uint32_t) m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else { /* native */
			/* pad so the L3 header lands 4-byte aligned */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* plus 4 bytes to account for padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			/*
			 * Fast path: previous packet checksummed OK and we
			 * have a current tail mbuf; try to append payload
			 * directly into curr_m without a new allocation.
			 */
			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, checksum is wrong!
						 * Fallback to no-agg mode.
						 */
						agg_ok = 0;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * we try blocking allocation here again
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					/* 0 = let the allocator pick segment count */
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
			} else {
				/* pop one mbuf off the pre-allocated batch */
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload
			 * l2 header is copied later only if we
			 * can't agg as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				/* expose the new mbuf chain as a dbuf array */
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					/* fall back to the non-agg copy below */
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				/* reserve room for the L2 header copied later */
				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			/* propagate rx checksum metadata to the mbuf */
			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			/*
			 * Note that these flags have same value,
			 * except PACKET_CSUM_PARTIAL
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		/*
		 * For the compat path the aggregation decision is made here,
		 * after the checksum of this packet has been verified.
		 */
		if (__improbable(is_mbuf)) {
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			/* append m's payload as a new segment of smbuf */
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4 hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			/* m becomes an interior segment: no longer a pkthdr */
			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			/* start a new super packet (smbuf) with m */
			if ((uint32_t) m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m_mtod_current(m), l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed through
			 * pktap (see fsw_snoop()) before entering this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) a smbuf only if it pre-allocated >1 segments,
				 * which only happens when mhead_bufsize > M16KCLBYTES
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * If the super packet is an mbuf which can't accomodate
			 * sizeof(struct ip_tcp_mask) or sizeof(struct ip6_tcp_mask)
			 * in a single buffer, then do the aggregation check in slow path.
			 * Note that on Intel platforms, an mbuf without cluster
			 * has only 80 bytes available for data. That means if a
			 * packet contains an Ethernet header, the mbuf won't be
			 * able to fully contain "struct ip6_tcp_mask" or
			 * "struct ip6_tcp_mask" data in a single buffer, because
			 * sizeof(struct ip_tcp_mask) and sizeof(struct ip6_tcp_mask)
			 * are all 80 bytes as well.
			 */
			if (__improbable(smbuf->m_len <
			    ((m_mtod_current(smbuf) - (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + MASK_SIZE))) {
				fa.fa_sobj_is_short = true;
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(rx_pkts);

	/* Free any leftover mbufs, true only for native */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* feed the observed max size back into future batch sizing */
	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES)
		 * but is not (smbuf_finalized < smbuf), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Call fsw_host_sendup() with mbuf chain
		 * directly.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}
2524
2525 void
flow_rx_agg_tcp(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,uint32_t flags)2526 flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
2527 struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
2528 {
2529 #pragma unused(flags)
2530 struct pktq dropped_pkts;
2531 bool is_mbuf;
2532
2533 if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) {
2534 dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, FLOW_PROC_FLAG_FRAGMENTS);
2535 return;
2536 }
2537
2538 KPKTQ_INIT(&dropped_pkts);
2539
2540 if (!dp_flow_rx_route_process(fsw, fe)) {
2541 SK_ERR("Rx route bad");
2542 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
2543 STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2544 KPKTQ_LEN(&dropped_pkts));
2545 pp_drop_pktq(&dropped_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
2546 DROP_REASON_FSW_FLOW_NONVIABLE, __func__, __LINE__);
2547 return;
2548 }
2549
2550 is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(rx_pkts)));
2551
2552 if (fe->fe_nx_port == FSW_VP_HOST) {
2553 boolean_t do_rx_agg;
2554
2555 /* BSD flow */
2556 if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2557 do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2558 SK_FSW_RX_AGG_TCP_HOST_ON);
2559 } else {
2560 do_rx_agg = !dlil_has_ip_filter() &&
2561 !dlil_has_if_filter(fsw->fsw_ifp);
2562 }
2563 if (__improbable(!do_rx_agg)) {
2564 fsw_host_rx(fsw, rx_pkts);
2565 return;
2566 }
2567 if (__improbable(pktap_total_tap_count != 0)) {
2568 fsw_snoop(fsw, fe, rx_pkts, true);
2569 }
2570 flow_rx_agg_host(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
2571 } else {
2572 /* channel flow */
2573 if (__improbable(pktap_total_tap_count != 0)) {
2574 fsw_snoop(fsw, fe, rx_pkts, true);
2575 }
2576 flow_rx_agg_channel(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
2577 }
2578 }
2579