/*
 * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <netinet/tcp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <net/pktap.h>
#include <sys/sdt.h>

#define MAX_AGG_IP_LEN()	MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
#define MAX_BUFLET_COUNT	(32)
#define TCP_FLAGS_IGNORE	(TH_FIN|TH_SYN|TH_RST|TH_URG)
#define PKT_IS_MBUF(_pkt)	(_pkt->pkt_pflags & PKT_F_MBUF_DATA)
#define PKT_IS_TRUNC_MBUF(_pkt)	(PKT_IS_MBUF(_pkt) &&	\
	(_pkt->pkt_pflags & PKT_F_TRUNCATED))
#define PKT_IS_WAKE_PKT(_pkt)	((PKT_IS_MBUF(_pkt) &&	\
	(_pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) ||	\
	(!PKT_IS_MBUF(_pkt) &&	\
	(_pkt->pkt_pflags & PKT_F_WAKE_PKT)))


typedef uint16_t (*flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);

static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);

static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
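
/*
 * Illustrative sketch (not part of the build): the fix-sum helpers are
 * assumed to implement the RFC 1624 incremental checksum update,
 * HC' = ~(~HC + ~m + m'), one 16-bit word at a time:
 *
 *	static uint16_t
 *	example_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
 *	{
 *		uint32_t sum = (~csum & 0xffff) + (~old & 0xffff) + new;
 *
 *		sum = (sum >> 16) + (sum & 0xffff);
 *		sum += (sum >> 16);
 *		return (uint16_t)(~sum & 0xffff);
 *	}
 *
 * The no-op variant defined below lets callers skip header checksum
 * fixups when the hardware is expected to handle them (see
 * flow_agg_init_common()).
 */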

/*
 * This structure holds per-super object (mbuf/packet) flow aggregation.
 */
struct flow_agg {
	union {
		struct {
			union {
				void *_fa_sobj;
				struct mbuf *_fa_smbuf; /* super mbuf */
				struct __kern_packet *_fa_spkt; /* super pkt */
			};
			uint8_t *__indexable _fa_sptr; /* ptr to super IP header */
			bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
			/*
			 * super obj is not large enough to hold the IP & TCP
			 * header in a contiguous buffer.
			 */
			bool _fa_sobj_is_short;
			uint32_t _fa_tcp_seq; /* expected next sequence # */
			uint32_t _fa_ulen; /* expected next ulen */
			uint32_t _fa_total; /* total aggregated bytes */
			/* function that fixes the packet checksum */
			flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
		} __flow_agg;
		uint64_t __flow_agg_data[5];
	};
#define fa_sobj		__flow_agg._fa_sobj
#define fa_smbuf	__flow_agg._fa_smbuf
#define fa_spkt		__flow_agg._fa_spkt
#define fa_sptr		__flow_agg._fa_sptr
#define fa_sobj_is_pkt	__flow_agg._fa_sobj_is_pkt
#define fa_sobj_is_short	__flow_agg._fa_sobj_is_short
#define fa_tcp_seq	__flow_agg._fa_tcp_seq
#define fa_ulen		__flow_agg._fa_ulen
#define fa_total	__flow_agg._fa_total
#define fa_fix_pkt_sum	__flow_agg._fa_fix_pkt_sum
};

#if __has_ptrcheck
#define FLOW_AGG_CLEAR(_fa)	do {	\
	static_assert(sizeof(struct flow_agg) == 48);	\
	static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40);	\
	sk_zero_48(_fa);	\
	(_fa)->fa_fix_pkt_sum = 0;	\
} while (0)
#else
#define FLOW_AGG_CLEAR(_fa)	do {	\
	static_assert(sizeof(struct flow_agg) == 40);	\
	static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32);	\
	sk_zero_32(_fa);	\
	(_fa)->fa_fix_pkt_sum = 0;	\
} while (0)
#endif

#define MASK_SIZE	80	/* size of struct {ip,ip6}_tcp_mask */

struct ip_tcp_mask {
	struct ip ip_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
};

static const struct ip_tcp_mask ip_tcp_mask
__sk_aligned(16) =
{
	.ip_m = {
		.ip_hl = 0xf,
		.ip_v = 0xf,
		.ip_tos = 0xff,
		/* Not checked; the aggregated packet's ip_len is increasing */
		.ip_len = 0,
		.ip_id = 0,
		.ip_off = 0xffff,
		.ip_ttl = 0xff,
		.ip_p = 0xff,
		.ip_sum = 0,
		.ip_src.s_addr = 0xffffffff,
		.ip_dst.s_addr = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};

struct ip6_tcp_mask {
	struct ip6_hdr ip6_m;
	struct tcphdr tcp_m;
	uint32_t tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
};

static const struct ip6_tcp_mask ip6_tcp_mask
__sk_aligned(16) =
{
	.ip6_m = {
		.ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
		/* Not checked; the aggregated packet's ip6_plen is increasing */
		.ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
		.ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
		.ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
		.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffffff,
		.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffffff,
		.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffffff,
	},
	.tcp_m = {
		.th_sport = 0xffff,
		.th_dport = 0xffff,
		.th_seq = 0,
		.th_ack = 0xffffffff,
		.th_x2 = 0xf,
		.th_off = 0xf,
		.th_flags = ~TH_PUSH,
		.th_win = 0xffff,
		.th_sum = 0,
		.th_urp = 0xffff,
	},
	.tcp_option_m = {
		/* Max 40 bytes of TCP options */
		0xffffffff,
		0xffffffff,
		0xffffffff,
		0, /* Filling up to MASK_SIZE */
		0, /* Filling up to MASK_SIZE */
	},
};
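
/*
 * Illustrative sketch (not part of the build): a masked compare such as
 * sk_memcmp_mask_64B() is assumed to behave like the loop below, i.e.
 * two headers "match" when they differ only in bytes whose mask is zero
 * (ip_len, ip_id, checksums, th_seq, TH_PUSH, the timestamp option):
 *
 *	static int
 *	example_memcmp_mask(const uint8_t *a, const uint8_t *b,
 *	    const uint8_t *mask, size_t len)
 *	{
 *		uint8_t diff = 0;
 *		size_t i;
 *
 *		for (i = 0; i < len; i++) {
 *			diff |= (a[i] ^ b[i]) & mask[i];
 *		}
 *		return diff;
 *	}
 *
 * A zero result means the headers match everywhere the mask cares about.
 */
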
212
213 #if SK_LOG
214 SK_LOG_ATTRIBUTE
215 static void
_pkt_agg_log(struct __kern_packet * pkt,struct proc * p,bool is_input)216 _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
217 {
218 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
219 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
220
221 kern_packet_t ph = SK_PKT2PH(pkt);
222 uint64_t bufcnt = 1;
223 if (!is_input) {
224 bufcnt = kern_packet_get_buflet_count(ph);
225 }
226
227 SK_DF(logflags, "%s(%d) %spkt %p plen %u",
228 sk_proc_name(p), sk_proc_pid(p), is_input ? "s":"d",
229 SK_KVA(pkt), pkt->pkt_length);
230
231 SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
232 is_input ? "s":"d", pkt->pkt_csum_flags,
233 (uint32_t)pkt->pkt_csum_rx_start_off,
234 (uint32_t)pkt->pkt_csum_rx_value);
235
236 if (!is_input) {
237 kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);
238
239 /* Individual buflets */
240 for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
241 SK_DF(logflags | SK_VERB_DUMP, "%s",
242 sk_dump("buf", __buflet_get_data_address(buf),
243 __buflet_get_data_length(buf), 128));
244 buf = kern_packet_get_next_buflet(ph, buf);
245 }
246 }
247 }
248
249 #define pkt_agg_log(_pkt, _p, _is_input) do { \
250 if (__improbable(sk_verbose != 0)) { \
251 _pkt_agg_log(_pkt, _p, _is_input); \
252 } \
253 } while (0)

SK_LOG_ATTRIBUTE
static void
_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u",
	    sk_proc_name(p), sk_proc_pid(p), SK_KVA(m),
	    m->m_pkthdr.len);

	SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
	    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
	    (uint32_t)m->m_pkthdr.csum_rx_val);

	/* Dump the first mbuf */
	ASSERT(m_mtod_current(m) != NULL);
	SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
	    (uint8_t *)m_mtod_current(m), m->m_len, 128));
}

#define mbuf_agg_log(_m, _p, _is_mbuf) do {	\
	if (__improbable(sk_verbose != 0)) {	\
		_mbuf_agg_log(_m, _p, _is_mbuf);	\
	}	\
} while (0)

SK_LOG_ATTRIBUTE
static void
_mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
{
	SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	while (m != NULL) {
		SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u",
		    sk_proc_name(p), sk_proc_pid(p), SK_KVA(m),
		    m->m_pkthdr.len);

		SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
		    (uint32_t)m->m_pkthdr.csum_rx_val);

		m = m->m_nextpkt;
	}
}

#define mchain_agg_log(_m, _p, _is_mbuf) do {	\
	if (__improbable(sk_verbose != 0)) {	\
		_mchain_agg_log(_m, _p, _is_mbuf);	\
	}	\
} while (0)
#else
#define pkt_agg_log(...)
#define mbuf_agg_log(...)
#define mchain_agg_log(...)
#endif /* SK_LOG */

/*
 * Checksum only for a packet with an attached mbuf.
 */
static bool
mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
    uint16_t *data_csum)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
	uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
	uint16_t start = pkt->pkt_l2_len;
	uint32_t partial = 0;
	uint16_t csum = 0;

	ASSERT(plen == m_pktlen(m));

	/* Some compat drivers compute the full checksum */
	if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
	    CSUM_RX_FULL_FLAGS) {
		SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
		    m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
		    m->m_pkthdr.csum_rx_val);

		/* Compute the data_csum */
		struct tcphdr *tcp =
		    (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
		    pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
		/* 16-bit alignment is sufficient */
		ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

		uint16_t th_sum = tcp->th_sum;
		tcp->th_sum = 0;

		partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
		    pkt->pkt_flow_tcp_hlen);
		partial += htons(l4len + IPPROTO_TCP);
		if (pkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
			    pkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
			    &pkt->pkt_flow_ipv6_dst, partial);
		}
		/* Restore the original checksum */
		tcp->th_sum = th_sum;
		th_sum = __packet_fix_sum(th_sum, csum, 0);
		*data_csum = ~th_sum & 0xffff;

		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, m->m_pkthdr.csum_rx_val,
		    false);

		return (m->m_pkthdr.csum_rx_val ^ 0xffff) == 0;
	}
	/* Reset the csum RX flags */
	m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
	if (verify_l3) {
		csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, pkt->pkt_flow_ip_hlen, csum);
		m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			return false;
		} else {
			m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
		}
	}
	/* Compute the L4 header checksum */
	partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
	    pkt->pkt_flow_tcp_hlen);
	/* Compute the payload checksum */
	start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	*data_csum = m_sum16(m, start, (plen - start));

	/* Fold the data checksum into the TCP checksum */
	partial += *data_csum;
	partial += htons(l4len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}
	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - pkt->pkt_flow_tcp_hlen, l4len, csum);
	/* Set start to 0 for a full checksum */
	m->m_pkthdr.csum_rx_start = 0;
	m->m_pkthdr.csum_rx_val = csum;
	m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);

	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);

	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}
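
/*
 * Illustrative sketch (not part of the build): in_pseudo() is assumed to
 * fold the IPv4 pseudo-header addresses into a running partial sum (the
 * length + protocol word is already added by the callers above), roughly:
 *
 *	static uint16_t
 *	example_in_pseudo(uint32_t src, uint32_t dst, uint32_t partial)
 *	{
 *		uint64_t sum = partial;
 *
 *		sum += (src >> 16) + (src & 0xffff);
 *		sum += (dst >> 16) + (dst & 0xffff);
 *		while (sum > 0xffff) {
 *			sum = (sum >> 16) + (sum & 0xffff);
 *		}
 *		return (uint16_t)sum;
 *	}
 *
 * A verified segment then satisfies (csum ^ 0xffff) == 0, which is the
 * test used throughout this file.
 */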

/* structure to pass an array of data buffers */
typedef struct _dbuf_array {
	union {
		struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
		struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
	};
	uint8_t dba_num_dbufs;
	bool dba_is_buflet;
} _dbuf_array_t;

static inline void
_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
    uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
    boolean_t do_csum)
{
	uint8_t i = 0;
	uint32_t buflet_dlim, buflet_dlen, buf_off = 0;

	ASSERT(plen > 0);
	while (plen > 0) {
		ASSERT(i < dbuf->dba_num_dbufs);
		uint32_t dbuf_lim, tmplen;
		uint8_t *dbuf_addr;

		if (dbuf->dba_is_buflet) {
			ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
			/* XXX -fbounds-safety: use the inline variant to return an __indexable */
			dbuf_addr = __buflet_get_data_address(dbuf->dba_buflet[i]);

			buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
			buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
			buf_off = buflet_dlen;
			dbuf_lim = buflet_dlim - buf_off;
			dbuf_addr += buf_off;
		} else {
			dbuf_lim = (uint32_t)M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
			dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
			buf_off = dbuf->dba_mbuf[i]->m_len;
			dbuf_addr += buf_off;
		}

		tmplen = min(plen, dbuf_lim);
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			if (do_csum) {
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
			} else {
				m_copydata(spkt->pkt_mbuf, soff, tmplen,
				    dbuf_addr);
			}
		} else {
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
		}
		if (dbuf->dba_is_buflet) {
			VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
			    tmplen + buf_off) == 0);
		} else {
			dbuf->dba_mbuf[i]->m_len += tmplen;
			dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
		}
		soff += tmplen;
		plen -= tmplen;
		buf_off = 0;
		i++;
	}
	ASSERT(plen == 0);
}
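
/*
 * Illustrative note (not part of the build): odd_start matters because a
 * one's complement sum is sensitive to byte position. Per RFC 1071
 * section 2, a fragment whose copy resumed at an odd byte offset
 * contributes its sum byte-swapped, so stitching sums across buffers
 * looks roughly like:
 *
 *	if (odd) {
 *		frag_sum = ((frag_sum >> 8) & 0x00ff) |
 *		    ((frag_sum << 8) & 0xff00);
 *	}
 *	partial += frag_sum;
 *	odd ^= (frag_len & 0x1);
 *
 * m_copydata_sum() and pkt_copyaddr_sum() are assumed to carry this flag
 * on behalf of the callers in this file.
 */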

/*
 * Copy (fill) and checksum for packet.
 * spkt: source IP packet.
 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
 * verify_l3: verify IPv4 header checksum.
 * currm: destination mbuf.
 * currp: destination skywalk packet.
 * dbuf: additional destination data buffer(s), used when the current
 *     destination packet is out of space.
 * added: amount of data copied from spkt to the additional buffer.
 * data_csum: 16-bit folded partial checksum of the copied TCP payload.
 */
static bool
copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
    _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
    struct __kern_buflet *currp, uint16_t *data_csum, int *added)
{
	ASSERT(data_csum != NULL);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    SK_VERB_COPY));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	uint32_t l4len;
	/* soff is only used for packets */
	uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	int32_t curr_oldlen;
	uint32_t curr_trailing;
	char *curr_ptr;
	int32_t curr_len;
	uint16_t data_off;
	uint32_t tmplen;
	boolean_t odd_start = FALSE;
	bool verify_l4;

	/* Exactly one of currm and currp must be set */
	VERIFY((currm != NULL || currp != NULL) &&
	    ((currm != NULL) != (currp != NULL)));

	if (currm != NULL) {
		curr_oldlen = currm->m_len;
		curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
		curr_ptr = mtod(currm, char *) + currm->m_len;
		curr_len = currm->m_len;
	} else {
		curr_oldlen = currp->buf_dlen;
		curr_trailing = currp->buf_dlim - currp->buf_doff -
		    currp->buf_dlen;
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		curr_ptr = (char *)__buflet_get_data_address(currp) +
		    currp->buf_doff + currp->buf_dlen;
		curr_len = currp->buf_dlen;
	}

	/* Verify checksum only for IPv4 */
	len = spkt->pkt_flow_ip_hlen;
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
	if (verify_l3) {
		if (PKT_IS_TRUNC_MBUF(spkt)) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
		} else {
			partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
		}

		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
		    len, csum);
		spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* No need to copy & checksum TCP+payload */
			return false;
		} else {
			spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
		}
	}

	verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);

	/* Copy & verify TCP checksum */
	start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
	l4len = plen - spkt->pkt_flow_ip_hlen;
	len = plen - start;
	if (PKT_IS_TRUNC_MBUF(spkt)) {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen, 0);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = start + tmplen;
	} else {
		tmplen = min(len, curr_trailing);
		odd_start = FALSE;

		/* First, simple checksum on the TCP header */
		if (verify_l4) {
			partial = pkt_sum(SK_PKT2PH(spkt), (soff +
			    spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
		}

		/* Now, copy & sum the payload */
		if (tmplen > 0) {
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
			curr_len += tmplen;
		}
		data_off = soff + start + tmplen;
	}

	/* copy & sum any remaining payload into the additional buffers */
	if ((len - tmplen) > 0) {
		ASSERT(dbuf != NULL);
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
		*added = (len - tmplen);
	}

	/* Fold the data checksum to 16 bits */
	*data_csum = __packet_fold_sum(data_partial);

	if (currm != NULL) {
		currm->m_len = curr_len;
	} else {
		currp->buf_dlen = curr_len;
	}

	if (verify_l4) {
		/* Fold the data checksum into the TCP checksum */
		partial += *data_csum;
		partial += htons(l4len + IPPROTO_TCP);
		if (spkt->pkt_flow_ip_ver == IPVERSION) {
			csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
			    spkt->pkt_flow_ipv4_dst.s_addr, partial);
		} else {
			ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
			csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
			    &spkt->pkt_flow_ipv6_dst, partial);
		}
		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
	} else {
		/* grab the csum value from the offload */
		csum = spkt->pkt_csum_rx_value;
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));

	if ((csum ^ 0xffff) != 0) {
		/*
		 * Revert whatever we did here!
		 * currm/currp should be restored to their previous values.
		 * dbuf (for additional payload) should be restored to 0.
		 */
		if (currm != NULL) {
			currm->m_len = curr_oldlen;
		} else {
			currp->buf_dlen = curr_oldlen;
		}
		if (dbuf != NULL) {
			for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
				if (dbuf->dba_is_buflet) {
					struct __kern_buflet *b = dbuf->dba_buflet[i];
					kern_buflet_set_data_length(b, 0);
					kern_buflet_set_data_offset(b, 0);
				} else {
					struct mbuf *m = dbuf->dba_mbuf[i];
					m->m_len = m->m_pkthdr.len = 0;
				}
			}
		}

		return false;
	}

	return true;
}

/*
 * Copy and checksum for a packet or a packet with an attached mbuf.
 * data_csum is only supported for BSD flows.
 */
static bool
copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
    uint16_t *data_csum, bool verify_l3)
{
	/*
	 * To keep this routine simple and optimal, we are asserting on the
	 * assumption that the smallest flowswitch packet pool buffer should
	 * be large enough to hold the IP and TCP headers in the first buflet.
	 */
	static_assert(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
	    (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));

	uint16_t start = 0, csum = 0;
	uint32_t len = 0;
	/* soff is only used for packets */
	uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
	uint32_t data_partial = 0, partial = 0;
	boolean_t odd_start = false;
	uint32_t data_len;
	uint16_t dbuf_off;
	uint16_t copied_len = 0;
	bool l3_csum_ok;
	uint8_t *daddr;

	if (dbuf->dba_is_buflet) {
		/* XXX -fbounds-safety: use the inline variant to return an __indexable */
		daddr = __buflet_get_data_address(dbuf->dba_buflet[0]);
		daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
	} else {
		daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
		daddr += dbuf->dba_mbuf[0]->m_len;
		/*
		 * The available space check for the payload is done later
		 * in _copy_data_sum_dbuf.
		 */
		ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
		    pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
	}

	if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
		/* copy only */
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0 : soff,
		    plen, &partial, &odd_start, dbuf, false);
		if (PKT_IS_MBUF(pkt)) {
			csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_mbuf->m_pkthdr.csum_flags,
			    pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
		} else {
			csum = pkt->pkt_csum_rx_value;
			SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
			    pkt->pkt_csum_flags,
			    pkt->pkt_csum_rx_start_off, csum);
		}

		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
		return (csum ^ 0xffff) == 0;
	}

	/* Copy L3 & verify checksum only for IPv4 */
	start = 0;
	len = pkt->pkt_flow_ip_hlen;
	if (PKT_IS_TRUNC_MBUF(pkt)) {
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
	} else {
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
	}
	verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
	l3_csum_ok = !verify_l3;
	if (verify_l3) {
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
		    start, len, csum);
		pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
		if ((csum ^ 0xffff) != 0) {
			/* proceed to copy the rest of the packet */
		} else {
			pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
			l3_csum_ok = true;
		}
	}
	copied_len += pkt->pkt_flow_ip_hlen;

	/* Copy & verify TCP checksum */
	start = pkt->pkt_flow_ip_hlen;
	len = plen - start;

	if (PKT_IS_TRUNC_MBUF(pkt)) {
		/* First, copy and sum the TCP header */
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		/* Next, copy and sum the payload (if any) */
	} else {
		/* First, copy and sum the TCP header */
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);

		data_len = len - pkt->pkt_flow_tcp_hlen;
		start += pkt->pkt_flow_tcp_hlen;
		dbuf_off = start;
		start += soff;
	}
	copied_len += pkt->pkt_flow_tcp_hlen;

	if (dbuf->dba_is_buflet) {
		VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
		    kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
		    copied_len) == 0);
	} else {
		dbuf->dba_mbuf[0]->m_len += copied_len;
		dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
	}

	/* copy and sum the payload (if any) */
	if (data_len > 0) {
		odd_start = false;
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
	}

	if (__improbable(!l3_csum_ok)) {
		return false;
	}

	/* Fold the data sum to 16 bits and then into the partial sum */
	*data_csum = __packet_fold_sum(data_partial);

	/* Fold the data checksum into the TCP checksum */
	partial += *data_csum;

	partial += htons(len + IPPROTO_TCP);
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, partial);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, partial);
	}

	SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
	    pkt->pkt_flow_ip_hlen, len, csum);

	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
	if ((csum ^ 0xffff) != 0) {
		return false;
	}

	return true;
}

SK_INLINE_ATTRIBUTE
static void
flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp;

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
			return;
		}
		break;
	case IPV6_VERSION:
		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			return;
		}
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	fa->fa_total = pkt->pkt_flow_ip_hlen +
	    pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;

	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
		/* if the hardware supports LRO, don't fix up checksums in the header */
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
	} else {
		fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
	}
}
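
/*
 * Illustrative example (not part of the build): for a first packet with
 * TCP sequence number 1000, a 20-byte IP header, a 32-byte TCP header
 * and 1448 bytes of payload, the state primes as
 *
 *	fa_tcp_seq = 1000 + 1448 = 2448   (next expected sequence #)
 *	fa_ulen    = 1448
 *	fa_total   = 20 + 32 + 1448 = 1500
 *
 * and a follow-on packet can aggregate only if its sequence number is
 * exactly 2448 (checked in flow_agg_is_ok()).
 */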

static void
flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct mbuf *smbuf, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(smbuf != NULL);
	fa->fa_smbuf = smbuf;

	fa->fa_sptr = mtod(smbuf, uint8_t *);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
	 * contents of the flow structure which don't exist in 'smbuf'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}

static void
flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
    struct __kern_packet *spkt, struct __kern_packet *pkt)
{
	FLOW_AGG_CLEAR(fa);

	ASSERT(spkt != NULL);
	fa->fa_spkt = spkt;
	fa->fa_sobj_is_pkt = true;
	VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);

	MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
	ASSERT(fa->fa_sptr != NULL);

	/*
	 * Note here we use 'pkt' instead of 'spkt', since we rely on the
	 * contents of the flow structure which don't exist in 'spkt'.
	 */
	flow_agg_init_common(fsw, fa, pkt);
}

/*
 * -fbounds-safety: The hardcoded values 64 (and 80) are used here because
 * this function calls the 64-byte variant of the sk memcmp function (and
 * likewise for the 80-byte variant). In can_agg_fastpath, the TCP header
 * length with options is checked against sizeof(struct tcphdr) +
 * TCPOLEN_TSTAMP_APPA, which is 20 + 12 = 32 bytes. For IPv4, adding the
 * 20-byte IP header makes it 52 bytes; the closest sk_memcmp_* variant
 * is the 64B one.
 */
SK_INLINE_ATTRIBUTE
static bool
ipv4_tcp_memcmp(const uint8_t *__counted_by(64) h1,
    const uint8_t *__counted_by(64) h2)
{
	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
}

SK_INLINE_ATTRIBUTE
static bool
ipv6_tcp_memcmp(const uint8_t *__counted_by(80) h1,
    const uint8_t *__counted_by(80) h2)
{
	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
}

SK_INLINE_ATTRIBUTE
static bool
can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	bool match;
	uint8_t *ip_hdr;

	ASSERT(fa->fa_sptr != NULL);
	static_assert(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
	static_assert(sizeof(struct ip_tcp_mask) == MASK_SIZE);

	if (__improbable(pkt->pkt_length < MASK_SIZE)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
		goto slow_path;
	}

	if (__improbable(fa->fa_sobj_is_short)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
		goto slow_path;
	}

	if (__improbable(pkt->pkt_flow_tcp_hlen !=
	    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
		goto slow_path;
	}

	switch (pkt->pkt_flow_ip_ver) {
	case IPVERSION:
		/*
		 * -fbounds-safety: pkt->pkt_flow_ip_hdr is a mach_vm_address_t,
		 * so we forge it here. The constants 64 and 80 are used because
		 * ipv4_tcp_memcmp and ipv6_tcp_memcmp take a __counted_by(64)
		 * and a __counted_by(80) pointer, respectively.
		 */
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 64);
		match = ipv4_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	case IPV6_VERSION:
		ip_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
		    pkt->pkt_flow_ip_hdr, 80);
		match = ipv6_tcp_memcmp(fa->fa_sptr, ip_hdr);
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (__improbable(!match)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
		goto slow_path;
	}
	if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
		goto slow_path;
	}

	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;

slow_path:
	return false;
}

SK_NO_INLINE_ATTRIBUTE
static bool
can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	uint8_t *sl3_hdr = fa->fa_sptr;
	uint8_t *l3_hdr = __unsafe_forge_bidi_indexable(uint8_t *,
	    pkt->pkt_flow_ip_hdr, pkt->pkt_flow_ip_hlen);
	uint32_t sl3tlen = 0;
	uint16_t sl3hlen = 0;

	DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
	    uint8_t *, sl3_hdr);

	ASSERT(sl3_hdr != NULL);

	/*
	 * Compare IP header length, TOS, frag flags and IP options.
	 * For IPv4, the options must match exactly.
	 * For IPv6, if options are present, bail out.
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;
		struct ip *iph = (struct ip *)(void *)l3_hdr;

		ASSERT(siph->ip_v == IPVERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
		ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));

		sl3hlen = (siph->ip_hl << 2);
		if (sl3hlen != pkt->pkt_flow_ip_hlen) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		if (siph->ip_ttl != iph->ip_ttl) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
			    uint8_t, iph->ip_ttl);
			return false;
		}

		if (siph->ip_tos != iph->ip_tos) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
			    uint8_t, iph->ip_tos);
			return false;
		}
		/* For IPv4, the DF bit must match */
		if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
		    (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
			DTRACE_SKYWALK2(aggr__fail5, uint16_t,
			    ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
			return false;
		}

		uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
		    sizeof(struct ip);
		if (ip_opts_len > 0 &&
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
			DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
			    uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
			    (uint8_t *)(iph + 1));
			return false;
		}
		sl3tlen = ntohs(siph->ip_len);
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
		struct ip6_hdr *ip6 = (struct ip6_hdr *)(void *)l3_hdr;

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));

		if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
			/*
			 * Don't aggregate if an extension header is present
			 * in the packet. N.B. currently the flowswitch only
			 * classifies the frag header.
			 */
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
			DTRACE_SKYWALK1(aggr__fail7, uint8_t,
			    pkt->pkt_flow_ip_hlen);
			return false;
		}

		sl3hlen = sizeof(struct ip6_hdr);
		/* For IPv6, the flow info mask covers TOS and flow label */
		if (memcmp((uint8_t *)&sip6->ip6_flow, (uint8_t *)&ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
			DTRACE_SKYWALK2(aggr__fail8, uint32_t,
			    ntohl(sip6->ip6_flow), uint32_t,
			    ntohl(ip6->ip6_flow));
			return false;
		}

		if (sip6->ip6_hlim != ip6->ip6_hlim) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
			DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
			    uint8_t, ip6->ip6_hlim);
			return false;
		}

		sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
	}

	/*
	 * For the TCP header, compare the ACK number and window size,
	 * the TCP flags, and the TCP header length and options.
	 */
	struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
	/* -fbounds-safety: pkt_flow_tcp_hdr is a mach_vm_address_t */
	struct tcphdr *tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	uint16_t sl4hlen = (stcp->th_off << 2);
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
		DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
		    uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
		    uint16_t, ntohs(tcp->th_win));
		return false;
	}

	if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
		DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
		    uint8_t, tcp->th_flags);
		return false;
	}

	if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
		DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
		    uint8_t, pkt->pkt_flow_tcp_hlen);
		return false;
	}

	uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
	/*
	 * We know that the TCP-option lengths are the same thanks to the
	 * sl4hlen check above.
	 */
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
		/*
		 * Fast-path header prediction:
		 *
		 * The TCP timestamp option is usually put after two
		 * NOP-headers, making the total TCP-option length 12. If
		 * that's the case, we can aggregate, as only the TCP
		 * timestamp option differs.
		 */
		if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
			STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
			DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
			return false;
		} else {
			uint32_t sts_hdr, ts_hdr;
			if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
				sts_hdr = *((uint32_t *)(stcp + 1));
			} else {
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
			}
			if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
				ts_hdr = *((uint32_t *)(tcp + 1));
			} else {
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
			}

			if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
			    ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
				DTRACE_SKYWALK2(aggr__fail14, uint32_t,
				    sts_hdr, uint32_t, ts_hdr);
				return false;
			}
		}
	}
	STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
	fa->fa_tcp_seq += pkt->pkt_flow_ulen;
	fa->fa_ulen = pkt->pkt_flow_ulen;
	return true;
}

static bool
flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
    struct fsw_stats *fsws)
{
	/* The ip_len shouldn't grow beyond MIN(custom ip_len, 64K) */
	const uint32_t max_ip_len = MAX_AGG_IP_LEN();
	bool can_agg = false;

	DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
	    struct __kern_packet *, pkt);

	ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
	if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
		pkt->pkt_flow_tcp_agg_fast = 0;
	}
	/*
	 * Don't aggregate if any of the following is true:
	 * 1. TCP flags other than TH_{ACK,PUSH} are set
	 * 2. Payload length is 0 (pure ACK)
	 * 3. This is the first packet
	 * 4. pkt was received as a broadcast / multicast
	 * 5. The TCP sequence number is not the expected one
	 * 6. We would exceed the maximum aggregated size
	 * 7. It's not the first packet and the wake flag is set
	 */
	if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
	    pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL ||
	    (pkt->pkt_link_flags & (PKT_LINKF_BCAST | PKT_LINKF_MCAST)) != 0)) {
		DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
		goto done;
	}
	if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
		DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
		    ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
		STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
		goto done;
	}
	if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
		DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
		    uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
		/* We've reached the aggregation limit */
		STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
		goto done;
	}
	if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
		DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
		goto done;
	}

	can_agg = can_agg_fastpath(fa, pkt, fsws);
	if (can_agg) {
		pkt->pkt_flow_tcp_agg_fast = 1;
		goto done;
	}

	can_agg = can_agg_slowpath(fa, pkt, fsws);
	ASSERT(!pkt->pkt_flow_tcp_agg_fast);

done:
	return can_agg;
}

static uint16_t
flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
{
	return __packet_fix_sum(csum, old, new);
}

static uint16_t
flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
    uint16_t __unused new)
{
	return 0;
}

static inline void
flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa,
    uint8_t *__sized_by(sizeof(uint32_t)) field, uint16_t *csum,
    uint32_t new)
{
	uint32_t old;
	memcpy((uint8_t *)&old, field, sizeof(old));
	memcpy(field, (uint8_t *)&new, sizeof(uint32_t));
	*csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
	    (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
	    (uint16_t)(old & 0xffff),
	    (uint16_t)(new & 0xffff));
}

static void
flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
    __unused uint16_t data_csum, struct fsw_stats *fsws)
{
	struct tcphdr *stcp, *tcp;
	uint8_t *l3hdr, l3hlen;
	uint16_t old_l3len = 0;
	uint8_t result;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/*
	 * The packet being merged should always have full checksum flags
	 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
	 * and not enter this function.
	 */
	ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
	ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);

	ASSERT(fa->fa_sobj != NULL);
	ASSERT(!fa->fa_sobj_is_pkt ||
	    (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
	uint8_t *sl3_hdr = fa->fa_sptr;
	ASSERT(sl3_hdr != NULL);
	ASSERT(fa->fa_fix_pkt_sum != NULL);

	fa->fa_total += pkt->pkt_flow_ulen;

	/*
	 * Update the IP header as follows:
	 * 1. Set the IP ID (IPv4 only) to that of the new packet
	 * 2. Set the TTL to the lowest of the two
	 * 3. Increment the IP length by the payload length of the new packet
	 * 4. Leave the IP (IPv4 only) checksum as is
	 * Update the resp. flow classification fields, if any.
	 * Nothing to update in the TCP header for now.
	 */
	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *siph = (struct ip *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));

		l3hdr = (uint8_t *)siph;
		l3hlen = siph->ip_hl << 2;

		old_l3len = ntohs(siph->ip_len);
		uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
		siph->ip_len = htons(l3tlen);
		siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
		    htons(pkt->pkt_flow_ulen));

		SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
	} else {
		struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;

		/* 16-bit alignment is sufficient (handles mbuf case) */
		ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
		ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);

		l3hdr = (uint8_t *)sip6;
		l3hlen = sizeof(struct ip6_hdr);

		/* No extension headers should be present */
		ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));

		old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
		uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
		sip6->ip6_plen = htons(l3plen);

		SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
	}

	if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
	} else {
		STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
	}

	stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
	tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
	    (struct tcphdr *)pkt->pkt_flow_tcp_hdr, pkt->pkt_flow_tcp_hlen);

	/* 16-bit alignment is sufficient (handles mbuf case) */
	ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
	ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));

	/*
	 * If it is bigger, that means there are TCP options that need to be
	 * copied over.
	 */
	if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
	    (stcp->th_flags & TH_PUSH) == 0) {
		VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
		if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
		    memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
		    sizeof(struct tcphdr))) != 0)) {
			uint8_t *sopt = (uint8_t *)(stcp + 1);
			uint8_t *opt = (uint8_t *)(tcp + 1);

			uint32_t ntsval, ntsecr;
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4, &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8, &stcp->th_sum, ntsecr);

			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
		} else {
			STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
		}

		if ((stcp->th_flags & TH_PUSH) == 0 &&
		    (tcp->th_flags & TH_PUSH) != 0) {
			uint16_t old, new;
			tcp_seq *th_ack = &stcp->th_ack;
			/*
			 * -fbounds-safety: a C-style cast (uint16_t *)(th_ack + 1)
			 * doesn't work here, because th_ack's bound is a single
			 * uint32_t, so going one address above and later
			 * dereferencing it would lead to a panic.
			 */
			uint16_t *next = __unsafe_forge_single(uint16_t *,
			    th_ack + 1);
			old = *next;
			/* If the new segment has a PUSH flag, append it! */
			stcp->th_flags |= tcp->th_flags & TH_PUSH;
			next = __unsafe_forge_single(uint16_t *, th_ack + 1);
			new = *next;
			stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
		}
	}

	/* Update the pseudo-header checksum */
	stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
	    htons(pkt->pkt_flow_ulen));

	/* Update the data checksum */
	if (__improbable(old_l3len & 0x1)) {
		/* swap the byte order; refer to RFC 1071 section 2 */
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
		    ntohs(data_csum));
	} else {
		stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
	}

	if (fa->fa_sobj_is_pkt) {
		struct __kern_packet *spkt = fa->fa_spkt;
		spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
		/*
		 * The super packet length includes the L3 and L4
		 * header lengths for the first packet only.
		 */
		spkt->pkt_length += pkt->pkt_flow_ulen;
		if (spkt->pkt_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			spkt->pkt_seg_cnt = 1;
		}
		static_assert(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
		if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
			spkt->pkt_seg_cnt = result;
		}
		SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
		    spkt->pkt_length, ntohs(stcp->th_sum));
	} else {
		struct mbuf *smbuf = fa->fa_smbuf;
		smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
		if (smbuf->m_pkthdr.rx_seg_cnt == 0) {
			/* First time we append packets, need to set it to 1 */
			smbuf->m_pkthdr.rx_seg_cnt = 1;
		}
		static_assert(sizeof(result) == sizeof(smbuf->m_pkthdr.rx_seg_cnt));
		if (!os_add_overflow(1, smbuf->m_pkthdr.rx_seg_cnt, &result)) {
			smbuf->m_pkthdr.rx_seg_cnt = result;
		}
		SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
		    smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
	}
}
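
/*
 * Worked example (illustrative): merging a second 1448-byte segment into
 * an IPv4 super packet whose ip_len is currently 1500:
 *
 *	ip_len:  1500 + 1448 = 2948
 *	ip_sum:  fixed incrementally for the added length
 *	th_sum:  fixed for htons(1448) (pseudo-header length) and for
 *	         data_csum, byte-swapped first when the previous IP
 *	         length was odd (RFC 1071 section 2)
 *
 * With hardware LRO the fix-sum hook is the no-op variant instead, so
 * the header checksums are not maintained here.
 */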

/*
 * Copy metadata from the source packet to the destination packet.
 */
static void
pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
}

static void
pkt_finalize(kern_packet_t ph)
{
	int err = __packet_finalize(ph);
	VERIFY(err == 0);
#if (DEVELOPMENT || DEBUG)
	struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
	uint8_t *buf;
	MD_BUFLET_ADDR_ABS(pkt, buf);
	buf += pkt->pkt_headroom + pkt->pkt_l2_len;
	DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
	    uint8_t *, buf);
#endif
}

static inline uint32_t
estimate_buf_cnt(struct flow_entry *fe, uint32_t total_bytes, uint32_t total_pkts,
    uint32_t min_bufsize, uint32_t agg_bufsize)
{
	uint32_t max_ip_len = MAX_AGG_IP_LEN();
	uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
	uint32_t hdr_overhead;

	if (__improbable(sk_fsw_rx_agg_tcp == 0)) {
		return MIN(total_pkts, MAX_BUFLET_COUNT);
	}

	agg_size = MIN(agg_size, agg_bufsize);

	hdr_overhead = (total_bytes / max_ip_len) *
	    (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
	    sizeof(struct tcphdr));

	return ((total_bytes + hdr_overhead) / agg_size) + 1;
}
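
/*
 * Worked example (illustrative): with total_bytes = 256KB, a 16KB
 * max_ip_len and a 4KB agg_size, the batch spans 16 super packets, each
 * contributing one IPv6 + TCP header of overhead:
 *
 *	hdr_overhead = 16 * (40 + 20) = 960 bytes
 *	buflets      = (262144 + 960) / 4096 + 1 = 65
 *
 * The caller then clamps the estimate to MAX_BUFLET_COUNT.
 */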

SK_INLINE_ATTRIBUTE
static inline void
_append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
    _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
{
	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
		kern_buflet_t buf = dbuf_array->dba_buflet[i];
		VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
		pbuf = buf;
		dbuf_array->dba_buflet[i] = NULL;
	}
	ASSERT(pbuf != NULL);
	dbuf_array->dba_num_dbufs = 0;
	*lbuf = pbuf;
}

SK_INLINE_ATTRIBUTE
static inline void
_free_dbuf_array(struct kern_pbufpool *pp,
    _dbuf_array_t *dbuf_array)
{
	for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
		kern_buflet_t buf = dbuf_array->dba_buflet[i];
		pp_free_buflet(pp, buf);
		dbuf_array->dba_buflet[i] = NULL;
	}
	dbuf_array->dba_num_dbufs = 0;
}

static inline void
finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
    struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
    uint16_t bufcnt)
{
	(*spkts)++;
	if (bufcnt > 1) {
		(*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
	}
	pkt_finalize(*sph);
	if ((*spkt)->pkt_length > *largest_spkt) {
		*largest_spkt = (*spkt)->pkt_length;
	}
	pkt_agg_log(*spkt, kernproc, false);
	DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
	*sph = 0;
	*spkt = NULL;
	FLOW_AGG_CLEAR(fa);
}

static inline void
converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
{
	if (fe->fe_rx_largest_size > largest_agg_size) {
		/*
		 * Make it slowly move towards largest_agg_size if we
		 * consistently get non-aggregatable sizes.
		 *
		 * If we start at 16K, this makes us go to 4K within 6 rounds
		 * and down to 2K within 12 rounds.
		 */
		fe->fe_rx_largest_size -=
		    ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
	} else {
		fe->fe_rx_largest_size +=
		    ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
	}
}
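
/*
 * Worked example (illustrative): each call moves fe_rx_largest_size a
 * quarter of the way toward largest_agg_size, so the gap decays
 * geometrically:
 *
 *	gap_n = gap_0 * (3/4)^n
 *
 * Converging from 16K toward a 2K target, the 14K gap shrinks to about
 * 2.5K after 6 rounds (size ~4.5K) and to about 0.4K after 12 rounds
 * (size ~2.4K), matching the 6/12-round figures in the comment above,
 * and the estimate converges without oscillating on a single outlier.
 */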
1607
1608 SK_NO_INLINE_ATTRIBUTE
1609 static void
flow_rx_agg_channel(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * rx_pkts,uint32_t rx_bytes,bool is_mbuf)1610 flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1611 struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf)
1612 {
1613 #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt, _reason, _flags) do { \
1614 pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
1615 (_pkt) = NULL; \
1616 FLOW_AGG_CLEAR(&fa); \
1617 prev_csum_ok = false; \
1618 } while (0)
	struct flow_agg fa;     /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq super_pkts;         /* dst super packets */
	struct pktq disposed_pkts;      /* done src packets */

	KPKTQ_INIT(&super_pkts);
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_channel_ring *ring;
	ring = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(ring == NULL)) {
		SK_ERR("Rx ring is NULL");
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
		    KPKTQ_LEN(rx_pkts));
		pp_drop_pktq(rx_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
		    DROP_REASON_FSW_DST_NXPORT_INVALID, __func__, __LINE__);
		return;
	}
	struct kern_pbufpool *dpp = ring->ckr_pp;
	ASSERT(dpp->pp_max_frags > 1);

	struct __kern_packet *pkt, *tpkt;
	/* state for super packet */
	struct __kern_packet *__single spkt = NULL;
	kern_packet_t sph = 0;
	kern_buflet_t __single sbuf = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t spkts = 0, bufcnt = 0;
	int err;

	struct fsw_stats *fsws = &fsw->fsw_stats;

	/* state for buflet batch alloc */
	uint32_t bh_cnt, bh_cnt_tmp;
	uint64_t buf_arr[MAX_BUFLET_COUNT];
	_dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
	uint32_t largest_spkt = 0;      /* largest aggregated packet size */
	uint32_t agg_bufsize;
	uint8_t iter = 0;
	bool large_buffer = false;

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
	SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(rx_pkts));

	if (__probable(fe->fe_rx_largest_size != 0 &&
	    NX_FSW_TCP_RX_AGG_ENABLED())) {
		if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
		    PP_BUF_SIZE_LARGE(dpp) == 0) {
			agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		} else {
			agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
			large_buffer = true;
		}
		bh_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
		    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
		DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
		bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
		bh_cnt_tmp = bh_cnt;
	} else {
		/*
		 * No payload, thus it's all small-sized ACKs/...
		 * OR aggregation is disabled.
		 */
		agg_bufsize = PP_BUF_SIZE_DEF(dpp);
		bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(rx_pkts), MAX_BUFLET_COUNT);
		DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
	}
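	/*
	 * Worked example (illustrative figures, not a spec): with ~48 KB
	 * of TCP payload queued and 16 KB large buflets available, the
	 * estimate above comes out near 48 KB / 16 KB = 3 buflets plus
	 * slack for headers; the MIN() then clamps whatever
	 * estimate_buf_cnt() returns to MAX_BUFLET_COUNT (32), so the
	 * batch allocation below stays bounded.
	 */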

	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
	    large_buffer);
	if (__improbable(bh_cnt == 0)) {
		SK_ERR("failed to alloc %u buflets (err %d), use slow path",
		    bh_cnt_tmp, err);
	}
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		ASSERT(pkt->pkt_qum.qum_pp != dpp);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/* supports TCP only */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);
		uint16_t data_csum = 0;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;
		err = flow_pkt_track(fe, pkt, true);
		if (__improbable(err != 0)) {
			STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
			/* trigger RST if needed */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_FLOW_TRACK_ERR, DROPTAP_FLAG_DIR_IN);
			continue;
		}

		if (is_mbuf) {  /* compat */
			m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
			pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
			if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				pkt->pkt_pflags |= PKT_F_WAKE_PKT;
			}
		}

		if (prev_csum_ok && sbuf) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);

			if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
			    sbuf->buf_dlen >= plen - thlen) {
				/*
				 * No need for a new packet, just append
				 * the payload to the current tail buflet
				 * (sbuf).
				 */
				csum_ok = copy_pkt_csum_packed(pkt, plen, NULL,
				    is_ipv4, NULL, sbuf, &data_csum, NULL);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "is wrong");
					DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
					/*
					 * Turns out, the checksum is wrong!
					 * Fall back to no-agg mode.
					 */
					agg_ok = false;
				} else {
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
					goto next;
				}
			}
		}

		/* calculate number of buflets required */
		bh_cnt_tmp = howmany(plen, agg_bufsize);
		if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
			STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
			SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
			    plen);
			__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
			    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
			continue;
		}
		if (bh_cnt < bh_cnt_tmp) {
			uint32_t tmp;

			if (iter != 0) {
				/*
				 * rearrange the array for additional
				 * allocation
				 */
				uint8_t i;
				for (i = 0; i < bh_cnt; i++, iter++) {
					buf_arr[i] = buf_arr[iter];
					buf_arr[iter] = 0;
				}
				iter = 0;
			}
			tmp = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
			    PP_BUF_SIZE_DEF(dpp), agg_bufsize);
			tmp = MIN(tmp, MAX_BUFLET_COUNT);
			tmp = MAX(tmp, bh_cnt_tmp);
			tmp -= bh_cnt;
			ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
			DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP, large_buffer);
			bh_cnt += tmp;
			if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("buflet alloc failed (err %d)", err);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
				continue;
			}
		}
		/* Use pre-allocated buflets */
		ASSERT(bh_cnt >= bh_cnt_tmp);
		dbuf_array.dba_num_dbufs = bh_cnt_tmp;
		while (bh_cnt_tmp-- > 0) {
			/*
			 * -fbounds-safety: buf_arr[iter] is a uint64_t, so
			 * forging it
			 */
			dbuf_array.dba_buflet[bh_cnt_tmp] =
			    __unsafe_forge_single(kern_buflet_t, buf_arr[iter]);
			buf_arr[iter] = 0;
			bh_cnt--;
			iter++;
		}
		/* copy and checksum TCP data */
		if (agg_ok) {
			int added = 0;
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);

			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("Checksum for aggregation on new "
				    "buflets is wrong");
				DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
				agg_ok = false;
				/* reset the used buflets */
				uint8_t j;
				for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
					VERIFY(kern_buflet_set_data_length(
						dbuf_array.dba_buflet[j], 0) == 0);
				}
				goto non_agg;
			}

			/*
			 * There was not enough space in the tail buflet
			 * (sbuf), thus we must have added payload to the
			 * newly allocated buflets.
			 */
			VERIFY(added > 0);
		} else {
non_agg:
			ASSERT(dbuf_array.dba_num_dbufs != 0);
			csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
			    &data_csum, is_ipv4);
			if (__improbable(!csum_ok)) {
				STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
				SK_ERR("%d incorrect csum", __LINE__);
				DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
			}
		}
		if (agg_ok) {
			ASSERT(fa.fa_spkt == spkt);
			ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
			/* update current packet header */
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt += dbuf_array.dba_num_dbufs;
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
		} else {
			/* Finalize the current super packet */
			if (sph != 0) {
				finalize_super_packet(&spkt, &sph, &fa,
				    &largest_spkt, &spkts, bufcnt);
			}

			/* New super packet */
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
				__RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt,
				    DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN);
				continue;
			}
			spkt = SK_PTR_ADDR_KPKT(sph);
			pkt_copy_metadata(pkt, spkt);
			/* Packet length for super packet starts from L3 */
			spkt->pkt_length = plen;
			spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
			spkt->pkt_headroom = 0;
			spkt->pkt_l2_len = 0;
			spkt->pkt_seg_cnt = 1;

			ASSERT(dbuf_array.dba_num_dbufs > 0);
			bufcnt = dbuf_array.dba_num_dbufs;
			sbuf = kern_packet_get_next_buflet(sph, NULL);
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);

			KPKTQ_ENQUEUE(&super_pkts, spkt);
			_UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
			_UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
			spkt->pkt_policy_id = fe->fe_policy_id;
			spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
			spkt->pkt_transport_protocol =
			    fe->fe_transport_protocol;
			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	/* Free unused buflets */
	STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
	while (bh_cnt > 0) {
		/* -fbounds-safety: buf_arr[iter] is a uint64_t, so forging it */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    buf_arr[iter]));
		buf_arr[iter] = 0;
		bh_cnt--;
		iter++;
	}
	/* Finalize the last super packet */
	if (sph != 0) {
		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
		    &spkts, bufcnt);
	}
	converge_aggregation_size(fe, largest_spkt);
	DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
	if (__improbable(is_mbuf)) {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
	} else {
		STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
	}
	FLOW_STATS_IN_ADD(fe, spackets, spkts);

	KPKTQ_FINI(rx_pkts);

	if (KPKTQ_LEN(&super_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, ring, &super_pkts);
	}
	KPKTQ_FINI(&super_pkts);

	pp_free_pktq(&disposed_pkts);
}

/* trim zero-length mbufs out of a super mbuf (smbuf) chain */
static bool
_finalize_smbuf(struct mbuf *smbuf)
{
	/* the 1st mbuf always contains something, so start with the 2nd one */
	struct mbuf *m_chained = smbuf->m_next;
	struct mbuf *prev_m = smbuf;
	bool freed = false;

	while (m_chained != NULL) {
		if (m_chained->m_len != 0) {
			prev_m = m_chained;
			m_chained = m_chained->m_next;
			continue;
		}
		prev_m->m_next = m_chained->m_next;
		m_free(m_chained);
		m_chained = prev_m->m_next;
		freed = true;
	}
	return freed;
}
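
/*
 * Illustrative sketch (user-space, not compiled into this file):
 * _finalize_smbuf() above is the classic "unlink empty nodes from a
 * singly-linked list" walk.  The node type and node_free() callback
 * below are hypothetical stand-ins for struct mbuf and m_free().
 */
#if 0
struct node {
	struct node *next;
	int len;
};

static bool
trim_empty_nodes(struct node *head, void (*node_free)(struct node *))
{
	/* the head is assumed non-empty, so start with the 2nd node */
	struct node *n = head->next;
	struct node *prev = head;
	bool freed = false;

	while (n != NULL) {
		if (n->len != 0) {
			prev = n;
			n = n->next;
			continue;
		}
		prev->next = n->next;   /* unlink the empty node ... */
		node_free(n);           /* ... then release it */
		n = prev->next;
		freed = true;
	}
	return freed;
}
#endif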

SK_NO_INLINE_ATTRIBUTE
static void
flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, struct mbufq *host_mq,
    uint32_t rx_bytes, bool is_mbuf)
{
#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags) do { \
	drop_packets++; \
	drop_bytes += (_pkt)->pkt_length; \
	pp_drop_packet_single(_pkt, fsw->fsw_ifp, _flags, _reason, __func__, __LINE__); \
	(_pkt) = NULL; \
	FLOW_AGG_CLEAR(&fa); \
	prev_csum_ok = false; \
} while (0)
	struct flow_agg fa;     /* states */
	FLOW_AGG_CLEAR(&fa);

	struct pktq disposed_pkts;      /* done src packets */
	KPKTQ_INIT(&disposed_pkts);

	struct __kern_packet *pkt, *tpkt;
	/* points to the first mbuf of the chain */
	struct mbuf *m_chain = NULL;
	/* super mbuf; at the end it points to the last mbuf packet */
	struct mbuf *smbuf = NULL, *curr_m = NULL;
	bool prev_csum_ok = false, csum_ok, agg_ok;
	uint16_t smbufs = 0, smbuf_finalized = 0;
	uint32_t bytes = 0, rcvd_ulen = 0;
	uint32_t rcvd_packets = 0, rcvd_bytes = 0;      /* raw packets & bytes */
	uint32_t drop_packets = 0, drop_bytes = 0;      /* dropped packets & bytes */
	uint32_t largest_smbuf = 0;
	int err = 0;

	struct fsw_stats *fsws = &fsw->fsw_stats;
	bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);

	SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));

	/* state for mbuf batch alloc */
	uint32_t mhead_cnt = 0;
	uint32_t mhead_bufsize = 0;
	struct mbuf *mhead = NULL;

	uint16_t l2len = KPKTQ_FIRST(rx_pkts)->pkt_l2_len;

	SK_DF(logflags, "Rx input queue bytes %u", rx_bytes);

	if (__probable(!is_mbuf)) {
		/*
		 * Batch mbuf alloc is based on
		 * convert_native_pkt_to_mbuf_chain
		 */
		if (__probable(fe->fe_rx_largest_size != 0 &&
		    NX_FSW_TCP_RX_AGG_ENABLED())) {
			unsigned int num_segs = 1;
			int pktq_len = KPKTQ_LEN(rx_pkts);

			if (fe->fe_rx_largest_size <= MCLBYTES &&
			    rx_bytes / pktq_len <= MCLBYTES) {
				mhead_bufsize = MCLBYTES;
			} else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
			    rx_bytes / pktq_len <= MBIGCLBYTES) {
				mhead_bufsize = MBIGCLBYTES;
			} else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
			    rx_bytes / pktq_len <= M16KCLBYTES) {
				mhead_bufsize = M16KCLBYTES;
			} else {
				mhead_bufsize = M16KCLBYTES * 2;
				num_segs = 2;
			}
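			/*
			 * Worked example (illustrative numbers): with
			 * fe_rx_largest_size = 8 KB and an average of
			 * rx_bytes / pktq_len = 6 KB per packet, the first
			 * two rungs (MCLBYTES = 2 KB, MBIGCLBYTES = 4 KB)
			 * are skipped and the ladder settles on M16KCLBYTES
			 * clusters with a single segment per chain.
			 */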

try_again:
			if (rx_bytes != 0) {
				mhead_cnt = estimate_buf_cnt(fe, rx_bytes, KPKTQ_LEN(rx_pkts),
				    MCLBYTES, mhead_bufsize);
			} else {
				/* No payload, thus it's all small-sized ACKs/... */
				mhead_bufsize = MHLEN;
				mhead_cnt = pktq_len;
			}

			mhead = m_allocpacket_internal(&mhead_cnt,
			    mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);

			if (mhead == NULL) {
				if (mhead_bufsize > M16KCLBYTES) {
					mhead_bufsize = M16KCLBYTES;
					num_segs = 1;
					goto try_again;
				}

				if (mhead_bufsize == M16KCLBYTES) {
					mhead_bufsize = MBIGCLBYTES;
					goto try_again;
				}

				if (mhead_bufsize == MBIGCLBYTES) {
					mhead_bufsize = MCLBYTES;
					goto try_again;
				}
			}
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
		SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
		    mhead_bufsize);
	}

	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		if (tpkt != NULL) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
		}

		/* Validate l2 len, ip vers, is_mbuf */
		ASSERT(pkt->pkt_l2_len == l2len);
		ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
		ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
		ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
		ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
		ASSERT(!pkt->pkt_flow_ip_is_frag);
		ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);

		csum_ok = false;
		agg_ok = false;
		/*
		 * Since we only aggregate packets with the same header
		 * length, leverage the pkt metadata.
		 */
		uint32_t thlen = (pkt->pkt_flow_ip_hlen +
		    pkt->pkt_flow_tcp_hlen);
		uint32_t plen = (thlen + pkt->pkt_flow_ulen);

		/*
		 * Rather than calling flow_pkt_track() for each
		 * packet here, we accumulate received packet stats
		 * for the call to flow_track_stats() below. This
		 * is because flow tracking is a no-op for traffic
		 * that belongs to the host stack.
		 */
		rcvd_ulen += pkt->pkt_flow_ulen;
		rcvd_bytes += pkt->pkt_length;
		rcvd_packets++;

		KPKTQ_REMOVE(rx_pkts, pkt);
		rx_bytes -= pkt->pkt_flow_ulen;

		/* packet is for a BSD flow; create an mbuf chain */
		uint32_t len = (l2len + plen);
		uint16_t data_csum = 0;
		struct mbuf *__single m;
		bool is_wake_pkt = false;
		if (__improbable(is_mbuf)) {
			m = pkt->pkt_mbuf;

			if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
				is_wake_pkt = true;
			}

			/* Detach mbuf from source pkt */
			KPKT_CLEAR_MBUF_DATA(pkt);

			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= plen);
			/* Remove the trailer */
			if (trailer > 0) {
				m_adj(m, -trailer);
			}
			if ((uint32_t)m->m_len < (l2len + thlen)) {
				m = m_pullup(m, (l2len + thlen));
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}
			/* attached mbuf is already allocated */
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
		} else { /* native */
			uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
			    l2len;
			uint32_t tot_len = (len + pad);
			/* remember largest aggregated packet size */
			if (smbuf) {
				/* add pad bytes to account for L2 padding */
				if (largest_smbuf <
				    (uint32_t)m_pktlen(smbuf) + pad) {
					largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
				}
			}

			if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
				is_wake_pkt = true;
			}

			if (prev_csum_ok && curr_m) {
				ASSERT(fa.fa_smbuf == smbuf);
				ASSERT(!fa.fa_sobj_is_pkt);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);

				if (agg_ok &&
				    M_TRAILINGSPACE(curr_m) >= plen - thlen) {
					/*
					 * No need for a new mbuf,
					 * just append to curr_m.
					 */
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);

					if (!csum_ok) {
						STATS_INC(fsws,
						    FSW_STATS_RX_AGG_BAD_CSUM);
						SK_ERR("Checksum for "
						    "aggregation is wrong");
						DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
						/*
						 * Turns out, the checksum is
						 * wrong!  Fall back to no-agg
						 * mode.
						 */
						agg_ok = false;
					} else {
						/*
						 * We only added payload,
						 * thus -thlen.
						 */
						bytes += (plen - thlen);
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
						goto next;
					}
				}
			}

			/*
			 * If the batch allocation returned partial success,
			 * or this packet needs a larger chain, do another
			 * non-blocking allocation here.
			 */
			m = mhead;
			if (__improbable(m == NULL ||
			    tot_len > mhead_bufsize)) {
				unsigned int num_segs = 1;
				if (tot_len > M16KCLBYTES) {
					num_segs = 0;
				}

				ASSERT(mhead_cnt == 0 || mhead != NULL);
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
				if (err != 0) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf alloc failed (err %d), "
					    "maxchunks %d, len %d", err, num_segs,
					    tot_len);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
			} else {
				ASSERT(mhead_cnt > 0);
				mhead = m->m_nextpkt;
				m->m_nextpkt = NULL;
				mhead_cnt--;
			}
			m->m_data += pad;
			m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

			/*
			 * copy and checksum l3, l4 and payload; the l2
			 * header is copied later, and only if we can't
			 * aggregate, as an optimization
			 */
			m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
			_dbuf_array_t dbuf_array = {.dba_is_buflet = false};
			if (agg_ok) {
				int added = 0, dbuf_idx = 0;
				struct mbuf *m_tmp = m;
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);

				if (!csum_ok) {
					STATS_INC(fsws,
					    FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("Checksum for aggregation "
					    "on new mbuf is wrong");
					DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
					agg_ok = false;
					goto non_agg;
				}

				/*
				 * There was not enough space in curr_m,
				 * thus we must have added to m->m_data.
				 */
				VERIFY(added > 0);
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);

				/*
				 * We account for whatever we added
				 * to m later on, thus - added.
				 */
				bytes += plen - thlen - added;
			} else {
non_agg:
				dbuf_array.dba_num_dbufs = 0;
				uint32_t m_chain_max_len = 0;
				struct mbuf *m_tmp = m;
				int dbuf_idx = 0;
				while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
					dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
					dbuf_array.dba_num_dbufs += 1;
					m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
					m_tmp = m_tmp->m_next;
					dbuf_idx++;
				}
				ASSERT(m_tmp == NULL);

				m->m_len += l2len;
				m->m_pkthdr.len += l2len;
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
				if (__improbable(!csum_ok)) {
					STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
					SK_ERR("%d incorrect csum", __LINE__);
					DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
				}
				VERIFY(m->m_len <= m->m_pkthdr.len &&
				    (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
			}

			STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
			STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);

			m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
			m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
			if (__improbable((pkt->pkt_link_flags &
			    PKT_LINKF_BCAST) != 0)) {
				m->m_flags |= M_BCAST;
			}
			if (__improbable((pkt->pkt_link_flags &
			    PKT_LINKF_MCAST) != 0)) {
				m->m_flags |= M_MCAST;
			}
			/*
			 * Note that these flags have the same value,
			 * except PACKET_CSUM_PARTIAL.
			 */
			m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
			    PACKET_CSUM_RX_FLAGS);

			/* Set the rcvif */
			m->m_pkthdr.rcvif = fsw->fsw_ifp;

			/* Make sure to propagate the wake pkt flag */
			if (is_wake_pkt) {
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
		ASSERT(m != NULL);
		ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
		ASSERT((m->m_flags & M_HASFCS) == 0);
		ASSERT(m->m_nextpkt == NULL);

		if (__improbable(is_mbuf)) {
			if (prev_csum_ok && csum_ok) {
				ASSERT(fa.fa_smbuf == smbuf);
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
			}
		}

		if (agg_ok) {
			ASSERT(is_wake_pkt == false);
			ASSERT(fa.fa_smbuf == smbuf);
			ASSERT(!fa.fa_sobj_is_pkt);
			if (__improbable(is_mbuf)) {
				bytes += (m_pktlen(m) - l2len);
				/* adjust mbuf by l2, l3 and l4 hdr */
				m_adj(m, l2len + thlen);
			} else {
				bytes += m_pktlen(m);
			}

			m->m_flags &= ~M_PKTHDR;
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
			while (curr_m->m_next != NULL) {
				curr_m = curr_m->m_next;
			}
			curr_m->m_next = m;
			curr_m = m;
			m = NULL;
		} else {
			if ((uint32_t)m->m_len < l2len) {
				m = m_pullup(m, l2len);
				if (m == NULL) {
					STATS_INC(fsws,
					    FSW_STATS_RX_DROP_NOMEM_BUF);
					SK_ERR("mbuf pullup failed (err %d)",
					    err);
					__RX_AGG_HOST_DROP_SOURCE_PACKET(pkt,
					    DROP_REASON_FSW_GSO_NOMEM_MBUF, DROPTAP_FLAG_DIR_IN);
					continue;
				}
				m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
			}

			/* copy l2 header for native */
			if (__probable(!is_mbuf)) {
				uint16_t llhoff = pkt->pkt_headroom;
				uint8_t *baddr;
				MD_BUFLET_ADDR_ABS(pkt, baddr);
				ASSERT(baddr != NULL);
				baddr += llhoff;
				pkt_copy(baddr, m_mtod_current(m), l2len);
			}
			/* adjust mbuf by l2 hdr */
			m_adj(m, l2len);
			bytes += m_pktlen(m);

			/*
			 * Aggregated packets can be skipped by pktap because
			 * the original pre-aggregated chain already passed
			 * through pktap (see fsw_snoop()) before entering
			 * this function.
			 */
			m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;

			if (m_chain == NULL) {
				/* this is the start of the chain */
				m_chain = m;
				smbuf = m;
				curr_m = m;
			} else if (smbuf != NULL) {
				/*
				 * set m to be the next packet
				 */
				mbuf_agg_log(smbuf, kernproc, is_mbuf);
				smbuf->m_nextpkt = m;
				/*
				 * Clean up (finalize) a smbuf only if it
				 * pre-allocated >1 segments, which only
				 * happens when mhead_bufsize > M16KCLBYTES.
				 */
				if (_finalize_smbuf(smbuf)) {
					FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
				}
				smbuf_finalized++;
				smbuf = m;
				curr_m = m;
			} else {
				VERIFY(0);
			}

			smbufs++;
			m = NULL;

			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
			/*
			 * If the super packet is an mbuf which can't
			 * accommodate sizeof(struct ip_tcp_mask) or
			 * sizeof(struct ip6_tcp_mask) in a single buffer,
			 * then do the aggregation check in the slow path.
			 * Note that on Intel platforms, an mbuf without a
			 * cluster has only 80 bytes available for data.
			 * That means if a packet contains an Ethernet
			 * header, the mbuf won't be able to fully contain
			 * "struct ip_tcp_mask" or "struct ip6_tcp_mask"
			 * in a single buffer, because
			 * sizeof(struct ip_tcp_mask) and
			 * sizeof(struct ip6_tcp_mask) are both 80 bytes
			 * as well.
			 */
			if (__improbable(smbuf->m_len <
			    ((m_mtod_current(smbuf) - (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + MASK_SIZE))) {
				fa.fa_sobj_is_short = true;
			}
		}
next:
		pkt_agg_log(pkt, kernproc, true);
		prev_csum_ok = csum_ok;
		KPKTQ_ENQUEUE(&disposed_pkts, pkt);
	}

	KPKTQ_FINI(rx_pkts);

	/* Free any leftover mbufs, true only for native */
	if (__improbable(mhead != NULL)) {
		ASSERT(mhead_cnt != 0);
		STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	converge_aggregation_size(fe, largest_smbuf);

	if (smbufs > 0) {
		/* Last smbuf */
		mbuf_agg_log(smbuf, kernproc, is_mbuf);
		SK_DF(logflags, "smbuf count %u", smbufs);

		ASSERT(m_chain != NULL);
		ASSERT(smbuf != NULL);

		/*
		 * If the last smbuf needs to be finalized
		 * (mhead_bufsize > M16KCLBYTES) but has not been yet
		 * (smbuf_finalized < smbufs), do it now.
		 */
		if (smbuf_finalized < smbufs &&
		    _finalize_smbuf(smbuf)) {
			FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
		}

		/*
		 * Enqueue smbufs for the caller to process.
		 */
		mchain_agg_log(m_chain, kernproc, is_mbuf);
		mbufq_enqueue(host_mq, m_chain, smbuf, smbufs, bytes);

		if (__improbable(is_mbuf)) {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
		} else {
			STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
		}
		FLOW_STATS_IN_ADD(fe, spackets, smbufs);

		ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
	}

	/* record (raw) number of packets and bytes */
	ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
	ASSERT((int)(rcvd_packets - drop_packets) >= 0);
	flow_track_stats(fe, (rcvd_bytes - drop_bytes),
	    (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);

	pp_free_pktq(&disposed_pkts);
}

void
flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
    uint32_t flags)
{
#pragma unused(flags)
	struct pktq dropped_pkts;
	bool is_mbuf;

	if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) {
		dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, host_mq, FLOW_PROC_FLAG_FRAGMENTS);
		return;
	}

	KPKTQ_INIT(&dropped_pkts);

	if (!dp_flow_rx_route_process(fsw, fe)) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&dropped_pkts));
		pp_drop_pktq(&dropped_pkts, fsw->fsw_ifp, DROPTAP_FLAG_DIR_IN,
		    DROP_REASON_FSW_FLOW_NONVIABLE, __func__, __LINE__);
		return;
	}

	is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(rx_pkts)));

	if (fe->fe_nx_port == FSW_VP_HOST) {
		boolean_t do_rx_agg;

		/* BSD flow */
		if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
			do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
			    SK_FSW_RX_AGG_TCP_HOST_ON);
		} else {
			do_rx_agg = !dlil_has_ip_filter() &&
			    !dlil_has_if_filter(fsw->fsw_ifp);
		}
		if (__improbable(!do_rx_agg)) {
			fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
			return;
		}
		if (__improbable(pktap_total_tap_count != 0)) {
			fsw_snoop(fsw, fe, rx_pkts, true);
		}
		flow_rx_agg_host(fsw, fe, rx_pkts, host_mq, rx_bytes, is_mbuf);
	} else {
		/* channel flow */
		if (__improbable(pktap_total_tap_count != 0)) {
			fsw_snoop(fsw, fe, rx_pkts, true);
		}
		flow_rx_agg_channel(fsw, fe, rx_pkts, rx_bytes, is_mbuf);
	}
}
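
/*
 * Illustrative sketch (not compiled): the host-path aggregation policy
 * used above, restated as a standalone predicate.  The sysctl values and
 * the dlil_has_*_filter() helpers come from the code above; the wrapper
 * itself is a hypothetical restatement, not part of the driver path.
 */
#if 0
static boolean_t
rx_agg_host_enabled_example(struct nx_flowswitch *fsw)
{
	if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
		/* explicit override: ON aggregates, OFF does not */
		return sk_fsw_rx_agg_tcp_host == SK_FSW_RX_AGG_TCP_HOST_ON;
	}
	/*
	 * AUTO: aggregate only when no IP or interface filter needs to
	 * observe the individual, pre-aggregation frames.
	 */
	return !dlil_has_ip_filter() && !dlil_has_if_filter(fsw->fsw_ifp);
}
#endif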