/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * BSD LICENSE
 *
 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of NEC Europe Ltd. nor the names of
 *   its contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <kern/sched_prim.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <net/bpf.h>
#include <net/if_ports_used.h>
#include <net/pktap.h>
#include <net/pktsched/pktsched_netem.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX            256
#define FSW_ZONE_NAME           "skywalk.nx.fsw"

#define FSW_STATS_VAL(x)        STATS_VAL(&fsw->fsw_stats, x)
#define FSW_STATS_INC(x)        STATS_INC(&fsw->fsw_stats, x)
#define FSW_STATS_ADD(x, n)     STATS_ADD(&fsw->fsw_stats, x, n)

static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ       31      /* some mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ      13      /* some prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

#define FSW_REAP_IVAL           (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES       (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES       (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES      (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES       1
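/*
 * With the default 1-second reap interval, the shifted thresholds above
 * (FSW_REAP_IVAL << 5) come out to roughly 32 seconds.
 */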

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");

/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *   0 means force IP reassembly off
 *   1 means force IP reassembly on
 *   2 means don't force the value, use what's appropriate for this flowswitch
 */
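/*
 * Example (from user space, requires appropriate privileges):
 *
 *	sysctl kern.skywalk.flowswitch.ip_reass=1	# force reassembly on
 *	sysctl kern.skywalk.flowswitch.ip_reass	# read the current value
 */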
#define FSW_IP_REASS_FORCE_OFF  0
#define FSW_IP_REASS_FORCE_ON   1
#define FSW_IP_REASS_NO_FORCE   2

uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;

static int
fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value > FSW_IP_REASS_NO_FORCE) {
			return EINVAL;
		}
		fsw_ip_reass = new_value;
	}
	return error;
}

SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");

#if (DEVELOPMENT || DEBUG)
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	    &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
		SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en)); \
		if ((_f) != NULL) \
			(_f)(__VA_ARGS__); \
	} \
} while (0)
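/*
 * Each injection point is keyed by a small integer _en; setting bit _en in
 * the kern.skywalk.flowswitch.fsw_inject_error sysctl (defined below) arms
 * it, e.g. a value of 4 (1ULL << 2) arms injection point 2.
 */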

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */

static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
    struct flow_entry *);
static void fsw_reap_thread_func(void *, wait_result_t);
static void fsw_reap_thread_cont(void *, wait_result_t);
static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
static uint32_t fsw_process_deferred(struct nx_flowswitch *);
static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);

static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
    struct __kern_packet *);

static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
    uint32_t, uint32_t);

static int __fsw_dp_inited = 0;

int
fsw_dp_init(void)
{
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}

void
fsw_dp_uninit(void)
{
	if (__fsw_dp_inited) {
		flow_fini();
		flow_mgr_fini();

		__fsw_dp_inited = 0;
	}
}

static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}

#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
		ASSERT(_len == 0); \
		return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)
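/*
 * dp_drop_pktq() is intentionally a macro rather than a function: the early
 * "return" on an empty queue returns from the *calling* function, so it may
 * only be used at the tail of void-returning datapath functions.
 */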

SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_inp_flowhash,
			    tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}

#if (DEVELOPMENT || DEBUG)
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}

static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
#else /* !DEVELOPMENT && !DEBUG */
#define _fsw_error35_handler(...)
#define _fsw_error36_handler(...)
#endif /* DEVELOPMENT || DEBUG */
/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet
 * from a compat netif whose attached mbuf is larger than the MTU
 * due to LRO.
 */
static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
{
	uint32_t tlen = 0;
	uint32_t splen = spkt->pkt_length - skip_l2hlen;

	if (l2hlen != 0) {
		VERIFY(skip_l2hlen == 0);
		tlen += l2hlen;
	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
		splen -= ETHER_CRC_LEN;
	}

	tlen += splen;
	*copy_len = splen;

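	/*
	 * The destination can hold the copy if its total buflet space
	 * (buflet count * buflet size) minus the reserved headroom is at
	 * least the length being transferred.
	 */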
	return tlen <= ((__packet_get_buflet_count(dph) *
	       SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp->pp_buflet_size) - headroom);
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */

static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    dpkt->pkt_qum.qum_pp->pp_buflet_size);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to the destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		if (!FSW_NETAGENT_ENABLED(fsw)) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}

SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification:
	 * Ethernet (14) + IPv6 header (40) +
	 * IPv6 fragment header (8) +
	 * TCP header with options (60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}

static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;

	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
		rx_prepare_packet_mbuf(fsw, pkt);
	}

	return pkt;
}

static struct flow_entry *
lookup_flow_with_key(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

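	/*
	 * Packets in a batch often belong to the same flow; if the caller's
	 * previous entry is an exact 5-tuple match for this key, reuse it
	 * (taking a new reference) and skip the flow-table lookup.
	 */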
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		bool match;
		key.fk_mask = FKMASK_5TUPLE;
		match = (flow_key_cmp_mask(&prev_fe->fe_key,
		    &key, &fk_mask_5tuple)) == 0;
		if (match) {
			flow_entry_retain(prev_fe);
			return prev_fe;
		}
		key.fk_mask = saved_mask;
	}

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}

static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;
	fe = lookup_flow_with_key(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down, use host fe");
		flow_entry_release(&fe);
		fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
	}

	return fe;
}

static inline void
rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

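	/*
	 * The first packet for a flow links the entry onto 'fes' and donates
	 * the lookup reference to the list; subsequent packets only append
	 * to the flow's queue, so their extra reference is released here.
	 */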
	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static inline void
fsw_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;

	KPKTQ_INIT(pktq);

	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}

		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}

	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}

static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	membar_sync();

	r->ckr_ktail = idx_end;

	/* ensure global visibility */
	membar_sync();

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}

static void
pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
{
	ASSERT(KPKTQ_EMPTY(pktq));

	for (uint32_t i = 0; i < n_pkts; i++) {
		struct __kern_packet *pkt = pkts[i];
		ASSERT(pkt->pkt_nextpkt == NULL);
		KPKTQ_ENQUEUE(pktq, pkt);
	}
}

/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pkt_to_mbuf_chain(struct nx_flowswitch *fsw,
    struct flow_entry *fe, struct __kern_packet *pkt_chain,
    struct mbuf **m_chain, struct mbuf **m_tail, uint32_t *cnt,
    uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int one = 1;
	struct mbuf *mhead, *chain = NULL, *tail = NULL, **tailp = &chain;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *pkt;
	struct kern_pbufpool *pp;

	tot_cnt = *cnt;
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past. Note that we reset
	 * fe_rx_largest_msize below if we notice that we're under-utilizing the
	 * allocated buffers (thus disabling this batch allocation).
	 */
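	/*
	 * Cluster tiers tried below: MCLBYTES (2 KB), MBIGCLBYTES (4 KB),
	 * then M16KCLBYTES (16 KB); sizes beyond that skip batch allocation.
	 */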
	if (__probable((largest = fe->fe_rx_largest_msize) != 0)) {
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	max_pkt_len = pp->pp_buflet_size * pp->pp_max_frags;

	for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		if (__improbable(len > max_pkt_len || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(fe->fe_rx_largest_msize < tot_len)) {
			fe->fe_rx_largest_msize = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try a blocking allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			one = 1;
			if ((error = mbuf_allocpacket(MBUF_WAITOK, tot_len,
			    &one, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
		}
		m->m_data += pad;
		m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) are smaller than our batch buffers */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		fe->fe_rx_largest_msize = 0;
	}
	pp_free_packet_chain(pkt_chain, NULL);
	*m_chain = chain;
	*m_tail = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}

/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	if (llhlen > pkt->pkt_length) {
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}

SK_NO_INLINE_ATTRIBUTE
static void
convert_compat_pkt_to_mbuf_chain(struct nx_flowswitch *fsw,
    struct flow_entry *fe, struct __kern_packet *pkt_chain,
    struct mbuf **m_chain, struct mbuf **m_tail, uint32_t *cnt,
    uint32_t *bytes)
{
#pragma unused (fe)
	struct __kern_packet *pkt;
	struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t c = 0, b = 0;

	for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
		m = convert_compat_pkt_to_mbuf(fsw, pkt);
		if (__improbable(m == NULL)) {
			continue;
		}
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		c++;
		b += m_pktlen(m);
	}
	ASSERT(c <= *cnt);
	pp_free_packet_chain(pkt_chain, NULL);
	*m_chain = head;
	*m_tail = tail;
	*cnt = c;
	*bytes = b;
}

void
fsw_host_sendup(ifnet_t ifp, struct mbuf *m_chain, struct mbuf *m_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet_stat_increment_param s;

	bzero(&s, sizeof(s));
	s.packets_in = cnt;
	s.bytes_in = bytes;
	dlil_input_handler(ifp, m_chain, m_tail, &s, FALSE, NULL);
}

void
fsw_host_rx(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq *q;
	struct __kern_packet *pkt_chain;
	struct mbuf *m_chain = NULL, *m_tail = NULL;
	uint32_t cnt = 0, bytes = 0;
	boolean_t compat;

	q = &fe->fe_rx_pktq;
	pkt_chain = KPKTQ_FIRST(q);
	cnt = KPKTQ_LEN(q);
	KPKTQ_INIT(q);
	if (__improbable(pkt_chain == NULL)) {
		DTRACE_SKYWALK2(empty__pktq, struct nx_flowswitch *,
		    fsw, struct flow_entry *, fe);
		return;
	}

	/* All packets in the chain must have the same type */
	compat = ((pkt_chain->pkt_pflags & PKT_F_MBUF_DATA) != 0);
	if (compat) {
		convert_compat_pkt_to_mbuf_chain(fsw, fe, pkt_chain, &m_chain,
		    &m_tail, &cnt, &bytes);
	} else {
		convert_native_pkt_to_mbuf_chain(fsw, fe, pkt_chain, &m_chain,
		    &m_tail, &cnt, &bytes);
	}
	if (__improbable(m_chain == NULL)) {
		DTRACE_SKYWALK2(empty__chain, struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe);
		return;
	}
	fsw_host_sendup(fsw->fsw_ifp, m_chain, m_tail, cnt, bytes);
}

void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq);
}

static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	struct nexus_adapter *na = NULL;
	nexus_port_t port = fe->fe_nx_port;

	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
		SK_ERR("dev or host ports have no NA");
		return NULL;
	}

	if (__improbable(!nx_port_is_valid(nx, port))) {
		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	na = nx_port_get_na(nx, port);
	if (__improbable(na == NULL)) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(!NA_IS_ACTIVE(na))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(nx_port_is_defunct(nx, port))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	return na;
}

static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
{
	struct nexus_vp_adapter *na = NULL;
	struct __kern_channel_ring *r = NULL;

	na = VPNA(flow_get_na(fsw, fe));
	if (__improbable(na == NULL)) {
		return NULL;
	}

	switch (txrx) {
	case NR_RX:
		r = &na->vpna_up.na_rx_rings[0];
		break;
	case NR_TX:
		r = &na->vpna_up.na_tx_rings[0];
		break;
	default:
		VERIFY(0);
		/* not reached */
		__builtin_unreachable();
	}

	if (__improbable(KR_DROP(r))) {
		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
		    r->ckr_name, SK_KVA(r));
		return NULL;
	}

	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);

#if (DEVELOPMENT || DEBUG)
	if (r != NULL) {
		_FSW_INJECT_ERROR(4, r, NULL, null_func);
	}
#endif /* DEVELOPMENT || DEBUG */

	return r;
}

struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}

static inline struct __kern_channel_ring *
fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_TX);
}

static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			atomic_add_32(&fsw->fsw_pending_nonviable, 1);
			if (atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
				fsw_reap_sched(fsw);
			} else {
				atomic_add_32(&fsw->fsw_pending_nonviable, -1);
			}
		}
		if (fr != NULL) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
		    SK_KVA(fe));
		return false;
	}

	return true;
}

bool
dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	bool okay;
	okay = dp_flow_route_process(fsw, fe);
#if (DEVELOPMENT || DEBUG)
	if (okay) {
		_FSW_INJECT_ERROR(5, okay, false, null_func);
	}
#endif /* DEVELOPMENT || DEBUG */

	return okay;
}

void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;      /* done src packets */
	struct pktq dropped_pkts;       /* dropped src packets */
	struct pktq transferred_pkts;   /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		fsw_host_rx(fsw, fe);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
		    r->ckr_name, SK_KVA(r));
		goto done;
	}

	/*
	 * Estimate the total number of buflets for the packet chain: each
	 * allocated packet already carries one buflet, so only the overflow
	 * beyond one buflet per packet needs to be allocated separately.
	 */
	cnt = howmany(fe->fe_rx_pktq_bytes, dpp->pp_buflet_size);
	if (cnt > n_pkts) {
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		err = 0;
		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
			fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		} else {
			fe->fe_rx_pktq_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_ERR("flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
			n_bufs = howmany(pkt->pkt_length, dpp->pp_buflet_size);
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(fe->fe_rx_pktq_bytes,
					    dpp->pp_buflet_size);
					n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						KPKTQ_ENQUEUE(&dropped_pkts,
						    pkt);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, 0x%llx", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				bnew = (kern_buflet_t)buf_array[buf_array_iter];
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
			pkt->pkt_seg_cnt = 1;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
	KPKTQ_FINI(&transferred_pkts);

	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);

done:
	/* Free unused buflets */
	while (buf_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
}

static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, fpd, etc.) */
	fe->fe_rx_process(fsw, fe);

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);
}

static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong to the
	 * flowswitch, as wake packets for the host stack are handled
	 * by the host input function.
	 */
1605 #if (DEBUG || DEVELOPMENT)
1606 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
1607 /*
1608 * This is a one shot command
1609 */
1610 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
1611
1612 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1613 }
1614 #endif /* (DEBUG || DEVELOPMENT) */
1615 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1616 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1617 }
1618 }
1619
1620 static void
dp_rx_pktq(struct nx_flowswitch * fsw,struct pktq * pktq)1621 dp_rx_pktq(struct nx_flowswitch *fsw, struct pktq *pktq)
1622 {
1623 struct __kern_packet *pkt, *tpkt;
1624 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
1625 struct flow_entry *fe, *prev_fe;
1626 sa_family_t af;
1627 struct pktq dropped_pkts;
1628 int err;
1629
1630 KPKTQ_INIT(&dropped_pkts);
1631
1632 FSW_RLOCK(fsw);
1633 if (__improbable(FSW_QUIESCED(fsw))) {
1634 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
1635 KPKTQ_CONCAT(&dropped_pkts, pktq);
1636 goto done;
1637 }
1638 if (__improbable(fsw->fsw_demux == NULL)) {
1639 KPKTQ_CONCAT(&dropped_pkts, pktq);
1640 goto done;
1641 }
1642
1643 prev_fe = NULL;
1644 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
1645 if (__probable(tpkt)) {
1646 void *baddr;
1647 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1648 SK_PREFETCH(baddr, 0);
1649 /* prefetch L3 and L4 flow structs */
1650 SK_PREFETCHW(tpkt->pkt_flow, 0);
1651 SK_PREFETCHW(tpkt->pkt_flow, 128);
1652 }
1653
1654 KPKTQ_REMOVE(pktq, pkt);
1655
1656 pkt = rx_prepare_packet(fsw, pkt);
1657
1658 af = fsw->fsw_demux(fsw, pkt);
1659 if (__improbable(af == AF_UNSPEC)) {
1660 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1661 goto flow_batch;
1662 }
1663
1664 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1665 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
1666 if (__improbable(err != 0)) {
1667 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
1668 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1669 goto flow_batch;
1670 }
1671
1672 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1673 pkt = rx_process_ip_frag(fsw, pkt);
1674 if (pkt == NULL) {
1675 continue;
1676 }
1677 }
1678
1679 fe = rx_lookup_flow(fsw, pkt, prev_fe);
1680 if (__improbable(fe == NULL)) {
1681 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1682 prev_fe = NULL;
1683 continue;
1684 }
1685
1686 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
1687
1688 dp_rx_process_wake_packet(fsw, pkt);
1689
1690 flow_batch:
1691 rx_flow_batch_packet(&fes, fe, pkt);
1692 prev_fe = fe;
1693 }
1694
1695 struct flow_entry *tfe = NULL;
1696 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
1697 rx_flow_process(fsw, fe);
1698 TAILQ_REMOVE(&fes, fe, fe_rx_link);
1699 fe->fe_rx_pktq_bytes = 0;
1700 fe->fe_rx_frag_count = 0;
1701 flow_entry_release(&fe);
1702 }
1703
1704 /* XXX(OPTIMIZE) need to re-circulate extras back to HOST */
1705 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1706 if (!KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
1707 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
1708 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX,
1709 "re-circulate %d pkts to HOST", KPKTQ_LEN(&fe->fe_rx_pktq));
1710 rx_flow_process(fsw, fe);
1711 }
1712 flow_entry_release(&fe);
1713
1714 done:
1715 FSW_RUNLOCK(fsw);
1716
1717 dp_drop_pktq(fsw, &dropped_pkts);
1718 }
1719
1720 static void
1721 dp_rx_pkts(struct nx_flowswitch *fsw, struct __kern_packet *pkts[],
1722 uint32_t n_pkts)
1723 {
1724 struct pktq pktq;
1725 KPKTQ_INIT(&pktq);
1726 pkts_to_pktq(pkts, n_pkts, &pktq);
1727 dp_rx_pktq(fsw, &pktq);
1728 KPKTQ_FINI(&pktq);
1729 }
1730
1731 int
1732 fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t *pkts,
1733 uint32_t n_pkts)
1734 {
1736 struct nx_flowswitch *fsw = handle;
1737 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
1738 sk_protect_t protect;
1739 uint32_t i;
1740
1741 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
1742
1743 for (i = 0; i < n_pkts; i++) {
1744 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
1745 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
1746 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
1747 }
1748
1749 protect = sk_sync_protect();
1750 dp_rx_pkts(fsw, kpkts, n_pkts);
1751 sk_sync_unprotect(protect);
1752
1753 return 0;
1754 }
1755
1756 static void
1757 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
1758 {
1759 classq_pkt_t p;
1760 struct netem *ne;
1761 struct __kern_packet *pkt, *tpkt;
1762
1763 ASSERT(fsw->fsw_ifp != NULL);
1764 ne = fsw->fsw_ifp->if_input_netem;
1765 ASSERT(ne != NULL);
1766 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
1767 boolean_t pdrop;
1768 KPKTQ_REMOVE(q, pkt);
1769 CLASSQ_PKT_INIT_PACKET(&p, pkt);
1770 netem_enqueue(ne, &p, &pdrop);
1771 }
1772 }
1773
1774 void
1775 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_chain,
1776 struct nexus_pkt_stats *out_stats)
1777 {
1778 struct __kern_packet *pkt = pkt_chain, *next;
1779 struct nx_flowswitch *fsw;
1780 uint32_t n_bytes = 0, n_pkts = 0;
1781 uint64_t total_pkts = 0, total_bytes = 0;
1782 struct pktq q;
1783
1784 KPKTQ_INIT(&q);
1785 if (__improbable(devna->na_ifp == NULL ||
1786 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
1787 SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
1788 pp_free_packet_chain(pkt_chain, NULL);
1789 return;
1790 }
1791 while (pkt != NULL) {
1792 if (__improbable(pkt->pkt_trace_id != 0)) {
1793 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
1794 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
1795 }
1796 next = pkt->pkt_nextpkt;
1797 pkt->pkt_nextpkt = NULL;
1798
1799 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
1800 KPKTQ_ENQUEUE(&q, pkt);
1801 n_bytes += pkt->pkt_length;
1802 } else {
1803 DTRACE_SKYWALK1(non__finalized__drop,
1804 struct __kern_packet *, pkt);
1805 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
1806 pp_free_packet_single(pkt);
1807 pkt = NULL;
1808 }
1809 n_pkts = KPKTQ_LEN(&q);
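/* flush once a full batch accumulates, or at the end of the chain */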
1810 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
1811 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
1812 dp_rx_pktq(fsw, &q);
1813 } else {
1814 fsw_dev_input_netem_enqueue(fsw, &q);
1815 }
1816 total_pkts += n_pkts;
1817 total_bytes += n_bytes;
1818 n_pkts = 0;
1819 n_bytes = 0;
1820 KPKTQ_FINI(&q);
1821 }
1822 pkt = next;
1823 }
1824 ASSERT(KPKTQ_LEN(&q) == 0);
1825 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
1826 if (out_stats != NULL) {
1827 out_stats->nps_pkts = total_pkts;
1828 out_stats->nps_bytes = total_bytes;
1829 }
1830 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
1831 }
1832
1833 static int
1834 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1835 struct __kern_packet *dpkt)
1836 {
1837 struct mbuf *m = NULL;
1838 uint16_t bdlen, bdlim, bdoff;
1839 uint8_t *bdaddr;
1840 unsigned int one = 1;
1841 int err = 0;
1842
1843 err = mbuf_allocpacket(MBUF_DONTWAIT,
1844 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
1845 #if (DEVELOPMENT || DEBUG)
1846 if (m != NULL) {
1847 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
1848 }
1849 #endif /* DEVELOPMENT || DEBUG */
1850 if (__improbable(m == NULL)) {
1851 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1852 err = ENOBUFS;
1853 goto done;
1854 }
1855
1856 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
1857 if (fsw->fsw_frame_headroom > bdlim) {
1858 SK_ERR("not enough space in buffer for headroom");
1859 err = EINVAL;
1860 goto done;
1861 }
1862
1863 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
1864 dpkt->pkt_mbuf = m;
1865 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
1866
1867 /* packet copy into mbuf */
1868 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
1869 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
1870 fsw->fsw_frame_headroom, spkt->pkt_length,
1871 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
1872 spkt->pkt_csum_tx_start_off);
1873 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
1874
1875 /* header copy into dpkt buffer for classification */
1876 kern_packet_t sph = SK_PTR_ENCODE(spkt,
1877 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
1878 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
1879 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
1880 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
1881 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
1882 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
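/*
 * copy_len was clamped above, so this classification copy cannot
 * overrun the single buflet backing dpkt.
 */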
1883
1884 /*
1885 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data the
1886 * same as the buflet baddr. m_data always points to the beginning of
1887 * the packet and should represent the same location as baddr + headroom.
1888 */
1889 ASSERT((uintptr_t)m->m_data ==
1890 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
1891
1892 done:
1893 return err;
1894 }
1895
1896 static int
1897 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1898 struct __kern_packet *dpkt)
1899 {
1900 struct ifnet *ifp = fsw->fsw_ifp;
1901 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
1902
1903 if (headroom > UINT8_MAX) {
1904 SK_ERR("headroom too large %d", headroom);
1905 return ERANGE;
1906 }
1907 dpkt->pkt_headroom = (uint8_t)headroom;
1908 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
1909 dpkt->pkt_l2_len = 0;
1910 dpkt->pkt_link_flags = spkt->pkt_link_flags;
1911
1912 kern_packet_t sph = SK_PTR_ENCODE(spkt,
1913 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
1914 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
1915 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
1916 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
1917 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
1918 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
1919 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
1920 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
1921 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
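/*
 * The checksum start/stuff offsets passed above are rebased (minus the
 * source packet's headroom) so they are relative to the copied frame.
 */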
1922
1923 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
1924
1925 return 0;
1926 }
1927
1928 #if SK_LOG
1929 /* Hoisted out of line to reduce kernel stack footprint */
1930 SK_LOG_ATTRIBUTE
1931 static void
1932 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
1933 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
1934 {
1935 struct proc *p = current_proc();
1936 struct ifnet *ifp = fsw->fsw_ifp;
1937 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
1938
1939 if (error == ERANGE) {
1940 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
1941 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
1942 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
1943 (uint32_t)pp->pp_max_frags * pp->pp_buflet_size);
1944 } else if (error == ENOBUFS) {
1945 SK_DF(logflags, "%s(%d) packet allocation failure",
1946 sk_proc_name_address(p), sk_proc_pid(p));
1947 } else if (error == 0) {
1948 ASSERT(dpkt != NULL);
1949 char *daddr;
1950 MD_BUFLET_ADDR_ABS(dpkt, daddr);
1951 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
1952 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
1953 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
1954 (uint32_t)fsw->fsw_frame_headroom,
1955 (uint32_t)ifp->if_tx_headroom);
1956 SK_DF(logflags | SK_VERB_DUMP, "%s",
1957 sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
1958 } else {
1959 SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p), sk_proc_pid(p), error);
1960 }
1961 }
1962 #else
1963 #define dp_copy_to_dev_log(...)
1964 #endif /* SK_LOG */
1965
1966 static int
1967 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1968 struct __kern_packet *dpkt)
1969 {
1970 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
1971 struct ifnet *ifp = fsw->fsw_ifp;
1972 uint32_t dev_pkt_len;
1973 int err = 0;
1974
1975 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
1976 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
1977
1978 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
1979 /* Copy packet metadata */
1980 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1981 _PKT_COPY(spkt, dpkt);
1982 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
1983 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
1984 ASSERT(dpkt->pkt_mbuf == NULL);
1985
1986 /* Copy AQM metadata */
1987 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
1988 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
1989 _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
1990 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
1991 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
1992 dpkt->pkt_policy_id = spkt->pkt_policy_id;
1993
1994 switch (fsw->fsw_classq_enq_ptype) {
1995 case QP_MBUF:
1996 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
1997 break;
1998
1999 case QP_PACKET:
2000 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2001 spkt->pkt_length;
2002 if (dev_pkt_len > pp->pp_max_frags * pp->pp_buflet_size) {
2003 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2004 err = ERANGE;
2005 goto done;
2006 }
2007 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2008 break;
2009
2010 default:
2011 VERIFY(0);
2012 __builtin_unreachable();
2013 }
2014 done:
2015 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2016 return err;
2017 }
2018
2019 static struct mbuf *
2020 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2021 {
2022 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2023 ASSERT(pkt->pkt_mbuf != NULL);
2024 struct mbuf *m = pkt->pkt_mbuf;
2025
2026 /* pass additional metadata generated from flow parse/lookup */
2027 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2028 sizeof(pkt->pkt_flow_token));
2029 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2030 sizeof(pkt->pkt_flowsrc_token));
2031 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2032 sizeof(pkt->pkt_flowsrc_fidx));
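/*
 * The compile-time asserts above ensure the mbuf header fields are
 * exactly as wide as the packet flow token/index fields copied below.
 */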
2033 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2034 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2035 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2036 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2037 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2038 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2039 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2040
2041 /* The packet should have a timestamp by the time we get here. */
2042 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2043 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2044
2045 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2046 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2047 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2048 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2049 }
2050 KPKT_CLEAR_MBUF_DATA(pkt);
2051
2052 /* mbuf has been consumed, release packet as well */
2053 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2054 pp_free_packet_single(pkt);
2055 return m;
2056 }
2057
2058 static void
2059 convert_pkt_to_mbuf_chain(struct __kern_packet *pkt_chain,
2060 struct mbuf **chain, struct mbuf **tail,
2061 uint32_t *cnt, uint32_t *bytes)
2062 {
2063 struct __kern_packet *pkt = pkt_chain, *next;
2064 struct mbuf *m_chain = NULL, **m_tailp = &m_chain, *m = NULL;
2065 uint32_t c = 0, b = 0;
2066
2067 while (pkt != NULL) {
2068 next = pkt->pkt_nextpkt;
2069 pkt->pkt_nextpkt = NULL;
2070 m = convert_pkt_to_mbuf(pkt);
2071 ASSERT(m != NULL);
2072
2073 *m_tailp = m;
2074 m_tailp = &m->m_nextpkt;
2075 c++;
2076 b += m_pktlen(m);
2077 pkt = next;
2078 }
2079 if (chain != NULL) {
2080 *chain = m_chain;
2081 }
2082 if (tail != NULL) {
2083 *tail = m;
2084 }
2085 if (cnt != NULL) {
2086 *cnt = c;
2087 }
2088 if (bytes != NULL) {
2089 *bytes = b;
2090 }
2091 }
2092
2093 SK_NO_INLINE_ATTRIBUTE
2094 static int
2095 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2096 struct __kern_packet *pkt)
2097 {
2098 struct ifnet *ifp = fsw->fsw_ifp;
2099 boolean_t pkt_drop = FALSE;
2100 int err;
2101
2102 FSW_LOCK_ASSERT_HELD(fsw);
2103 ASSERT(fsw->fsw_classq_enabled);
2104 /*
2105 * we are using the first 4 bytes of flow_id as the AQM flow
2106 * identifier.
2107 */
2108 ASSERT(!uuid_is_null(pkt->pkt_flow_id));
2109 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2110 1, pkt->pkt_length);
2111
2112 if (__improbable(pkt->pkt_trace_id != 0)) {
2113 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2114 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2115 }
2116
2117 switch (fsw->fsw_classq_enq_ptype) {
2118 case QP_MBUF: { /* compat interface */
2119 struct mbuf *m;
2120
2121 m = convert_pkt_to_mbuf(pkt);
2122 ASSERT(m != NULL);
2123 pkt = NULL;
2124
2125 /* ifnet_enqueue consumes mbuf */
2126 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2127 m = NULL;
2128 #if (DEVELOPMENT || DEBUG)
2129 if (__improbable(!pkt_drop)) {
2130 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2131 }
2132 #endif /* DEVELOPMENT || DEBUG */
2133 if (pkt_drop) {
2134 FSW_STATS_INC(FSW_STATS_DROP);
2135 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2136 }
2137 break;
2138 }
2139 case QP_PACKET: { /* native interface */
2140 /* ifnet_enqueue consumes packet */
2141 err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2142 pkt = NULL;
2143 #if (DEVELOPMENT || DEBUG)
2144 if (__improbable(!pkt_drop)) {
2145 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2146 }
2147 #endif /* DEVELOPMENT || DEBUG */
2148 if (pkt_drop) {
2149 FSW_STATS_INC(FSW_STATS_DROP);
2150 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2151 }
2152 break;
2153 }
2154 default:
2155 err = EINVAL;
2156 VERIFY(0);
2157 /* NOTREACHED */
2158 __builtin_unreachable();
2159 }
2160
2161 return err;
2162 }
2163
2164 static int
2165 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2166 struct __kern_packet *pkt_chain, struct __kern_packet *pkt_tail,
2167 uint32_t cnt, uint32_t bytes)
2168 {
2169 struct ifnet *ifp = fsw->fsw_ifp;
2170 boolean_t pkt_drop = FALSE;
2171 uint32_t svc;
2172 int err;
2173
2174 FSW_LOCK_ASSERT_HELD(fsw);
2175 ASSERT(fsw->fsw_classq_enabled);
2176 /*
2177 * we are using the first 4 bytes of flow_id as the AQM flow
2178 * identifier.
2179 */
2180 ASSERT(!uuid_is_null(pkt_chain->pkt_flow_id));
2181
2182 /*
2183 * All packets in the flow should have the same svc.
2184 */
2185 svc = pkt_chain->pkt_svc_class;
2186 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2187
2188 switch (fsw->fsw_classq_enq_ptype) {
2189 case QP_MBUF: { /* compat interface */
2190 struct mbuf *m_chain = NULL, *m_tail = NULL;
2191 uint32_t c = 0, b = 0;
2192
2193 convert_pkt_to_mbuf_chain(pkt_chain, &m_chain, &m_tail, &c, &b);
2194 ASSERT(m_chain != NULL && m_tail != NULL);
2195 ASSERT(c == cnt);
2196 ASSERT(b == bytes);
2197 pkt_chain = NULL;
2198
2199 /* ifnet_enqueue consumes mbuf */
2200 err = ifnet_enqueue_mbuf_chain(ifp, m_chain, m_tail, cnt,
2201 bytes, FALSE, &pkt_drop);
2202 m_chain = NULL;
2203 m_tail = NULL;
2204 #if (DEVELOPMENT || DEBUG)
2205 if (__improbable(!pkt_drop)) {
2206 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2207 }
2208 #endif /* DEVELOPMENT || DEBUG */
2209 if (pkt_drop) {
2210 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2211 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2212 cnt);
2213 }
2214 break;
2215 }
2216 case QP_PACKET: { /* native interface */
2217 /* ifnet_enqueue consumes packet */
2218 err = ifnet_enqueue_pkt_chain(ifp, pkt_chain, pkt_tail, cnt,
2219 bytes, FALSE, &pkt_drop);
2220 pkt_chain = NULL;
2221 #if (DEVELOPMENT || DEBUG)
2222 if (__improbable(!pkt_drop)) {
2223 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2224 }
2225 #endif /* DEVELOPMENT || DEBUG */
2226 if (pkt_drop) {
2227 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2228 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2229 cnt);
2230 }
2231 break;
2232 }
2233 default:
2234 err = EINVAL;
2235 VERIFY(0);
2236 /* NOTREACHED */
2237 __builtin_unreachable();
2238 }
2239
2240 return err;
2241 }
2242
2243 /*
2244 * This code path needs to be kept for interfaces without logical link support.
2245 */
2246 static void
2247 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2248 boolean_t chain, uint32_t cnt, uint32_t bytes)
2249 {
2250 bool flowadv_is_set = false;
2251 struct __kern_packet *pkt, *tail, *tpkt;
2252 flowadv_idx_t flow_adv_idx;
2253 bool flowadv_cap;
2254 flowadv_token_t flow_adv_token;
2255 int err;
2256
2257 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2258 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2259
2260 if (chain) {
2261 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2262 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2263 KPKTQ_INIT(&fe->fe_tx_pktq);
2264 if (pkt == NULL) {
2265 return;
2266 }
2267 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2268 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2269 flow_adv_token = pkt->pkt_flow_token;
2270
2271 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2272
2273 /* set flow advisory if needed */
2274 if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
2275 flowadv_cap)) {
2276 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2277 flow_adv_idx, flow_adv_token);
2278 }
2279 } else {
2280 uint32_t c = 0, b = 0;
2281
2282 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2283 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2284
2285 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2286 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2287 flow_adv_token = pkt->pkt_flow_token;
2288
2289 c++;
2290 b += pkt->pkt_length;
2291 err = classq_enqueue_flow_single(fsw, pkt);
2292
2293 /* set flow advisory if needed */
2294 if (__improbable(!flowadv_is_set &&
2295 ((err == EQFULL || err == EQSUSPENDED) &&
2296 flowadv_cap))) {
2297 flowadv_is_set = na_flowadv_set(
2298 flow_get_na(fsw, fe), flow_adv_idx,
2299 flow_adv_token);
2300 }
2301 }
2302 ASSERT(c == cnt);
2303 ASSERT(b == bytes);
2304 }
2305
2306 /* notify flow advisory event */
2307 if (__improbable(flowadv_is_set)) {
2308 struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
2309 if (__probable(r)) {
2310 na_flowadv_event(r);
2311 SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
2312 "%s(%d) notified of flow update",
2313 sk_proc_name_address(current_proc()),
2314 sk_proc_pid(current_proc()));
2315 }
2316 }
2317 }
2318
2319 /*
2320 * Logical link code path
2321 */
2322 static void
2323 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2324 boolean_t chain, uint32_t cnt, uint32_t bytes)
2325 {
2326 struct __kern_packet *pkt, *tail;
2327 flowadv_idx_t flow_adv_idx;
2328 bool flowadv_is_set = false;
2329 bool flowadv_cap;
2330 flowadv_token_t flow_adv_token;
2331 uint32_t flowctl = 0, dropped = 0;
2332 int err;
2333
2334 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2335 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2336
2337 /*
2338 * Not supporting chains for now
2339 */
2340 VERIFY(!chain);
2341 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2342 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2343 KPKTQ_INIT(&fe->fe_tx_pktq);
2344 if (pkt == NULL) {
2345 return;
2346 }
2347 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2348 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2349 flow_adv_token = pkt->pkt_flow_token;
2350
2351 err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
2352 &flowctl, &dropped);
2353
2354 if (__improbable(err != 0)) {
2355 /* set flow advisory if needed */
2356 if (flowctl > 0 && flowadv_cap) {
2357 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2358 flow_adv_idx, flow_adv_token);
2359
2360 /* notify flow advisory event */
2361 if (flowadv_is_set) {
2362 struct __kern_channel_ring *r =
2363 fsw_flow_get_tx_ring(fsw, fe);
2364 if (__probable(r)) {
2365 na_flowadv_event(r);
2366 SK_DF(SK_VERB_FLOW_ADVISORY |
2367 SK_VERB_TX,
2368 "%s(%d) notified of flow update",
2369 sk_proc_name_address(current_proc()),
2370 sk_proc_pid(current_proc()));
2371 }
2372 }
2373 }
2374 if (dropped > 0) {
2375 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
2376 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2377 dropped);
2378 }
2379 }
2380 }
2381
2382 static void
2383 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2384 {
2385 #pragma unused(fsw)
2386 /* finalize here; no more changes to buflets after classq */
2387 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2388 kern_packet_t ph = SK_PTR_ENCODE(pkt,
2389 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2390 int err = __packet_finalize(ph);
2391 VERIFY(err == 0);
2392 }
2393 }
2394
2395 static bool
2396 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2397 {
2398 struct flow_route *fr = fe->fe_route;
2399 int err;
2400
2401 ASSERT(fr != NULL);
2402
2403 if (__improbable(!dp_flow_route_process(fsw, fe))) {
2404 return false;
2405 }
2406
2407 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
2408 _fsw_error35_handler, 1, fr, NULL, NULL);
2409 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
2410 _fsw_error36_handler, 1, fr, NULL);
2411
2412 /*
2413 * See if we need to resolve the flow route; note the test against
2414 * fr_flags here is done without any lock for performance. Thus
2415 * it's possible that we race against the thread performing route
2416 * event updates for a packet (which is OK). In any case we should
2417 * not have any assertion on fr_flags value(s) due to the lack of
2418 * serialization.
2419 */
2420 if (fr->fr_flags & FLOWRTF_RESOLVED) {
2421 goto frame;
2422 }
2423
2424 struct __kern_packet *pkt, *tpkt;
2425 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2426 err = fsw->fsw_resolve(fsw, fr, pkt);
2427 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
2428 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
2429 /*
2430 * If resolver returns EJUSTRETURN then we drop the pkt as the
2431 * resolver should have converted the pkt into mbuf (or
2432 * detached the attached mbuf from pkt) and added it to the
2433 * llinfo queue. If we do have a cached llinfo, then proceed
2434 * to using it even though it may be stale (very unlikely)
2435 * while the resolution is in progress.
2436 * Otherwise, any other error results in dropping pkt.
2437 */
2438 if (err == EJUSTRETURN) {
2439 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2440 pp_free_packet_single(pkt);
2441 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
2442 continue;
2443 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
2444 /* use existing llinfo */
2445 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
2446 } else if (err != 0) {
2447 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2448 pp_free_packet_single(pkt);
2449 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
2450 continue;
2451 }
2452 }
2453
2454 frame:
2455 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2456 if (fsw->fsw_frame != NULL) {
2457 fsw->fsw_frame(fsw, fr, pkt);
2458 }
2459 }
2460
2461 return true;
2462 }
2463
2464 static void
2465 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2466 {
2467 struct __kern_packet *pkt, *tpkt;
2468 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2469 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2470 /* listener is only allowed TCP RST */
2471 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
2472 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
2473 fsw_flow_abort_tcp(fsw, fe, pkt);
2474 } else {
2475 char *addr;
2476 MD_BUFLET_ADDR_ABS(pkt, addr);
2477 SK_ERR("listener flow sends non-RST packet %s",
2478 sk_dump(sk_proc_name_address(current_proc()),
2479 addr, pkt->pkt_length, 128, NULL, 0));
2480 }
2481 pp_free_packet_single(pkt);
2482 }
2483 }
2484
2485 static void
2486 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
2487 volatile uint64_t *rt_ts, ifnet_t ifp)
2488 {
2489 struct timespec now;
2490 uint64_t now_nsec = 0;
2491
2492 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
2493 nanouptime(&now);
2494 net_timernsec(&now, &now_nsec);
2495 pkt->pkt_timestamp = now_nsec;
2496 }
2497 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
2498
2499 /*
2500 * If the packet service class is not background,
2501 * update the timestamps on the interface, as well as
2502 * the ones in nexus-wide advisory to indicate recent
2503 * activity on a foreground flow.
2504 */
2505 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
2506 ifp->if_fg_sendts = (uint32_t)_net_uptime;
2507 if (fg_ts != NULL) {
2508 *fg_ts = _net_uptime;
2509 }
2510 }
2511 if (pkt->pkt_pflags & PKT_F_REALTIME) {
2512 ifp->if_rt_sendts = (uint32_t)_net_uptime;
2513 if (rt_ts != NULL) {
2514 *rt_ts = _net_uptime;
2515 }
2516 }
2517 }
2518
2519 /*
2520 * TODO:
2521 * We can check the flow entry as well to only allow chain enqueue
2522 * on flows matching a certain criteria.
2523 */
2524 static boolean_t
2525 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, struct flow_entry *fe)
2526 {
2528 return fsw_chain_enqueue != 0 &&
2529 fsw->fsw_ifp->if_output_netem == NULL &&
2530 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
2531 fe->fe_qset == NULL;
2532 }
2533
2534 void
2535 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2536 {
2537 struct pktq dropped_pkts;
2538 boolean_t chain;
2539 uint32_t cnt = 0, bytes = 0;
2540 volatile struct sk_nexusadv *nxadv = NULL;
2541 volatile uint64_t *fg_ts = NULL;
2542 volatile uint64_t *rt_ts = NULL;
2543
2544 KPKTQ_INIT(&dropped_pkts);
2545 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
2546 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
2547 dp_listener_flow_tx_process(fsw, fe);
2548 return;
2549 }
2550 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
2551 SK_RDERR(5, "Tx route bad");
2552 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
2553 KPKTQ_LEN(&fe->fe_tx_pktq));
2554 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
2555 goto done;
2556 }
2557 chain = fsw_chain_enqueue_enabled(fsw, fe);
2558 if (chain) {
2559 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
2560 if (nxadv != NULL) {
2561 fg_ts = &nxadv->nxadv_fg_sendts;
2562 rt_ts = &nxadv->nxadv_rt_sendts;
2563 }
2564 }
2565 struct __kern_packet *pkt, *tpkt;
2566 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2567 int err = flow_pkt_track(fe, pkt, false);
2568 if (__improbable(err != 0)) {
2569 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
2570 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
2571 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2572 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2573 continue;
2574 }
2575
2576 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
2577 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
2578
2579 /* set AQM related values for outgoing packet */
2580 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
2581 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
2582 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
2583 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
2584 } else {
2585 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
2586 }
2587 pkt->pkt_pflags |= PKT_F_FLOW_ID;
2588
2589 /*
2590 * The same code is exercised per packet for the non-chain case
2591 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
2592 * re-walking the chain later.
2593 */
2594 if (chain) {
2595 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
2596 }
2597 /* mark packet tos/svc_class */
2598 fsw_qos_mark(fsw, fe, pkt);
2599
2600 tx_finalize_packet(fsw, pkt);
2601 bytes += pkt->pkt_length;
2602 cnt++;
2603 }
2604
2605 /* snoop after it's finalized */
2606 if (__improbable(pktap_total_tap_count != 0)) {
2607 fsw_snoop(fsw, fe, false);
2608 }
2609 if (fe->fe_qset != NULL) {
2610 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
2611 } else {
2612 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
2613 }
2614 done:
2615 dp_drop_pktq(fsw, &dropped_pkts);
2616 }
2617
2618 static struct flow_entry *
2619 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
2620 struct flow_entry *prev_fe, struct __kern_packet *pkt)
2621 {
2622 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
2623
2624 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
2625 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
2626 SK_ERR("%s(%d) invalid zero fragment id",
2627 sk_proc_name_address(current_proc()),
2628 sk_proc_pid(current_proc()));
2629 return NULL;
2630 }
2631
2632 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
2633 "%s(%d) continuation frag, id %u",
2634 sk_proc_name_address(current_proc()),
2635 sk_proc_pid(current_proc()),
2636 pkt->pkt_flow_ip_frag_id);
2637 if (__improbable(prev_fe == NULL ||
2638 !prev_fe->fe_tx_is_cont_frag)) {
2639 SK_ERR("%s(%d) unexpected continuation frag",
2640 sk_proc_name_address(current_proc()),
2641 sk_proc_pid(current_proc()),
2642 pkt->pkt_flow_ip_frag_id);
2643 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2644 return NULL;
2645 }
2646 if (__improbable(pkt->pkt_flow_ip_frag_id !=
2647 prev_fe->fe_tx_frag_id)) {
2648 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2649 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
2650 sk_proc_name_address(current_proc()),
2651 sk_proc_pid(current_proc()),
2652 pkt->pkt_flow_ip_frag_id,
2653 prev_fe->fe_tx_frag_id);
2654 return NULL;
2655 }
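/*
 * Fragment id matches the previous first fragment, so this packet
 * continues that datagram and prev_fe can be reused without another
 * flow lookup.
 */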
2656
2657 return prev_fe;
2658 }
2659
2660 static struct flow_entry *
2661 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
2662 struct flow_entry *prev_fe)
2663 {
2664 struct flow_entry *fe;
2665
2666 fe = lookup_flow_with_key(fsw, pkt, false, prev_fe);
2667 if (__improbable(fe == NULL)) {
2668 goto done;
2669 }
2670
2671 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
2672 SK_RDERR(5, "Tx flow torn down");
2673 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
2674 flow_entry_release(&fe);
2675 goto done;
2676 }
2677
2678 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
2679 null_func);
2680
2681 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
2682 uuid_string_t flow_id_str, pkt_id_str;
2683 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
2684 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
2685 SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
2686 flow_entry_release(&fe);
2687 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
2688 }
2689
2690 done:
2691 return fe;
2692 }
2693
2694 static inline void
2695 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2696 {
2697 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
2698 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
2699
2700 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
2701 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
2702
2703 /* flow related processing (default, agg, etc.) */
2704 fe->fe_tx_process(fsw, fe);
2705
2706 KPKTQ_FINI(&fe->fe_tx_pktq);
2707 }
2708
2709 #if SK_LOG
2710 static void
2711 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
2712 {
2713 char *pkt_buf;
2714 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
2715 SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
2716 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
2717 pkt->pkt_length, 128, NULL, 0));
2718 }
2719 #else /* !SK_LOG */
2720 #define dp_tx_log_pkt(...)
2721 #endif /* !SK_LOG */
2722
2723 static void
2724 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
2725 {
2726 struct __kern_packet *spkt, *pkt;
2727 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2728 struct flow_entry *fe, *prev_fe;
2729 struct pktq dropped_pkts, dpktq;
2730 struct nexus_adapter *dev_na;
2731 struct kern_pbufpool *dev_pp;
2732 struct ifnet *ifp;
2733 sa_family_t af;
2734 uint32_t n_pkts, n_flows = 0;
2735
2736 int err;
2737 KPKTQ_INIT(&dpktq);
2738 KPKTQ_INIT(&dropped_pkts);
2739 n_pkts = KPKTQ_LEN(spktq);
2740
2741 FSW_RLOCK(fsw);
2742 if (__improbable(FSW_QUIESCED(fsw))) {
2743 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
2744 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
2745 KPKTQ_CONCAT(&dropped_pkts, spktq);
2746 goto done;
2747 }
2748 dev_na = fsw->fsw_dev_ch->ch_na;
2749 if (__improbable(dev_na == NULL)) {
2750 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
2751 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
2752 KPKTQ_CONCAT(&dropped_pkts, spktq);
2753 goto done;
2754 }
2755 /*
2756 * fsw_ifp should still be valid at this point. If fsw is detached
2757 * after fsw_lock is released, this ifp will remain valid and
2758 * netif_transmit() will behave properly even if the ifp is in
2759 * detached state.
2760 */
2761 ifp = fsw->fsw_ifp;
2762
2763 /* batch allocate enough packets */
2764 dev_pp = na_kr_get_pp(dev_na, NR_TX);
2765
2766 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
2767 NULL, SKMEM_NOSLEEP);
2768 #if DEVELOPMENT || DEBUG
2769 if (__probable(err != ENOMEM)) {
2770 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
2771 }
2772 #endif /* DEVELOPMENT || DEBUG */
2773 if (__improbable(err == ENOMEM)) {
2774 ASSERT(KPKTQ_EMPTY(&dpktq));
2775 KPKTQ_CONCAT(&dropped_pkts, spktq);
2776 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
2777 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
2778 goto done;
2779 } else if (__improbable(err == EAGAIN)) {
2780 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
2781 (n_pkts - KPKTQ_LEN(&dpktq)));
2782 FSW_STATS_ADD(FSW_STATS_DROP,
2783 (n_pkts - KPKTQ_LEN(&dpktq)));
2784 }
2785
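/*
 * On a partial (EAGAIN) allocation, the copy loop below simply stops
 * once the device-pool packets run out; the shortfall has already
 * been accounted as drops above.
 */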
2786 n_pkts = KPKTQ_LEN(&dpktq);
2787 prev_fe = NULL;
2788 KPKTQ_FOREACH(spkt, spktq) {
2789 if (n_pkts == 0) {
2790 break;
2791 }
2792 --n_pkts;
2793
2794 KPKTQ_DEQUEUE(&dpktq, pkt);
2795 ASSERT(pkt != NULL);
2796 err = dp_copy_to_dev(fsw, spkt, pkt);
2797 if (__improbable(err != 0)) {
2798 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2799 continue;
2800 }
2801
2802 af = fsw_ip_demux(fsw, pkt);
2803 if (__improbable(af == AF_UNSPEC)) {
2804 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
2805 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
2806 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2807 continue;
2808 }
2809
2810 err = flow_pkt_classify(pkt, ifp, af, false);
2811 if (__improbable(err != 0)) {
2812 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
2813 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
2814 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2815 continue;
2816 }
2817
2818 if (__improbable(pkt->pkt_flow_ip_is_frag &&
2819 !pkt->pkt_flow_ip_is_first_frag)) {
2820 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
2821 if (__probable(fe != NULL)) {
2822 flow_entry_retain(fe);
2823 goto flow_batch;
2824 } else {
2825 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2826 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2827 continue;
2828 }
2829 }
2830
2831 fe = tx_lookup_flow(fsw, pkt, prev_fe);
2832 if (__improbable(fe == NULL)) {
2833 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
2834 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2835 prev_fe = NULL;
2836 continue;
2837 }
2838 flow_batch:
2839 tx_flow_batch_packet(&fes, fe, pkt);
2840 prev_fe = fe;
2841 }
2842
2843 struct flow_entry *tfe = NULL;
2844 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
2845 tx_flow_process(fsw, fe);
2846 TAILQ_REMOVE(&fes, fe, fe_tx_link);
2847 fe->fe_tx_is_cont_frag = false;
2848 fe->fe_tx_frag_id = 0;
2849 flow_entry_release(&fe);
2850 n_flows++;
2851 }
2852
2853 done:
2854 FSW_RUNLOCK(fsw);
2855 if (n_flows > 0) {
2856 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
2857 }
2858 dp_drop_pktq(fsw, &dropped_pkts);
2859 KPKTQ_FINI(&dropped_pkts);
2860 KPKTQ_FINI(&dpktq);
2861 }
2862
2863 static inline void
2864 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2865 struct proc *p)
2866 {
2867 #pragma unused(p)
2868 uint32_t total_pkts = 0, total_bytes = 0;
2869
2870 for (;;) {
2871 struct pktq pktq;
2872 KPKTQ_INIT(&pktq);
2873 uint32_t n_bytes;
2874 fsw_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
2875 if (n_bytes == 0) {
2876 break;
2877 }
2878 total_pkts += KPKTQ_LEN(&pktq);
2879 total_bytes += n_bytes;
2880
2881 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
2882 dp_rx_pktq(fsw, &pktq);
2883 } else {
2884 fsw_dev_input_netem_enqueue(fsw, &pktq);
2885 }
2886 KPKTQ_FINI(&pktq);
2887 }
2888
2889 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
2890 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
2891 uint32_t, total_bytes);
2892
2893 /* compute mitigation rate for delivered traffic */
2894 if (__probable(r->ckr_netif_mit_stats != NULL)) {
2895 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
2896 }
2897 }
2898
2899 static inline void
2900 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2901 struct proc *p)
2902 {
2903 #pragma unused(p)
2904 static packet_trace_id_t trace_id = 0;
2905 uint32_t total_pkts = 0, total_bytes = 0;
2906
2907 for (;;) {
2908 struct pktq pktq;
2909 KPKTQ_INIT(&pktq);
2910 uint32_t n_bytes;
2911 fsw_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes);
2912 if (n_bytes == 0) {
2913 break;
2914 }
2915 total_pkts += KPKTQ_LEN(&pktq);
2916 total_bytes += n_bytes;
2917
2918 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
2919 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, KPKTQ_FIRST(&pktq)->pkt_trace_id);
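/* only the lead packet of each dequeued batch carries a trace id */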
2920
2921 dp_tx_pktq(fsw, &pktq);
2922 dp_free_pktq(fsw, &pktq);
2923 KPKTQ_FINI(&pktq);
2924 }
2925
2926 kr_update_stats(r, total_pkts, total_bytes);
2927
2928 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
2929 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
2930 uint32_t, total_bytes);
2931 }
2932
2933 void
2934 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2935 struct proc *p)
2936 {
2937 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
2938
2939 ASSERT(sk_is_sync_protected());
2940 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
2941 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
2942
2943 if (vpna->vpna_nx_port == FSW_VP_DEV) {
2944 fsw_dev_ring_flush(fsw, r, p);
2945 } else {
2946 fsw_user_ring_flush(fsw, r, p);
2947 }
2948 }
2949
2950 int
2951 fsw_dp_ctor(struct nx_flowswitch *fsw)
2952 {
2953 uint32_t fe_cnt = fsw_fe_table_size;
2954 uint32_t fob_cnt = fsw_flow_owner_buckets;
2955 uint32_t frb_cnt = fsw_flow_route_buckets;
2956 uint32_t frib_cnt = fsw_flow_route_id_buckets;
2957 struct kern_nexus *nx = fsw->fsw_nx;
2958 char name[64];
2959 int error = 0;
2960
2961 /* just in case */
2962 if (fe_cnt == 0) {
2963 fe_cnt = NX_FSW_FE_TABLESZ;
2964 ASSERT(fe_cnt != 0);
2965 }
2966 if (fob_cnt == 0) {
2967 fob_cnt = NX_FSW_FOB_HASHSZ;
2968 ASSERT(fob_cnt != 0);
2969 }
2970 if (frb_cnt == 0) {
2971 frb_cnt = NX_FSW_FRB_HASHSZ;
2972 ASSERT(frb_cnt != 0);
2973 }
2974 if (frib_cnt == 0) {
2975 frib_cnt = NX_FSW_FRIB_HASHSZ;
2976 ASSERT(frib_cnt != 0);
2977 }
2978
2979 /* make sure fe_cnt is a power of two, else round up */
2980 if ((fe_cnt & (fe_cnt - 1)) != 0) {
2981 fe_cnt--;
2982 fe_cnt |= (fe_cnt >> 1);
2983 fe_cnt |= (fe_cnt >> 2);
2984 fe_cnt |= (fe_cnt >> 4);
2985 fe_cnt |= (fe_cnt >> 8);
2986 fe_cnt |= (fe_cnt >> 16);
2987 fe_cnt++;
2988 }
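/*
 * The decrement-and-smear sequence above ORs the top set bit into
 * every lower bit position, so the final increment yields the next
 * power of two; e.g. fe_cnt = 5000 rounds up to 8192, while values
 * that are already powers of two skip this block entirely.
 */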
2989
2990 /* make sure frb_cnt is a power of two, else round up */
2991 if ((frb_cnt & (frb_cnt - 1)) != 0) {
2992 frb_cnt--;
2993 frb_cnt |= (frb_cnt >> 1);
2994 frb_cnt |= (frb_cnt >> 2);
2995 frb_cnt |= (frb_cnt >> 4);
2996 frb_cnt |= (frb_cnt >> 8);
2997 frb_cnt |= (frb_cnt >> 16);
2998 frb_cnt++;
2999 }
3000
3001 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
3002 &nexus_lock_attr);
3003 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
3004 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
3005 TAILQ_INIT(&fsw->fsw_linger_head);
3006
3007 (void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
3008 error = nx_advisory_alloc(nx, name,
3009 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
3010 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
3011 if (error != 0) {
3012 fsw_dp_dtor(fsw);
3013 return error;
3014 }
3015
3016 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
3017 if (fsw->fsw_flow_mgr == NULL) {
3018 fsw_dp_dtor(fsw);
3019 return ENOMEM;
3020 }
3021
3022 flow_mgr_setup_host_flow(fsw->fsw_flow_mgr, fsw);
3023
3024 /* generic name; will be customized upon ifattach */
3025 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
3026 FSW_REAP_THREADNAME, name, "");
3027
3028 if (kernel_thread_start(fsw_reap_thread_func, fsw,
3029 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
3030 panic_plain("%s: can't create thread", __func__);
3031 /* NOTREACHED */
3032 __builtin_unreachable();
3033 }
3034 /* this must not fail */
3035 VERIFY(fsw->fsw_reap_thread != NULL);
3036
3037 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
3038
3039
3040 return error;
3041 }
3042
3043 void
3044 fsw_dp_dtor(struct nx_flowswitch *fsw)
3045 {
3046 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
3047 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
3048 uint32_t i = 0;
3049
3050 nx_advisory_free(fsw->fsw_nx);
3051
3052 if (fsw->fsw_reap_thread != THREAD_NULL) {
3053 /* signal thread to begin self-termination */
3054 lck_mtx_lock(&fsw->fsw_reap_lock);
3055 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
3056
3057 /*
3058 * And wait for thread to terminate; use another
3059 * wait channel here other than fsw_reap_flags to
3060 * make it more explicit. In the event the reaper
3061 * thread misses a wakeup, we'll try again once
3062 * every second (except for the first time).
3063 */
3064 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
3065 uint64_t t = 0;
3066
3067 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
3068 clock_absolutetime_interval_to_deadline(t, &t);
3069 ASSERT(t != 0);
3070
3071 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
3072 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
3073 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3074 }
3075 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
3076 THREAD_UNINT, t);
3077 lck_mtx_unlock(&fsw->fsw_reap_lock);
3078 thread_block(THREAD_CONTINUE_NULL);
3079 lck_mtx_lock(&fsw->fsw_reap_lock);
3080 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
3081 }
3082 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
3083 lck_mtx_unlock(&fsw->fsw_reap_lock);
3084 fsw->fsw_reap_thread = THREAD_NULL;
3085 }
3086
3087 /* free any remaining flow entries in the linger list */
3088 fsw_linger_purge(fsw);
3089
3090 if (fsw->fsw_flow_mgr != NULL) {
3091 flow_mgr_teardown_host_flow(fsw->fsw_flow_mgr);
3092 flow_mgr_destroy(fsw->fsw_flow_mgr);
3093 fsw->fsw_flow_mgr = NULL;
3094 }
3095
3096 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
3097 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
3098 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
3099 }
3100
3101 void
3102 fsw_linger_insert(struct flow_entry *fe)
3103 {
3104 struct nx_flowswitch *fsw = fe->fe_fsw;
3105 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3106 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
3107 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
3108 fe->fe_flags, FLOWENTF_BITS);
3109
3110 net_update_uptime();
3111
3112 ASSERT(flow_entry_refcnt(fe) >= 1);
3113 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
3114 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
3115 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
3116 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
3117 ASSERT(fe->fe_linger_wait != 0);
3118 fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
3119 atomic_bitset_32(&fe->fe_flags, FLOWENTF_LINGERING);
3120
3121 lck_mtx_lock_spin(&fsw->fsw_linger_lock);
3122 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
3123 fsw->fsw_linger_cnt++;
3124 VERIFY(fsw->fsw_linger_cnt != 0);
3125 lck_mtx_unlock(&fsw->fsw_linger_lock);
3126
3127 fsw_reap_sched(fsw);
3128 }
3129
3130 static void
3131 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
3132 struct flow_entry *fe)
3133 {
3134 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3135 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
3136 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
3137 fe->fe_flags, FLOWENTF_BITS);
3138
3139 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
3140 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
3141 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
3142 atomic_bitclear_32(&fe->fe_flags, FLOWENTF_LINGERING);
3143
3144 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
3145 flow_entry_release(&fe);
3146 }
3147
3148 static void
3149 fsw_linger_remove(struct flow_entry *fe)
3150 {
3151 struct nx_flowswitch *fsw = fe->fe_fsw;
3152
3153 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
3154
3155 fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
3156 VERIFY(fsw->fsw_linger_cnt != 0);
3157 fsw->fsw_linger_cnt--;
3158 }
3159
3160 void
3161 fsw_linger_purge(struct nx_flowswitch *fsw)
3162 {
3163 struct flow_entry *fe, *tfe;
3164
3165 lck_mtx_lock(&fsw->fsw_linger_lock);
3166 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
3167 fsw_linger_remove(fe);
3168 }
3169 ASSERT(fsw->fsw_linger_cnt == 0);
3170 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
3171 lck_mtx_unlock(&fsw->fsw_linger_lock);
3172 }
3173
3174 void
3175 fsw_reap_sched(struct nx_flowswitch *fsw)
3176 {
3177 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
3178 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
3179 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
3180 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
3181 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3182 }
3183 lck_mtx_unlock(&fsw->fsw_reap_lock);
3184 }
3185
3186 __attribute__((noreturn))
3187 static void
3188 fsw_reap_thread_func(void *v, wait_result_t w)
3189 {
3190 #pragma unused(w)
3191 struct nx_flowswitch *fsw = v;
3192
3193 ASSERT(fsw->fsw_reap_thread == current_thread());
3194 thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
3195
3196 net_update_uptime();
3197
3198 lck_mtx_lock(&fsw->fsw_reap_lock);
3199 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
3200 (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
3201 lck_mtx_unlock(&fsw->fsw_reap_lock);
3202 thread_block_parameter(fsw_reap_thread_cont, fsw);
3203 /* NOTREACHED */
3204 __builtin_unreachable();
3205 }
3206
3207 __attribute__((noreturn))
3208 static void
3209 fsw_reap_thread_cont(void *v, wait_result_t wres)
3210 {
3211 struct nx_flowswitch *fsw = v;
3212 boolean_t low;
3213 uint64_t t = 0;
3214
3215 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
3216
3217 lck_mtx_lock(&fsw->fsw_reap_lock);
3218 if (__improbable(wres == THREAD_INTERRUPTED ||
3219 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
3220 goto terminate;
3221 }
3222
3223 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
3224 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
3225 lck_mtx_unlock(&fsw->fsw_reap_lock);
3226
3227 net_update_uptime();
3228
3229 /* prevent detach from happening while we're here */
3230 if (!fsw_detach_barrier_add(fsw)) {
3231 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
3232 t = 0;
3233 } else {
3234 uint32_t fe_nonviable, fe_freed, fe_aborted;
3235 uint32_t fr_freed, fr_resid = 0;
3236 struct ifnet *ifp = fsw->fsw_ifp;
3237 uint64_t i = FSW_REAP_IVAL;
3238 uint64_t now = _net_uptime;
3239 uint64_t last;
3240
3241 ASSERT(fsw->fsw_ifp != NULL);
3242
3243 /*
3244 * Pass 1: process any deferred {withdrawn,nonviable} requests.
3245 */
3246 fe_nonviable = fsw_process_deferred(fsw);
3247
3248 /*
3249 * Pass 2: remove any expired lingering flows.
3250 */
3251 fe_freed = fsw_process_linger(fsw, &fe_aborted);
3252
3253 /*
3254 * Pass 3: prune idle flow routes.
3255 */
3256 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
3257 ifp, &fr_resid);
3258
3259 /*
3260 * Pass 4: prune flow table
3261 *
3262 */
3263 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
3264
3265 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
3266 "fe_aborted %u fr_freed %u/%u",
3267 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
3268 (fe_nonviable + fsw->fsw_pending_nonviable),
3269 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
3270 (fr_freed + fr_resid));
3271
3272 /* see if VM memory level is critical */
3273 low = skmem_lowmem_check();
3274
3275 /*
3276 * If things appear to be idle, we can prune away cached
3277 * objects that have fallen out of the working sets (this
3278 * is different than purging). Every once in a while, we
3279 * also purge the caches. Note that this is done across
3280 * all flowswitch instances, and so we limit this to no
3281 * more than once every FSW_REAP_SK_THRES seconds.
3282 */
3283 atomic_get_64(last, &fsw_reap_last);
3284 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
3285 atomic_test_set_64(&fsw_reap_last, last, now)) {
3286 fsw_purge_cache(fsw, low);
3287
3288 /* increase sleep interval if idle */
3289 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
3290 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
3291 i <<= 3;
3292 }
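/* fully idle: stretch the reap interval to 8x the base (i <<= 3) */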
3293 } else if (last == 0) {
3294 atomic_set_64(&fsw_reap_last, now);
3295 }
3296
3297 /*
3298 * Additionally, run thru the list of channels and prune
3299 * or purge away cached objects on "idle" channels. This
3300 * check is rate limited to no more than once every
3301 * FSW_DRAIN_CH_THRES seconds.
3302 */
3303 last = fsw->fsw_drain_channel_chk_last;
3304 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
3305 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
3306 fsw->fsw_flow_mgr->fm_name);
3307
3308 fsw->fsw_drain_channel_chk_last = now;
3309 fsw_drain_channels(fsw, now, low);
3310 } else if (__improbable(last == 0)) {
3311 fsw->fsw_drain_channel_chk_last = now;
3312 }
3313
3314 /*
3315 * Finally, invoke the interface's reap callback to
3316 * tell it to prune or purge away cached objects if
3317 * it is idle. This check is rate limited to no more
3318 * than once every FSW_REAP_IF_THRES seconds.
3319 */
3320 last = fsw->fsw_drain_netif_chk_last;
3321 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
3322 ASSERT(fsw->fsw_nifna != NULL);
3323
3324 if (ifp->if_na_ops != NULL &&
3325 ifp->if_na_ops->ni_reap != NULL) {
3326 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
3327 fsw->fsw_flow_mgr->fm_name);
3328 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
3329 FSW_REAP_IF_THRES, low);
3330 }
3331
3332 fsw->fsw_drain_netif_chk_last = now;
3333 } else if (__improbable(last == 0)) {
3334 fsw->fsw_drain_netif_chk_last = now;
3335 }
3336
3337 /* emit periodic interface stats ktrace */
3338 last = fsw->fsw_reap_last;
3339 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
3340 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
3341 ifp->if_data.ifi_ibytes * 8,
3342 ifp->if_data.ifi_opackets,
3343 ifp->if_data.ifi_obytes * 8);
3344
3345 fsw->fsw_reap_last = now;
3346 } else if (__improbable(last == 0)) {
3347 fsw->fsw_reap_last = now;
3348 }
3349
3350 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
3351 clock_absolutetime_interval_to_deadline(t, &t);
3352 ASSERT(t != 0);
3353
3354 /* allow any pending detach to proceed */
3355 fsw_detach_barrier_remove(fsw);
3356 }
3357
3358 lck_mtx_lock(&fsw->fsw_reap_lock);
3359 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
3360 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
3361 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
3362 THREAD_UNINT, t);
3363 lck_mtx_unlock(&fsw->fsw_reap_lock);
3364 thread_block_parameter(fsw_reap_thread_cont, fsw);
3365 /* NOTREACHED */
3366 __builtin_unreachable();
3367 } else {
3368 terminate:
3369 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
3370 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
3371 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
3372 /*
3373 * And signal any thread waiting for us to terminate;
3374 * we use a wait channel other than fsw_reap_flags to make
3375 * it more explicit.
3376 */
3377 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
3378 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
3379 }
3380 lck_mtx_unlock(&fsw->fsw_reap_lock);
3381
3382 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
3383
3384 /* for the extra refcnt from kernel_thread_start() */
3385 thread_deallocate(current_thread());
3386 /* this is the end */
3387 thread_terminate(current_thread());
3388 /* NOTREACHED */
3389 __builtin_unreachable();
3390 }
3391
3392 /* must never get here */
3393 VERIFY(0);
3394 /* NOTREACHED */
3395 __builtin_unreachable();
3396 }
3397
3398 static void
3399 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
3400 {
3401 struct kern_nexus *nx = fsw->fsw_nx;
3402
3403 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
3404 FSW_RLOCK(fsw);
3405
3406 /* uncrustify doesn't handle C blocks properly */
3407 /* BEGIN IGNORE CODESTYLE */
3408 nx_port_foreach(nx, ^(nexus_port_t p) {
3409 struct nexus_adapter *na = nx_port_get_na(nx, p);
3410 if (na == NULL || na->na_work_ts == 0 ||
3411 (now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
3412 return;
3413 }
3414
3415 /*
3416 * If NA has been inactive for some time (twice the drain
3417 * threshold), we clear the work timestamp to temporarily skip
3418 * this channel until it's active again. Purging cached objects
3419 * can be expensive since we'd need to allocate and construct
3420 * them again, so we do it only when necessary.
3421 */
3422 boolean_t purge;
3423 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
3424 na->na_work_ts = 0;
3425 purge = TRUE;
3426 } else {
3427 purge = FALSE;
3428 }
3429
3430 na_drain(na, purge); /* purge/prune caches */
3431 });
3432 /* END IGNORE CODESTYLE */
3433
3434 FSW_RUNLOCK(fsw);
3435 }
3436
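/*
 * Reap the flowswitch-related object caches.  A full purge happens
 * either under memory pressure (@low) or once every
 * fsw_flow_purge_thresh invocations; otherwise the caches are
 * merely pruned.
 */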
static void
fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
{
#pragma unused(fsw)
	uint64_t o = atomic_add_64_ov(&fsw_want_purge, 1);
	uint32_t p = fsw_flow_purge_thresh;
	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));

	SK_DF(SK_VERB_FLOW, "%s: %s caches",
	    fsw->fsw_flow_mgr->fm_name,
	    (purge ? "purge" : "prune"));

	skmem_cache_reap_now(sk_fo_cache, purge);
	skmem_cache_reap_now(sk_fe_cache, purge);
	skmem_cache_reap_now(sk_fab_cache, purge);
	skmem_cache_reap_now(flow_route_cache, purge);
	skmem_cache_reap_now(flow_stats_cache, purge);
	eventhandler_reap_caches(purge);
	netns_reap_caches(purge);
	skmem_reap_caches(purge);
	necp_client_reap_caches(purge);

	if (if_is_fsw_transport_netagent_enabled() && purge) {
		mbuf_drain(FALSE);
	}
}

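/*
 * Mark a flow as pending-nonviable in response to the interface
 * entering low power mode; the actual teardown is committed by
 * fsw_process_deferred() below.
 */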
static void
fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/* when the interface is in low power mode, the flow is nonviable */
	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
		atomic_add_32(&fsw->fsw_pending_nonviable, 1);
	}
}

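/*
 * Commit pending withdraw/nonviable requests across all flow owner
 * buckets, notifying NECP outside the bucket locks; returns the
 * number of flows torn down.
 */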
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests. We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

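		/*
		 * With the bucket lock held, walk each flow owner and its
		 * flow entries.  Keep this pass short: while we hold the
		 * lock, the data path is blocked for any flow hashed to
		 * this bucket.
		 */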
		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* handle low power mode first (edge trigger) */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					atomic_bitclear_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* queue it; the rest happens outside the bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}

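/*
 * Process the linger list: emit TCP RSTs for flows that still owe the
 * peer an abort, free entries whose linger period has expired, and put
 * the remainder back on the list; returns the number of entries freed.
 */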
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = _net_uptime;
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock. Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

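	/*
	 * Every entry on the local list has already been torn down and
	 * destroyed; all that's left is to abort the peer if the TCP
	 * tracker asks for it, and to free the entry once its linger
	 * period lapses.
	 */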
	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			fsw_flow_abort_tcp(fsw, fe, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS);
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;
		}
		++i;
	}
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}

/*
 * Send a RST for a given TCP flow; use @pkt as a template if given,
 * otherwise derive the addresses, ports and sequence number from the
 * flow entry itself.
 */
void
fsw_flow_abort_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	struct flow_track *src, *dst;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint16_t len, tlen;
	struct mbuf *m;
	uint8_t ipver;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);

	src = &fe->fe_ltrack;
	dst = &fe->fe_rtrack;

	if (pkt != NULL) {
		ipver = pkt->pkt_flow_ip_ver;
	} else {
		ipver = fe->fe_key.fk_ipver;
	}

	tlen = sizeof(struct tcphdr);
	if (ipver == IPVERSION) {
		len = sizeof(struct ip) + tlen;
	} else {
		ASSERT(ipver == IPV6_VERSION);
		len = sizeof(struct ip6_hdr) + tlen;
	}

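	/*
	 * m_gethdr(M_WAITOK, ...) is expected to block for memory rather
	 * than fail, which the VERIFY below asserts.
	 */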
	m = m_gethdr(M_WAITOK, MT_HEADER);
	VERIFY(m != NULL);

	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	m->m_data += max_linkhdr;	/* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m->m_data, len);

	if (ipver == IPVERSION) {
		ip = mtod(m, struct ip *);

		/* IP header fields included in the TCP checksum */
		ip->ip_p = IPPROTO_TCP;
		ip->ip_len = htons(tlen);
		if (pkt == NULL) {
			ip->ip_src = fe->fe_key.fk_src4;
			ip->ip_dst = fe->fe_key.fk_dst4;
		} else {
			ip->ip_src = pkt->pkt_flow_ipv4_src;
			ip->ip_dst = pkt->pkt_flow_ipv4_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);

		/* IP header fields included in the TCP checksum */
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons(tlen);
		if (pkt == NULL) {
			ip6->ip6_src = fe->fe_key.fk_src6;
			ip6->ip6_dst = fe->fe_key.fk_dst6;
		} else {
			ip6->ip6_src = pkt->pkt_flow_ipv6_src;
			ip6->ip6_dst = pkt->pkt_flow_ipv6_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/*
	 * TCP header (fabricate a pure RST).
	 */
	if (pkt == NULL) {
		th->th_sport = fe->fe_key.fk_sport;
		th->th_dport = fe->fe_key.fk_dport;
		th->th_seq = htonl(src->fse_seqlo);	/* peer's last ACK */
		th->th_ack = 0;
		th->th_flags = TH_RST;
	} else {
		th->th_sport = pkt->pkt_flow_tcp_src;
		th->th_dport = pkt->pkt_flow_tcp_dst;
		th->th_seq = pkt->pkt_flow_tcp_seq;
		th->th_ack = pkt->pkt_flow_tcp_ack;
		th->th_flags = pkt->pkt_flow_tcp_flags;
	}
	th->th_off = (tlen >> 2);	/* header length in 32-bit words */
	th->th_win = 0;

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in_cksum(m, len);

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr), tlen);

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;

		ip6_output_setsrcifscope(m, fsw->fsw_ifp->if_index, NULL);
		ip6_output_setdstifscope(m, fsw->fsw_ifp->if_index, NULL);

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}

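/*
 * Send a QUIC stateless reset for a given flow using @token.  The
 * payload is 30 random bytes followed by the stateless reset token;
 * the first byte is fixed up to 0b01xxxxxx so that, per RFC 9000,
 * the datagram is indistinguishable from a short-header packet:
 *
 *	+-----------+------------+-----------------+---------------+
 *	| IP header | UDP header | 30 random bytes | 16-byte token |
 *	+-----------+------------+-----------------+---------------+
 */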
void
fsw_flow_abort_quic(struct flow_entry *fe, uint8_t *token)
{
	struct quic_stateless_reset {
		uint8_t ssr_header[30];
		uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
	};
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct udphdr *uh;
	struct quic_stateless_reset *qssr;
	uint16_t len, l3hlen, ulen;
	struct mbuf *m;
	unsigned int one = 1;
	int error;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);

	/* skip an all-zero token */
	bool is_zero_token = true;
	for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
		if (token[i] != 0) {
			is_zero_token = false;
			break;
		}
	}
	if (is_zero_token) {
		return;
	}

	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
	if (fe->fe_key.fk_ipver == IPVERSION) {
		l3hlen = sizeof(struct ip);
	} else {
		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
		l3hlen = sizeof(struct ip6_hdr);
	}

	len = l3hlen + ulen;

	error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
	if (error != 0) {
		return;
	}
	VERIFY(m != NULL);

	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
	m->m_data += max_linkhdr;	/* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m->m_data, len);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		ip = mtod(m, struct ip *);
		ip->ip_p = IPPROTO_UDP;
		ip->ip_len = htons(ulen);
		ip->ip_src = fe->fe_key.fk_src4;
		ip->ip_dst = fe->fe_key.fk_dst4;
		uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_nxt = IPPROTO_UDP;
		ip6->ip6_plen = htons(ulen);
		ip6->ip6_src = fe->fe_key.fk_src6;
		ip6->ip6_dst = fe->fe_key.fk_dst6;
		uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/* UDP header */
	uh->uh_sport = fe->fe_key.fk_sport;
	uh->uh_dport = fe->fe_key.fk_dport;
	uh->uh_ulen = htons(ulen);

	/* QUIC stateless reset: random bytes with the QUIC fixed bit set */
	qssr = (struct quic_stateless_reset *)(uh + 1);
	read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
	qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
	memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* UDP checksum; 0 means "no checksum", so send 0xffff */
		uh->uh_sum = in_cksum(m, len);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* UDP checksum; 0 means "no checksum", so send 0xffff */
		uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
		    ulen);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;
		ip6_output_setsrcifscope(m, fsw->fsw_ifp->if_index, NULL);
		ip6_output_setdstifscope(m, fsw->fsw_ifp->if_index, NULL);

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}

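/*
 * Per-traffic-class ifnet accounting: bump the input (rx) or output
 * (tx) packet and byte counters for the packet's service class.
 */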
__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
{
	switch (__packet_get_traffic_class(ph)) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_ibepackets++;
		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_ibkpackets++;
		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ivipackets++;
		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ivopackets++;
		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	default:
		break;
	}
}

__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
    uint32_t cnt, uint32_t len)
{
	switch (svc) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_obepackets += cnt;
		ifp->if_tc.ifi_obebytes += len;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_obkpackets += cnt;
		ifp->if_tc.ifi_obkbytes += len;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ovipackets += cnt;
		ifp->if_tc.ifi_ovibytes += len;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ovopackets += cnt;
		ifp->if_tc.ifi_ovobytes += len;
		break;
	default:
		break;
	}
}
