/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * BSD LICENSE
 *
 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of NEC Europe Ltd. nor the names of
 *     its contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <kern/sched_prim.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <net/bpf.h>
#include <net/if_ports_used.h>
#include <net/pktap.h>
#include <net/pktsched/pktsched_netem.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX            256
#define FSW_ZONE_NAME           "skywalk.nx.fsw"

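/* Convenience accessors for this flowswitch's statistics block. */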
#define FSW_STATS_VAL(x)        STATS_VAL(&fsw->fsw_stats, x)
#define FSW_STATS_INC(x)        STATS_INC(&fsw->fsw_stats, x)
#define FSW_STATS_ADD(x, n)     STATS_ADD(&fsw->fsw_stats, x, n)

static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ       31      /* some Mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ      13      /* some prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

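/*
 * Derived reaper thresholds: the shifted values below equal 32x (<< 5)
 * the reap interval, so with the default 1-second interval the SK/IF
 * reap and channel drain thresholds work out to roughly 32 seconds.
 */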
#define FSW_REAP_IVAL           (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES       (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES       (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES      (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES       1

#define RX_BUFLET_BATCH_COUNT   64      /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");

/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *    0 means force IP reassembly off
 *    1 means force IP reassembly on
 *    2 means don't force the value, use what's appropriate for this flowswitch
 */
#define FSW_IP_REASS_FORCE_OFF          0
#define FSW_IP_REASS_FORCE_ON           1
#define FSW_IP_REASS_NO_FORCE           2

uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;

static int
fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value > FSW_IP_REASS_NO_FORCE) {
			return EINVAL;
		}
		fsw_ip_reass = new_value;
	}
	return error;
}

SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");
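
/*
 * Illustrative usage from userspace (values per the tri-state above):
 *
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=1   # force reassembly on
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=2   # restore the default
 */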

#if (DEVELOPMENT || DEBUG)
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	    &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
		SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en)); \
		if ((_f) != NULL) \
			(_f)(__VA_ARGS__); \
	} \
} while (0)
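
/*
 * Each injection point is keyed by bit position in the DEVELOPMENT/DEBUG
 * only "kern.skywalk.flowswitch.fsw_inject_error" sysctl defined below.
 * Illustrative example: "sysctl -w kern.skywalk.flowswitch.fsw_inject_error=4"
 * sets bit 2, which exercises the flow-lookup failure path in
 * rx_lookup_flow().
 */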

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else /* !DEVELOPMENT && !DEBUG */
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* DEVELOPMENT || DEBUG */

static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
    struct flow_entry *);
static void fsw_reap_thread_func(void *, wait_result_t);
static void fsw_reap_thread_cont(void *, wait_result_t);
static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
static uint32_t fsw_process_deferred(struct nx_flowswitch *);
static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);

static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
    struct __kern_packet *);

static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
    uint32_t, uint32_t);

static int __fsw_dp_inited = 0;

int
fsw_dp_init(void)
{
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}

void
fsw_dp_uninit(void)
{
	if (__fsw_dp_inited) {
		flow_fini();
		flow_mgr_fini();

		__fsw_dp_inited = 0;
	}
}

static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}

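/*
 * Note: dp_drop_pktq() is deliberately a macro; the early "return"
 * below exits the *calling* function when the queue is empty, so it
 * should only be invoked in tail position by its callers.
 */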
#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
		ASSERT(_len == 0); \
		return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)

SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_inp_flowhash,
			    tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}

#if (DEVELOPMENT || DEBUG)
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}

static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
#else /* !DEVELOPMENT && !DEBUG */
#define _fsw_error35_handler(...)
#define _fsw_error36_handler(...)
#endif /* DEVELOPMENT || DEBUG */

/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet
 * from a compat netif whose attached mbuf is larger than the MTU
 * due to LRO.
 */
static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
{
	uint32_t tlen = 0;
	uint32_t splen = spkt->pkt_length - skip_l2hlen;

	if (l2hlen != 0) {
		VERIFY(skip_l2hlen == 0);
		tlen += l2hlen;
	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
		splen -= ETHER_CRC_LEN;
	}

	tlen += splen;
	*copy_len = splen;

	return tlen <= ((__packet_get_buflet_count(dph) *
	       SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp->pp_buflet_size) - headroom);
}
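
/*
 * Worked example (illustrative numbers): with two 2KB destination
 * buflets, no headroom and no prepended L2 header, validate_pkt_len()
 * above accepts a source payload of up to 4096 bytes.
 */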

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */


static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * The source and destination nexus don't share the packet pool.
	 * The sync operation here is to:
	 * - alloc a packet for the rx (dst) ring
	 * - copy data/metadata from the src packet to the dst packet
	 * - attach the alloc'd packet to the rx (dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    dpkt->pkt_qum.qum_pp->pp_buflet_size);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to the destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}

static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		if (!FSW_NETAGENT_ENABLED(fsw)) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}

SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification:
	 * Ethernet (14) + IPv6 header (40) +
	 * IPv6 fragment header (8) +
	 * TCP header with options (60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}

static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;

	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
		rx_prepare_packet_mbuf(fsw, pkt);
	}

	return pkt;
}

static struct flow_entry *
lookup_flow_with_key(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		bool match;
		key.fk_mask = FKMASK_5TUPLE;
		match = (flow_key_cmp_mask(&prev_fe->fe_key,
		    &key, &fk_mask_5tuple)) == 0;
		if (match) {
			flow_entry_retain(prev_fe);
			return prev_fe;
		}
		key.fk_mask = saved_mask;
	}

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}

static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;
	fe = lookup_flow_with_key(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down, use host fe");
		flow_entry_release(&fe);
		fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
	}

	SK_LOG_VAR(char febuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP | SK_VERB_RX,
	    "fe 0x%llx \"%s\"",
	    SK_KVA(fe), fe_as_string(fe, febuf, sizeof(febuf)));

	return fe;
}

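/*
 * Batch a packet onto its flow entry's Rx queue. The first packet to
 * land on an empty queue donates its flow_entry reference to the "fes"
 * list; references that arrive with subsequent packets are released
 * here, leaving exactly one held reference per flow until the flow is
 * processed and removed from the list.
 */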
static inline void
rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflets */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

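/*
 * Dequeue up to n_pkts_max packets from the ring slots between ckr_khead
 * and ckr_rhead, detaching each slot's metadata as we go; packets marked
 * dropped (or with zero length) are freed inline and counted against
 * FSW_STATS_DROP.
 */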
static inline void
fsw_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;

	KPKTQ_INIT(pktq);

	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0) ||
		    (pkt->pkt_length == 0))) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}

		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}

	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}

static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	membar_sync();

	r->ckr_ktail = idx_end;

	/* ensure global visibility */
	membar_sync();

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}

static void
pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
{
	ASSERT(KPKTQ_EMPTY(pktq));

	for (uint32_t i = 0; i < n_pkts; i++) {
		struct __kern_packet *pkt = pkts[i];
		ASSERT(pkt->pkt_nextpkt == NULL);
		KPKTQ_ENQUEUE(pktq, pkt);
	}
}

/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pkt_to_mbuf_chain(struct nx_flowswitch *fsw,
    struct flow_entry *fe, struct __kern_packet *pkt_chain,
    struct mbuf **m_chain, struct mbuf **m_tail, uint32_t *cnt,
    uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int one = 1;
	struct mbuf *mhead, *chain = NULL, *tail = NULL, **tailp = &chain;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *pkt;
	struct kern_pbufpool *pp;

	tot_cnt = *cnt;
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past. Note that we reset
	 * fe_rx_largest_msize below if we notice that we're under-utilizing the
	 * allocated buffers (thus disabling this batch allocation).
	 */
	if (__probable((largest = fe->fe_rx_largest_msize) != 0)) {
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &one, M_WAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, pkt_chain->pkt_qum.qum_pp);
	max_pkt_len = pp->pp_buflet_size * pp->pp_max_frags;

	for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		if (__improbable(len > max_pkt_len || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(fe->fe_rx_largest_msize < tot_len)) {
			fe->fe_rx_largest_msize = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try a blocking allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			one = 1;
			if ((error = mbuf_allocpacket(MBUF_WAITOK, tot_len,
			    &one, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
		}
		m->m_data += pad;
		m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) used less than half the batch buffer size */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		fe->fe_rx_largest_msize = 0;
	}
	pp_free_packet_chain(pkt_chain, NULL);
	*m_chain = chain;
	*m_tail = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}

/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	if (llhlen > pkt->pkt_length) {
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}

SK_NO_INLINE_ATTRIBUTE
static void
convert_compat_pkt_to_mbuf_chain(struct nx_flowswitch *fsw,
    struct flow_entry *fe, struct __kern_packet *pkt_chain,
    struct mbuf **m_chain, struct mbuf **m_tail, uint32_t *cnt,
    uint32_t *bytes)
{
#pragma unused (fe)
	struct __kern_packet *pkt;
	struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t c = 0, b = 0;

	for (pkt = pkt_chain; pkt != NULL; pkt = pkt->pkt_nextpkt) {
		m = convert_compat_pkt_to_mbuf(fsw, pkt);
		if (__improbable(m == NULL)) {
			continue;
		}
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		c++;
		b += m_pktlen(m);
	}
	ASSERT(c <= *cnt);
	pp_free_packet_chain(pkt_chain, NULL);
	*m_chain = head;
	*m_tail = tail;
	*cnt = c;
	*bytes = b;
}

void
fsw_host_sendup(ifnet_t ifp, struct mbuf *m_chain, struct mbuf *m_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet_stat_increment_param s;

	bzero(&s, sizeof(s));
	s.packets_in = cnt;
	s.bytes_in = bytes;
	dlil_input_handler(ifp, m_chain, m_tail, &s, FALSE, NULL);
}

void
fsw_host_rx(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq *q;
	struct __kern_packet *pkt_chain;
	struct mbuf *m_chain = NULL, *m_tail = NULL;
	uint32_t cnt = 0, bytes = 0;
	boolean_t compat;

	q = &fe->fe_rx_pktq;
	pkt_chain = KPKTQ_FIRST(q);
	cnt = KPKTQ_LEN(q);
	KPKTQ_INIT(q);
	if (__improbable(pkt_chain == NULL)) {
		DTRACE_SKYWALK2(empty__pktq, struct nx_flowswitch *,
		    fsw, struct flow_entry *, fe);
		return;
	}

	/* All packets in the chain must have the same type */
	compat = ((pkt_chain->pkt_pflags & PKT_F_MBUF_DATA) != 0);
	if (compat) {
		convert_compat_pkt_to_mbuf_chain(fsw, fe, pkt_chain, &m_chain,
		    &m_tail, &cnt, &bytes);
	} else {
		convert_native_pkt_to_mbuf_chain(fsw, fe, pkt_chain, &m_chain,
		    &m_tail, &cnt, &bytes);
	}
	if (__improbable(m_chain == NULL)) {
		DTRACE_SKYWALK2(empty__chain, struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe);
		return;
	}
	fsw_host_sendup(fsw->fsw_ifp, m_chain, m_tail, cnt, bytes);
}

void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq);
}

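/*
 * Resolve the nexus adapter backing a user flow's port; returns NULL
 * (bumping the corresponding stat) if the port is no longer valid,
 * has no active adapter, or has been defuncted.
 */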
static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	struct nexus_adapter *na = NULL;
	nexus_port_t port = fe->fe_nx_port;

	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
		SK_ERR("dev or host ports have no NA");
		return NULL;
	}

	if (__improbable(!nx_port_is_valid(nx, port))) {
		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	na = nx_port_get_na(nx, port);
	if (__improbable(na == NULL)) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(!NA_IS_ACTIVE(na))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(nx_port_is_defunct(nx, port))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	return na;
}

static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
{
	struct nexus_vp_adapter *na = NULL;
	struct __kern_channel_ring *r = NULL;

	na = VPNA(flow_get_na(fsw, fe));
	if (__improbable(na == NULL)) {
		return NULL;
	}

	switch (txrx) {
	case NR_RX:
		r = &na->vpna_up.na_rx_rings[0];
		break;
	case NR_TX:
		r = &na->vpna_up.na_tx_rings[0];
		break;
	default:
		VERIFY(0);
		/* not reached */
		__builtin_unreachable();
	}

	if (__improbable(KR_DROP(r))) {
		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r 0x%llx %s drop mode",
		    SK_KVA(r), r->ckr_name);
		return NULL;
	}

	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);

#if (DEVELOPMENT || DEBUG)
	if (r != NULL) {
		_FSW_INJECT_ERROR(4, r, NULL, null_func);
	}
#endif /* DEVELOPMENT || DEBUG */

	return r;
}

struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}

static inline struct __kern_channel_ring *
fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_TX);
}

static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			atomic_add_32(&fsw->fsw_pending_nonviable, 1);
			if (atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
				fsw_reap_sched(fsw);
			} else {
				atomic_add_32(&fsw->fsw_pending_nonviable, -1);
			}
		}
		if (fr != NULL) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
		    SK_KVA(fe));
		return false;
	}

	return true;
}

bool
dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	bool okay;
	okay = dp_flow_route_process(fsw, fe);
#if (DEVELOPMENT || DEBUG)
	if (okay) {
		_FSW_INJECT_ERROR(5, okay, false, null_func);
	}
#endif /* DEVELOPMENT || DEBUG */

	return okay;
}

void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;      /* done src packets */
	struct pktq dropped_pkts;       /* dropped src packets */
	struct pktq transferred_pkts;   /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		fsw_host_rx(fsw, fe);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
		    r->ckr_name, SK_KVA(r));
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(fe->fe_rx_pktq_bytes, dpp->pp_buflet_size);
	if (cnt > n_pkts) {
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		err = 0;
		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
			fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		} else {
			fe->fe_rx_pktq_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_ERR("flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST then deliver to host */
			if (err == ENETRESET) {
				struct flow_entry *host_fe;
				host_fe =
				    flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
				KPKTQ_ENQUEUE(&host_fe->fe_rx_pktq, pkt);
				continue;
			}
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
			n_bufs = howmany(pkt->pkt_length, dpp->pp_buflet_size);
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(fe->fe_rx_pktq_bytes,
					    dpp->pp_buflet_size);
					n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						KPKTQ_ENQUEUE(&dropped_pkts,
						    pkt);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, 0x%llx", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				bnew = (kern_buflet_t)buf_array[buf_array_iter];
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
			pkt->pkt_seg_cnt = 1;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
	KPKTQ_FINI(&transferred_pkts);

	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);

done:
	/* Free unused buflets */
	while (buf_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
}

static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, fpd, etc.) */
	fe->fe_rx_process(fsw, fe);

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);
}

static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong to the
	 * flowswitch, as wake packets for the host stack are handled
	 * by the host input function.
	 */
1609 #if (DEBUG || DEVELOPMENT)
1610 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
1611 /*
1612 * This is a one shot command
1613 */
1614 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
1615
1616 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1617 }
1618 #endif /* (DEBUG || DEVELOPMENT) */
1619 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1620 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1621 }
1622 }
1623
1624 static void
dp_rx_pktq(struct nx_flowswitch * fsw,struct pktq * pktq)1625 dp_rx_pktq(struct nx_flowswitch *fsw, struct pktq *pktq)
1626 {
1627 struct __kern_packet *pkt, *tpkt;
1628 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
1629 struct flow_entry *fe, *prev_fe;
1630 sa_family_t af;
1631 struct pktq dropped_pkts;
1632 int err;
1633
1634 KPKTQ_INIT(&dropped_pkts);
1635
1636 FSW_RLOCK(fsw);
1637 if (__improbable(FSW_QUIESCED(fsw))) {
1638 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
1639 KPKTQ_CONCAT(&dropped_pkts, pktq);
1640 goto done;
1641 }
1642 if (__improbable(fsw->fsw_demux == NULL)) {
1643 KPKTQ_CONCAT(&dropped_pkts, pktq);
1644 goto done;
1645 }
1646
1647 prev_fe = NULL;
1648 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
1649 if (__probable(tpkt)) {
1650 void *baddr;
1651 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1652 SK_PREFETCH(baddr, 0);
1653 /* prefetch L3 and L4 flow structs */
1654 SK_PREFETCHW(tpkt->pkt_flow, 0);
1655 SK_PREFETCHW(tpkt->pkt_flow, 128);
1656 }
1657
1658 KPKTQ_REMOVE(pktq, pkt);
1659
1660 pkt = rx_prepare_packet(fsw, pkt);
1661
1662 af = fsw->fsw_demux(fsw, pkt);
1663 if (__improbable(af == AF_UNSPEC)) {
1664 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1665 goto flow_batch;
1666 }
1667
1668 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1669 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
1670 if (__improbable(err != 0)) {
1671 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
1672 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1673 goto flow_batch;
1674 }
1675
1676 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1677 pkt = rx_process_ip_frag(fsw, pkt);
1678 if (pkt == NULL) {
1679 continue;
1680 }
1681 }
1682
1683 fe = rx_lookup_flow(fsw, pkt, prev_fe);
1684 if (__improbable(fe == NULL)) {
1685 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1686 prev_fe = NULL;
1687 continue;
1688 }
1689
1690 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
1691
1692 dp_rx_process_wake_packet(fsw, pkt);
1693
1694 flow_batch:
1695 rx_flow_batch_packet(&fes, fe, pkt);
1696 prev_fe = fe;
1697 }
1698
1699 struct flow_entry *tfe = NULL;
1700 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
1701 rx_flow_process(fsw, fe);
1702 TAILQ_REMOVE(&fes, fe, fe_rx_link);
1703 fe->fe_rx_pktq_bytes = 0;
1704 fe->fe_rx_frag_count = 0;
1705 flow_entry_release(&fe);
1706 }
1707
1708 /* XXX(OPTIMIZE) need to re-circulate extras back to HOST */
1709 fe = flow_mgr_get_host_fe(fsw->fsw_flow_mgr);
1710 if (!KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
1711 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
1712 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX,
1713 "re-circulate %d pkts to HOST", KPKTQ_LEN(&fe->fe_rx_pktq));
1714 rx_flow_process(fsw, fe);
1715 }
1716 flow_entry_release(&fe);
1717
1718 done:
1719 FSW_RUNLOCK(fsw);
1720
1721 dp_drop_pktq(fsw, &dropped_pkts);
1722 }
1723
1724 static void
1725 dp_rx_pkts(struct nx_flowswitch *fsw, struct __kern_packet *pkts[],
1726 uint32_t n_pkts)
1727 {
1728 struct pktq pktq;
1729 KPKTQ_INIT(&pktq);
1730 pkts_to_pktq(pkts, n_pkts, &pktq);
1731 dp_rx_pktq(fsw, &pktq);
1732 KPKTQ_FINI(&pktq);
1733 }
1734
1735 int
1736 fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t *pkts,
1737 uint32_t n_pkts)
1738 {
1740 struct nx_flowswitch *fsw = handle;
1741 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
1742 sk_protect_t protect;
1743 uint32_t i;
1744
1745 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
1746
1747 for (i = 0; i < n_pkts; i++) {
1748 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
1749 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
1750 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
1751 }
1752
1753 protect = sk_sync_protect();
1754 dp_rx_pkts(fsw, kpkts, n_pkts);
1755 sk_sync_unprotect(protect);
1756
1757 return 0;
1758 }
1759
1760 static void
1761 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
1762 {
1763 classq_pkt_t p;
1764 struct netem *ne;
1765 struct __kern_packet *pkt, *tpkt;
1766
1767 ASSERT(fsw->fsw_ifp != NULL);
1768 ne = fsw->fsw_ifp->if_input_netem;
1769 ASSERT(ne != NULL);
1770 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
1771 boolean_t pdrop;
1772 KPKTQ_REMOVE(q, pkt);
1773 CLASSQ_PKT_INIT_PACKET(&p, pkt);
1774 netem_enqueue(ne, &p, &pdrop);
1775 }
1776 }
1777
1778 void
1779 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_chain,
1780 struct nexus_pkt_stats *out_stats)
1781 {
1782 struct __kern_packet *pkt = pkt_chain, *next;
1783 struct nx_flowswitch *fsw;
1784 uint32_t n_bytes = 0, n_pkts = 0;
1785 uint64_t total_pkts = 0, total_bytes = 0;
1786 struct pktq q;
1787
1788 KPKTQ_INIT(&q);
1789 if (__improbable(devna->na_ifp == NULL ||
1790 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
1791 SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
1792 pp_free_packet_chain(pkt_chain, NULL);
1793 return;
1794 }
1795 while (pkt != NULL) {
1796 if (__improbable(pkt->pkt_trace_id != 0)) {
1797 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
1798 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
1799 }
1800 next = pkt->pkt_nextpkt;
1801 pkt->pkt_nextpkt = NULL;
1802
1803 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
1804 KPKTQ_ENQUEUE(&q, pkt);
1805 n_bytes += pkt->pkt_length;
1806 } else {
1807 DTRACE_SKYWALK1(non__finalized__drop,
1808 struct __kern_packet *, pkt);
1809 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
1810 pp_free_packet_single(pkt);
1811 pkt = NULL;
1812 }
1813 n_pkts = KPKTQ_LEN(&q);
1814 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
1815 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
1816 dp_rx_pktq(fsw, &q);
1817 } else {
1818 fsw_dev_input_netem_enqueue(fsw, &q);
1819 }
1820 total_pkts += n_pkts;
1821 total_bytes += n_bytes;
1822 n_pkts = 0;
1823 n_bytes = 0;
1824 KPKTQ_FINI(&q);
1825 }
1826 pkt = next;
1827 }
1828 ASSERT(KPKTQ_LEN(&q) == 0);
1829 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
1830 if (out_stats != NULL) {
1831 out_stats->nps_pkts = total_pkts;
1832 out_stats->nps_bytes = total_bytes;
1833 }
1834 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
1835 }
1836
1837 static int
1838 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1839 struct __kern_packet *dpkt)
1840 {
1841 struct mbuf *m = NULL;
1842 uint16_t bdlen, bdlim, bdoff;
1843 uint8_t *bdaddr;
1844 unsigned int one = 1;
1845 int err = 0;
1846
1847 err = mbuf_allocpacket(MBUF_DONTWAIT,
1848 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
1849 #if (DEVELOPMENT || DEBUG)
1850 if (m != NULL) {
1851 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
1852 }
1853 #endif /* DEVELOPMENT || DEBUG */
1854 if (__improbable(m == NULL)) {
1855 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1856 err = ENOBUFS;
1857 goto done;
1858 }
1859
1860 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
1861 if (fsw->fsw_frame_headroom > bdlim) {
1862 SK_ERR("not enough space in buffer for headroom");
1863 m_freem(m); err = EINVAL;	/* m isn't attached to dpkt yet; don't leak it */
1864 goto done;
1865 }
1866
1867 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
1868 dpkt->pkt_mbuf = m;
1869 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
1870
1871 /* packet copy into mbuf */
1872 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
1873 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
1874 fsw->fsw_frame_headroom, spkt->pkt_length,
1875 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
1876 spkt->pkt_csum_tx_start_off);
1877 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
1878
1879 /* header copy into dpkt buffer for classification */
1880 kern_packet_t sph = SK_PTR_ENCODE(spkt,
1881 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
1882 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
1883 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
1884 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
1885 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
1886 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
1887
1888 /*
1889 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
1890 * buflet baddr m_data always points to the beginning of packet and
1891 * should represents the same as baddr + headroom
1892 */
1893 ASSERT((uintptr_t)m->m_data ==
1894 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
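/*
 * Sketch of the resulting mbuf layout implied by the assertion above
 * (the headroom width shown is illustrative only):
 *
 *   mbuf_datastart(m)              m_data
 *         |<-- fsw_frame_headroom -->|<--- spkt->pkt_length --->|
 */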
1895
1896 done:
1897 return err;
1898 }
1899
1900 static int
1901 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1902 struct __kern_packet *dpkt)
1903 {
1904 struct ifnet *ifp = fsw->fsw_ifp;
1905 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
1906
1907 if (headroom > UINT8_MAX) {
1908 SK_ERR("headroom too large %d", headroom);
1909 return ERANGE;
1910 }
1911 dpkt->pkt_headroom = (uint8_t)headroom;
1912 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
1913 dpkt->pkt_l2_len = 0;
1914 dpkt->pkt_link_flags = spkt->pkt_link_flags;
1915
1916 kern_packet_t sph = SK_PTR_ENCODE(spkt,
1917 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
1918 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
1919 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
1920 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
1921 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
1922 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
1923 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
1924 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
1925 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
1926
1927 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
1928
1929 return 0;
1930 }
1931
1932 #if SK_LOG
1933 /* Hoisted out of line to reduce kernel stack footprint */
1934 SK_LOG_ATTRIBUTE
1935 static void
1936 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
1937 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
1938 {
1939 struct proc *p = current_proc();
1940 struct ifnet *ifp = fsw->fsw_ifp;
1941 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
1942
1943 if (error == ERANGE) {
1944 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
1945 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
1946 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
1947 (uint32_t)pp->pp_max_frags * pp->pp_buflet_size);
1948 } else if (error == ENOBUFS) {
1949 SK_DF(logflags, "%s(%d) packet allocation failure",
1950 sk_proc_name_address(p), sk_proc_pid(p));
1951 } else if (error == 0) {
1952 ASSERT(dpkt != NULL);
1953 char *daddr;
1954 MD_BUFLET_ADDR_ABS(dpkt, daddr);
1955 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
1956 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
1957 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
1958 (uint32_t)fsw->fsw_frame_headroom,
1959 (uint32_t)ifp->if_tx_headroom);
1960 SK_DF(logflags | SK_VERB_DUMP, "%s",
1961 sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
1962 } else {
1963 SK_DF(logflags, "%s(%d) error %d", error);
1964 }
1965 }
1966 #else
1967 #define dp_copy_to_dev_log(...)
1968 #endif /* SK_LOG */
1969
1970 static int
1971 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
1972 struct __kern_packet *dpkt)
1973 {
1974 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
1975 struct ifnet *ifp = fsw->fsw_ifp;
1976 uint32_t dev_pkt_len;
1977 int err = 0;
1978
1979 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
1980 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
1981
1982 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
1983 /* Copy packet metadata */
1984 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1985 _PKT_COPY(spkt, dpkt);
1986 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
1987 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
1988 ASSERT(dpkt->pkt_mbuf == NULL);
1989
1990 /* Copy AQM metadata */
1991 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
1992 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
1993 _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
1994 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
1995 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
1996 dpkt->pkt_policy_id = spkt->pkt_policy_id;
1997
1998 switch (fsw->fsw_classq_enq_ptype) {
1999 case QP_MBUF:
2000 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2001 break;
2002
2003 case QP_PACKET:
2004 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2005 spkt->pkt_length;
2006 if (dev_pkt_len > pp->pp_max_frags * pp->pp_buflet_size) {
2007 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2008 err = ERANGE;
2009 goto done;
2010 }
2011 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2012 break;
2013
2014 default:
2015 VERIFY(0);
2016 __builtin_unreachable();
2017 }
2018 done:
2019 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2020 return err;
2021 }
2022
2023 static struct mbuf *
2024 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2025 {
2026 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2027 ASSERT(pkt->pkt_mbuf != NULL);
2028 struct mbuf *m = pkt->pkt_mbuf;
2029
2030 /* pass additional metadata generated from flow parse/lookup */
2031 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2032 sizeof(pkt->pkt_flow_token));
2033 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2034 sizeof(pkt->pkt_flowsrc_token));
2035 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2036 sizeof(pkt->pkt_flowsrc_fidx));
2037 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2038 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2039 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2040 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2041 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2042 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2043 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2044
2045 /* The packet should have a timestamp by the time we get here. */
2046 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2047 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2048
2049 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2050 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2051 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2052 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2053 }
2054 KPKT_CLEAR_MBUF_DATA(pkt);
2055
2056 /* mbuf has been consumed, release packet as well */
2057 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2058 pp_free_packet_single(pkt);
2059 return m;
2060 }
2061
2062 static void
2063 convert_pkt_to_mbuf_chain(struct __kern_packet *pkt_chain,
2064 struct mbuf **chain, struct mbuf **tail,
2065 uint32_t *cnt, uint32_t *bytes)
2066 {
2067 struct __kern_packet *pkt = pkt_chain, *next;
2068 struct mbuf *m_chain = NULL, **m_tailp = &m_chain, *m = NULL;
2069 uint32_t c = 0, b = 0;
2070
2071 while (pkt != NULL) {
2072 next = pkt->pkt_nextpkt;
2073 pkt->pkt_nextpkt = NULL;
2074 m = convert_pkt_to_mbuf(pkt);
2075 ASSERT(m != NULL);
2076
2077 *m_tailp = m;
2078 m_tailp = &m->m_nextpkt;
2079 c++;
2080 b += m_pktlen(m);
2081 pkt = next;
2082 }
2083 if (chain != NULL) {
2084 *chain = m_chain;
2085 }
2086 if (tail != NULL) {
2087 *tail = m;
2088 }
2089 if (cnt != NULL) {
2090 *cnt = c;
2091 }
2092 if (bytes != NULL) {
2093 *bytes = b;
2094 }
2095 }
2096
2097 SK_NO_INLINE_ATTRIBUTE
2098 static int
2099 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2100 struct __kern_packet *pkt)
2101 {
2102 struct ifnet *ifp = fsw->fsw_ifp;
2103 boolean_t pkt_drop = FALSE;
2104 int err;
2105
2106 FSW_LOCK_ASSERT_HELD(fsw);
2107 ASSERT(fsw->fsw_classq_enabled);
2108 /*
2109 * we are using the first 4 bytes of flow_id as the AQM flow
2110 * identifier.
2111 */
2112 ASSERT(!uuid_is_null(pkt->pkt_flow_id));
2113 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2114 1, pkt->pkt_length);
2115
2116 if (__improbable(pkt->pkt_trace_id != 0)) {
2117 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2118 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2119 }
2120
2121 switch (fsw->fsw_classq_enq_ptype) {
2122 case QP_MBUF: { /* compat interface */
2123 struct mbuf *m;
2124
2125 m = convert_pkt_to_mbuf(pkt);
2126 ASSERT(m != NULL);
2127 pkt = NULL;
2128
2129 /* ifnet_enqueue consumes mbuf */
2130 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2131 m = NULL;
2132 #if (DEVELOPMENT || DEBUG)
2133 if (__improbable(!pkt_drop)) {
2134 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2135 }
2136 #endif /* DEVELOPMENT || DEBUG */
2137 if (pkt_drop) {
2138 FSW_STATS_INC(FSW_STATS_DROP);
2139 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2140 }
2141 break;
2142 }
2143 case QP_PACKET: { /* native interface */
2144 /* ifnet_enqueue consumes packet */
2145 err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2146 pkt = NULL;
2147 #if (DEVELOPMENT || DEBUG)
2148 if (__improbable(!pkt_drop)) {
2149 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2150 }
2151 #endif /* DEVELOPMENT || DEBUG */
2152 if (pkt_drop) {
2153 FSW_STATS_INC(FSW_STATS_DROP);
2154 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2155 }
2156 break;
2157 }
2158 default:
2159 err = EINVAL;
2160 VERIFY(0);
2161 /* NOTREACHED */
2162 __builtin_unreachable();
2163 }
2164
2165 return err;
2166 }
2167
2168 static int
2169 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2170 struct __kern_packet *pkt_chain, struct __kern_packet *pkt_tail,
2171 uint32_t cnt, uint32_t bytes)
2172 {
2173 struct ifnet *ifp = fsw->fsw_ifp;
2174 boolean_t pkt_drop = FALSE;
2175 uint32_t svc;
2176 int err;
2177
2178 FSW_LOCK_ASSERT_HELD(fsw);
2179 ASSERT(fsw->fsw_classq_enabled);
2180 /*
2181 * we are using the first 4 bytes of flow_id as the AQM flow
2182 * identifier.
2183 */
2184 ASSERT(!uuid_is_null(pkt_chain->pkt_flow_id));
2185
2186 /*
2187 * All packets in the flow should have the same svc.
2188 */
2189 svc = pkt_chain->pkt_svc_class;
2190 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2191
2192 switch (fsw->fsw_classq_enq_ptype) {
2193 case QP_MBUF: { /* compat interface */
2194 struct mbuf *m_chain = NULL, *m_tail = NULL;
2195 uint32_t c = 0, b = 0;
2196
2197 convert_pkt_to_mbuf_chain(pkt_chain, &m_chain, &m_tail, &c, &b);
2198 ASSERT(m_chain != NULL && m_tail != NULL);
2199 ASSERT(c == cnt);
2200 ASSERT(b == bytes);
2201 pkt_chain = NULL;
2202
2203 /* ifnet_enqueue consumes mbuf */
2204 err = ifnet_enqueue_mbuf_chain(ifp, m_chain, m_tail, cnt,
2205 bytes, FALSE, &pkt_drop);
2206 m_chain = NULL;
2207 m_tail = NULL;
2208 #if (DEVELOPMENT || DEBUG)
2209 if (__improbable(!pkt_drop)) {
2210 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2211 }
2212 #endif /* DEVELOPMENT || DEBUG */
2213 if (pkt_drop) {
2214 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2215 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2216 cnt);
2217 }
2218 break;
2219 }
2220 case QP_PACKET: { /* native interface */
2221 /* ifnet_enqueue consumes packet */
2222 err = ifnet_enqueue_pkt_chain(ifp, pkt_chain, pkt_tail, cnt,
2223 bytes, FALSE, &pkt_drop);
2224 pkt_chain = NULL;
2225 #if (DEVELOPMENT || DEBUG)
2226 if (__improbable(!pkt_drop)) {
2227 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2228 }
2229 #endif /* DEVELOPMENT || DEBUG */
2230 if (pkt_drop) {
2231 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2232 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2233 cnt);
2234 }
2235 break;
2236 }
2237 default:
2238 err = EINVAL;
2239 VERIFY(0);
2240 /* NOTREACHED */
2241 __builtin_unreachable();
2242 }
2243
2244 return err;
2245 }
2246
2247 /*
2248 * This code path needs to be kept for interfaces without logical link support.
2249 */
2250 static void
2251 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2252 boolean_t chain, uint32_t cnt, uint32_t bytes)
2253 {
2254 bool flowadv_is_set = false;
2255 struct __kern_packet *pkt, *tail, *tpkt;
2256 flowadv_idx_t flow_adv_idx;
2257 bool flowadv_cap;
2258 flowadv_token_t flow_adv_token;
2259 int err;
2260
2261 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2262 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2263
2264 if (chain) {
2265 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2266 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2267 KPKTQ_INIT(&fe->fe_tx_pktq);
2268 if (pkt == NULL) {
2269 return;
2270 }
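/*
 * All packets on fe_tx_pktq belong to the same flow, so the head
 * packet's flow advisory index and token can stand in for the
 * entire chain.
 */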
2271 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2272 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2273 flow_adv_token = pkt->pkt_flow_token;
2274
2275 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2276
2277 /* set flow advisory if needed */
2278 if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
2279 flowadv_cap)) {
2280 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2281 flow_adv_idx, flow_adv_token);
2282 }
2283 } else {
2284 uint32_t c = 0, b = 0;
2285
2286 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2287 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2288
2289 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2290 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2291 flow_adv_token = pkt->pkt_flow_token;
2292
2293 c++;
2294 b += pkt->pkt_length;
2295 err = classq_enqueue_flow_single(fsw, pkt);
2296
2297 /* set flow advisory if needed */
2298 if (__improbable(!flowadv_is_set &&
2299 ((err == EQFULL || err == EQSUSPENDED) &&
2300 flowadv_cap))) {
2301 flowadv_is_set = na_flowadv_set(
2302 flow_get_na(fsw, fe), flow_adv_idx,
2303 flow_adv_token);
2304 }
2305 }
2306 ASSERT(c == cnt);
2307 ASSERT(b == bytes);
2308 }
2309
2310 /* notify flow advisory event */
2311 if (__improbable(flowadv_is_set)) {
2312 struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
2313 if (__probable(r)) {
2314 na_flowadv_event(r);
2315 SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
2316 "%s(%d) notified of flow update",
2317 sk_proc_name_address(current_proc()),
2318 sk_proc_pid(current_proc()));
2319 }
2320 }
2321 }
2322
2323 /*
2324 * Logical link code path
2325 */
2326 static void
2327 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2328 boolean_t chain, uint32_t cnt, uint32_t bytes)
2329 {
2330 struct __kern_packet *pkt, *tail;
2331 flowadv_idx_t flow_adv_idx;
2332 bool flowadv_is_set = false;
2333 bool flowadv_cap;
2334 flowadv_token_t flow_adv_token;
2335 uint32_t flowctl = 0, dropped = 0;
2336 int err;
2337
2338 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2339 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2340
2341 /*
2342 * Not supporting chains for now
2343 */
2344 VERIFY(!chain);
2345 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2346 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2347 KPKTQ_INIT(&fe->fe_tx_pktq);
2348 if (pkt == NULL) {
2349 return;
2350 }
2351 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2352 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2353 flow_adv_token = pkt->pkt_flow_token;
2354
2355 err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
2356 &flowctl, &dropped);
2357
2358 if (__improbable(err != 0)) {
2359 /* set flow advisory if needed */
2360 if (flowctl > 0 && flowadv_cap) {
2361 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2362 flow_adv_idx, flow_adv_token);
2363
2364 /* notify flow advisory event */
2365 if (flowadv_is_set) {
2366 struct __kern_channel_ring *r =
2367 fsw_flow_get_tx_ring(fsw, fe);
2368 if (__probable(r)) {
2369 na_flowadv_event(r);
2370 SK_DF(SK_VERB_FLOW_ADVISORY |
2371 SK_VERB_TX,
2372 "%s(%d) notified of flow update",
2373 sk_proc_name_address(current_proc()),
2374 sk_proc_pid(current_proc()));
2375 }
2376 }
2377 }
2378 if (dropped > 0) {
2379 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
2380 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2381 dropped);
2382 }
2383 }
2384 }
2385
2386 static void
2387 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2388 {
2389 #pragma unused(fsw)
2390 /* finalize here; no more changes to buflets after classq */
2391 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2392 kern_packet_t ph = SK_PTR_ENCODE(pkt,
2393 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2394 int err = __packet_finalize(ph);
2395 VERIFY(err == 0);
2396 }
2397 }
2398
2399 static bool
2400 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2401 {
2402 struct flow_route *fr = fe->fe_route;
2403 int err;
2404
2405 ASSERT(fr != NULL);
2406
2407 if (__improbable(!dp_flow_route_process(fsw, fe))) {
2408 return false;
2409 }
2410
2411 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
2412 _fsw_error35_handler, 1, fr, NULL, NULL);
2413 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
2414 _fsw_error36_handler, 1, fr, NULL);
2415
2416 /*
2417 * See if we need to resolve the flow route; note the test against
2418 * fr_flags here is done without any lock for performance. Thus
2419 * it's possible that we race against the thread performing route
2420 * event updates for a packet (which is OK). In any case we should
2421 * not have any assertion on fr_flags value(s) due to the lack of
2422 * serialization.
2423 */
2424 if (fr->fr_flags & FLOWRTF_RESOLVED) {
2425 goto frame;
2426 }
2427
2428 struct __kern_packet *pkt, *tpkt;
2429 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2430 err = fsw->fsw_resolve(fsw, fr, pkt);
2431 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
2432 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
2433 /*
2434 * If resolver returns EJUSTRETURN then we drop the pkt as the
2435 * resolver should have converted the pkt into mbuf (or
2436 * detached the attached mbuf from pkt) and added it to the
2437 * llinfo queue. If we do have a cached llinfo, then proceed
2438 * to using it even though it may be stale (very unlikely)
2439 * while the resolution is in progress.
2440 * Otherwise, any other error results in dropping pkt.
2441 */
2442 if (err == EJUSTRETURN) {
2443 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2444 pp_free_packet_single(pkt);
2445 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
2446 continue;
2447 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
2448 /* use existing llinfo */
2449 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
2450 } else if (err != 0) {
2451 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2452 pp_free_packet_single(pkt);
2453 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
2454 continue;
2455 }
2456 }
2457
2458 frame:
2459 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2460 if (fsw->fsw_frame != NULL) {
2461 fsw->fsw_frame(fsw, fr, pkt);
2462 }
2463 }
2464
2465 return true;
2466 }
2467
2468 static void
2469 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2470 {
2471 struct __kern_packet *pkt, *tpkt;
2472 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2473 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2474 /* listener is only allowed TCP RST */
2475 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
2476 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
2477 fsw_flow_abort_tcp(fsw, fe, pkt);
2478 } else {
2479 char *addr;
2480 MD_BUFLET_ADDR_ABS(pkt, addr);
2481 SK_ERR("listener flow sends non-RST packet %s",
2482 sk_dump(sk_proc_name_address(current_proc()),
2483 addr, pkt->pkt_length, 128, NULL, 0));
2484 }
2485 pp_free_packet_single(pkt);
2486 }
2487 }
2488
2489 static void
2490 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
2491 volatile uint64_t *rt_ts, ifnet_t ifp)
2492 {
2493 struct timespec now;
2494 uint64_t now_nsec = 0;
2495
2496 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
2497 nanouptime(&now);
2498 net_timernsec(&now, &now_nsec);
2499 pkt->pkt_timestamp = now_nsec;
2500 }
2501 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
2502
2503 /*
2504 * If the packet service class is not background,
2505 * update the timestamps on the interface, as well as
2506 * the ones in nexus-wide advisory to indicate recent
2507 * activity on a foreground flow.
2508 */
2509 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
2510 ifp->if_fg_sendts = (uint32_t)_net_uptime;
2511 if (fg_ts != NULL) {
2512 *fg_ts = _net_uptime;
2513 }
2514 }
2515 if (pkt->pkt_pflags & PKT_F_REALTIME) {
2516 ifp->if_rt_sendts = (uint32_t)_net_uptime;
2517 if (rt_ts != NULL) {
2518 *rt_ts = _net_uptime;
2519 }
2520 }
2521 }
2522
2523 /*
2524 * TODO:
2525 * We can check the flow entry as well to only allow chain enqueue
2526 * on flows matching certain criteria.
2527 */
2528 static boolean_t
2529 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, struct flow_entry *fe)
2530 {
2532 return fsw_chain_enqueue != 0 &&
2533 fsw->fsw_ifp->if_output_netem == NULL &&
2534 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
2535 fe->fe_qset == NULL;
2536 }
2537
2538 void
2539 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2540 {
2541 struct pktq dropped_pkts;
2542 boolean_t chain;
2543 uint32_t cnt = 0, bytes = 0;
2544 volatile struct sk_nexusadv *nxadv = NULL;
2545 volatile uint64_t *fg_ts = NULL;
2546 volatile uint64_t *rt_ts = NULL;
2547
2548 KPKTQ_INIT(&dropped_pkts);
2549 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
2550 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
2551 dp_listener_flow_tx_process(fsw, fe);
2552 return;
2553 }
2554 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
2555 SK_RDERR(5, "Tx route bad");
2556 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
2557 KPKTQ_LEN(&fe->fe_tx_pktq));
2558 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
2559 goto done;
2560 }
2561 chain = fsw_chain_enqueue_enabled(fsw, fe);
2562 if (chain) {
2563 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
2564 if (nxadv != NULL) {
2565 fg_ts = &nxadv->nxadv_fg_sendts;
2566 rt_ts = &nxadv->nxadv_rt_sendts;
2567 }
2568 }
2569 struct __kern_packet *pkt, *tpkt;
2570 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2571 int err = flow_pkt_track(fe, pkt, false);
2572 if (__improbable(err != 0)) {
2573 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
2574 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
2575 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2576 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2577 continue;
2578 }
2579
2580 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
2581 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
2582
2583 /* set AQM related values for outgoing packet */
2584 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
2585 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
2586 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
2587 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
2588 } else {
2589 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
2590 }
2591 pkt->pkt_pflags |= PKT_F_FLOW_ID;
2592
2593 /*
2594 * The same code is exercised per packet for the non-chain case
2595 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
2596 * re-walking the chain later.
2597 */
2598 if (chain) {
2599 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
2600 }
2601 /* mark packet tos/svc_class */
2602 fsw_qos_mark(fsw, fe, pkt);
2603
2604 tx_finalize_packet(fsw, pkt);
2605 bytes += pkt->pkt_length;
2606 cnt++;
2607 }
2608
2609 /* snoop after it's finalized */
2610 if (__improbable(pktap_total_tap_count != 0)) {
2611 fsw_snoop(fsw, fe, false);
2612 }
2613 if (fe->fe_qset != NULL) {
2614 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
2615 } else {
2616 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
2617 }
2618 done:
2619 dp_drop_pktq(fsw, &dropped_pkts);
2620 }
2621
2622 static struct flow_entry *
2623 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
2624 struct flow_entry *prev_fe, struct __kern_packet *pkt)
2625 {
2626 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
2627
2628 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
2629 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
2630 SK_ERR("%s(%d) invalid zero fragment id",
2631 sk_proc_name_address(current_proc()),
2632 sk_proc_pid(current_proc()));
2633 return NULL;
2634 }
2635
2636 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
2637 "%s(%d) continuation frag, id %u",
2638 sk_proc_name_address(current_proc()),
2639 sk_proc_pid(current_proc()),
2640 pkt->pkt_flow_ip_frag_id);
2641 if (__improbable(prev_fe == NULL ||
2642 !prev_fe->fe_tx_is_cont_frag)) {
2643 SK_ERR("%s(%d) unexpected continuation frag",
2644 sk_proc_name_address(current_proc()),
2645 sk_proc_pid(current_proc()),
2646 pkt->pkt_flow_ip_frag_id);
2647 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2648 return NULL;
2649 }
2650 if (__improbable(pkt->pkt_flow_ip_frag_id !=
2651 prev_fe->fe_tx_frag_id)) {
2652 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2653 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
2654 sk_proc_name_address(current_proc()),
2655 sk_proc_pid(current_proc()),
2656 pkt->pkt_flow_ip_frag_id,
2657 prev_fe->fe_tx_frag_id);
2658 return NULL;
2659 }
2660
2661 return prev_fe;
2662 }
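/*
 * Note: the continuation check above only remembers the flow entry of
 * the immediately preceding packet (prev_fe), so it relies on the
 * sender keeping all fragments of a datagram adjacent within a batch.
 */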
2663
2664 static struct flow_entry *
2665 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
2666 struct flow_entry *prev_fe)
2667 {
2668 struct flow_entry *fe;
2669
2670 fe = lookup_flow_with_key(fsw, pkt, false, prev_fe);
2671 if (__improbable(fe == NULL)) {
2672 goto done;
2673 }
2674
2675 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
2676 SK_RDERR(5, "Tx flow torn down");
2677 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
2678 flow_entry_release(&fe);
2679 goto done;
2680 }
2681
2682 SK_LOG_VAR(char febuf[FLOWENTRY_DBGBUF_SIZE]);
2683 SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP | SK_VERB_TX,
2684 "fe 0x%llx \"%s\"",
2685 SK_KVA(fe), fe_as_string(fe, febuf, sizeof(febuf)));
2686
2687 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
2688 null_func);
2689
2690 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
2691 uuid_string_t flow_id_str, pkt_id_str;
2692 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
2693 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
2694 SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
2695 flow_entry_release(&fe);
2696 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
2697 }
2698
2699 done:
2700 return fe;
2701 }
2702
2703 static inline void
2704 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2705 {
2706 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
2707 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
2708
2709 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
2710 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
2711
2712 /* flow related processing (default, agg, etc.) */
2713 fe->fe_tx_process(fsw, fe);
2714
2715 KPKTQ_FINI(&fe->fe_tx_pktq);
2716 }
2717
2718 #if SK_LOG
2719 static void
2720 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
2721 {
2722 char *pkt_buf;
2723 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
2724 SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
2725 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
2726 pkt->pkt_length, 128, NULL, 0));
2727 }
2728 #else /* !SK_LOG */
2729 #define dp_tx_log_pkt(...)
2730 #endif /* !SK_LOG */
2731
2732 static void
2733 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
2734 {
2735 struct __kern_packet *spkt, *pkt;
2736 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2737 struct flow_entry *fe, *prev_fe;
2738 struct pktq dropped_pkts, dpktq;
2739 struct nexus_adapter *dev_na;
2740 struct kern_pbufpool *dev_pp;
2741 struct ifnet *ifp;
2742 sa_family_t af;
2743 uint32_t n_pkts, n_flows = 0;
2744
2745 int err;
2746 KPKTQ_INIT(&dpktq);
2747 KPKTQ_INIT(&dropped_pkts);
2748 n_pkts = KPKTQ_LEN(spktq);
2749
2750 FSW_RLOCK(fsw);
2751 if (__improbable(FSW_QUIESCED(fsw))) {
2752 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
2753 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
2754 KPKTQ_CONCAT(&dropped_pkts, spktq);
2755 goto done;
2756 }
2757 dev_na = fsw->fsw_dev_ch->ch_na;
2758 if (__improbable(dev_na == NULL)) {
2759 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
2760 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
2761 KPKTQ_CONCAT(&dropped_pkts, spktq);
2762 goto done;
2763 }
2764 /*
2765 * fsw_ifp should still be valid at this point. If fsw is detached
2766 * after fsw_lock is released, this ifp will remain valid and
2767 * netif_transmit() will behave properly even if the ifp is in
2768 * detached state.
2769 */
2770 ifp = fsw->fsw_ifp;
2771
2772 /* batch allocate enough packets */
2773 dev_pp = na_kr_get_pp(dev_na, NR_TX);
2774
2775 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
2776 NULL, SKMEM_NOSLEEP);
2777 #if DEVELOPMENT || DEBUG
2778 if (__probable(err != ENOMEM)) {
2779 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
2780 }
2781 #endif /* DEVELOPMENT || DEBUG */
2782 if (__improbable(err == ENOMEM)) {
2783 ASSERT(KPKTQ_EMPTY(&dpktq));
2784 KPKTQ_CONCAT(&dropped_pkts, spktq);
2785 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
2786 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
2787 goto done;
2788 } else if (__improbable(err == EAGAIN)) {
2789 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
2790 (n_pkts - KPKTQ_LEN(&dpktq)));
2791 FSW_STATS_ADD(FSW_STATS_DROP,
2792 (n_pkts - KPKTQ_LEN(&dpktq)));
2793 }
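/*
 * On EAGAIN the allocation may be partial; e.g. if only 48 of 64
 * device packets were obtained, the loop below copies the first 48
 * source packets and leaves the remainder on spktq for the caller
 * to reclaim; the shortfall was accounted as drops above.
 */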
2794
2795 n_pkts = KPKTQ_LEN(&dpktq);
2796 prev_fe = NULL;
2797 KPKTQ_FOREACH(spkt, spktq) {
2798 if (n_pkts == 0) {
2799 break;
2800 }
2801 --n_pkts;
2802
2803 KPKTQ_DEQUEUE(&dpktq, pkt);
2804 ASSERT(pkt != NULL);
2805 err = dp_copy_to_dev(fsw, spkt, pkt);
2806 if (__improbable(err != 0)) {
2807 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2808 continue;
2809 }
2810
2811 af = fsw_ip_demux(fsw, pkt);
2812 if (__improbable(af == AF_UNSPEC)) {
2813 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
2814 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
2815 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2816 continue;
2817 }
2818
2819 err = flow_pkt_classify(pkt, ifp, af, false);
2820 if (__improbable(err != 0)) {
2821 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
2822 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
2823 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2824 continue;
2825 }
2826
2827 if (__improbable(pkt->pkt_flow_ip_is_frag &&
2828 !pkt->pkt_flow_ip_is_first_frag)) {
2829 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
2830 if (__probable(fe != NULL)) {
2831 flow_entry_retain(fe);
2832 goto flow_batch;
2833 } else {
2834 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2835 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2836 continue;
2837 }
2838 }
2839
2840 fe = tx_lookup_flow(fsw, pkt, prev_fe);
2841 if (__improbable(fe == NULL)) {
2842 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
2843 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
2844 prev_fe = NULL;
2845 continue;
2846 }
2847 flow_batch:
2848 tx_flow_batch_packet(&fes, fe, pkt);
2849 prev_fe = fe;
2850 }
2851
2852 struct flow_entry *tfe = NULL;
2853 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
2854 tx_flow_process(fsw, fe);
2855 TAILQ_REMOVE(&fes, fe, fe_tx_link);
2856 fe->fe_tx_is_cont_frag = false;
2857 fe->fe_tx_frag_id = 0;
2858 flow_entry_release(&fe);
2859 n_flows++;
2860 }
2861
2862 done:
2863 FSW_RUNLOCK(fsw);
2864 if (n_flows > 0) {
2865 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
2866 }
2867 dp_drop_pktq(fsw, &dropped_pkts);
2868 KPKTQ_FINI(&dropped_pkts);
2869 KPKTQ_FINI(&dpktq);
2870 }
2871
2872 static inline void
2873 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2874 struct proc *p)
2875 {
2876 #pragma unused(p)
2877 uint32_t total_pkts = 0, total_bytes = 0;
2878
2879 for (;;) {
2880 struct pktq pktq;
2881 KPKTQ_INIT(&pktq);
2882 uint32_t n_bytes;
2883 fsw_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
2884 if (n_bytes == 0) {
2885 break;
2886 }
2887 total_pkts += KPKTQ_LEN(&pktq);
2888 total_bytes += n_bytes;
2889
2890 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
2891 dp_rx_pktq(fsw, &pktq);
2892 } else {
2893 fsw_dev_input_netem_enqueue(fsw, &pktq);
2894 }
2895 KPKTQ_FINI(&pktq);
2896 }
2897
2898 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
2899 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
2900 uint32_t, total_bytes);
2901
2902 /* compute mitigation rate for delivered traffic */
2903 if (__probable(r->ckr_netif_mit_stats != NULL)) {
2904 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
2905 }
2906 }
2907
2908 static inline void
2909 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2910 struct proc *p)
2911 {
2912 #pragma unused(p)
2913 static packet_trace_id_t trace_id = 0;
2914 uint32_t total_pkts = 0, total_bytes = 0;
2915
2916 for (;;) {
2917 struct pktq pktq;
2918 KPKTQ_INIT(&pktq);
2919 uint32_t n_bytes;
2920 fsw_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes);
2921 if (n_bytes == 0) {
2922 break;
2923 }
2924 total_pkts += KPKTQ_LEN(&pktq);
2925 total_bytes += n_bytes;
2926
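/* only the head packet of each dequeued batch carries a trace id */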
2927 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
2928 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, KPKTQ_FIRST(&pktq)->pkt_trace_id);
2929
2930 dp_tx_pktq(fsw, &pktq);
2931 dp_free_pktq(fsw, &pktq);
2932 KPKTQ_FINI(&pktq);
2933 }
2934
2935 kr_update_stats(r, total_pkts, total_bytes);
2936
2937 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
2938 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
2939 uint32_t, total_bytes);
2940 }
2941
2942 void
2943 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
2944 struct proc *p)
2945 {
2946 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
2947
2948 ASSERT(sk_is_sync_protected());
2949 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
2950 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
2951
2952 if (vpna->vpna_nx_port == FSW_VP_DEV) {
2953 fsw_dev_ring_flush(fsw, r, p);
2954 } else {
2955 fsw_user_ring_flush(fsw, r, p);
2956 }
2957 }
2958
2959 int
2960 fsw_dp_ctor(struct nx_flowswitch *fsw)
2961 {
2962 uint32_t fe_cnt = fsw_fe_table_size;
2963 uint32_t fob_cnt = fsw_flow_owner_buckets;
2964 uint32_t frb_cnt = fsw_flow_route_buckets;
2965 uint32_t frib_cnt = fsw_flow_route_id_buckets;
2966 struct kern_nexus *nx = fsw->fsw_nx;
2967 char name[64];
2968 int error = 0;
2969
2970 /* just in case */
2971 if (fe_cnt == 0) {
2972 fe_cnt = NX_FSW_FE_TABLESZ;
2973 ASSERT(fe_cnt != 0);
2974 }
2975 if (fob_cnt == 0) {
2976 fob_cnt = NX_FSW_FOB_HASHSZ;
2977 ASSERT(fob_cnt != 0);
2978 }
2979 if (frb_cnt == 0) {
2980 frb_cnt = NX_FSW_FRB_HASHSZ;
2981 ASSERT(frb_cnt != 0);
2982 }
2983 if (frib_cnt == 0) {
2984 frib_cnt = NX_FSW_FRIB_HASHSZ;
2985 ASSERT(frib_cnt != 0);
2986 }
2987
2988 /* make sure fe_cnt is a power of two, else round up */
2989 if ((fe_cnt & (fe_cnt - 1)) != 0) {
2990 fe_cnt--;
2991 fe_cnt |= (fe_cnt >> 1);
2992 fe_cnt |= (fe_cnt >> 2);
2993 fe_cnt |= (fe_cnt >> 4);
2994 fe_cnt |= (fe_cnt >> 8);
2995 fe_cnt |= (fe_cnt >> 16);
2996 fe_cnt++;
2997 }
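/*
 * The shift-and-OR cascade above smears the highest set bit into
 * every lower position; e.g. fe_cnt 5000: 4999 (0b1001110000111)
 * smears to 8191 (0b1111111111111), and the final increment yields
 * 8192, the next power of two.
 */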
2998
2999 /* make sure frb_cnt is a power of two, else round up */
3000 if ((frb_cnt & (frb_cnt - 1)) != 0) {
3001 frb_cnt--;
3002 frb_cnt |= (frb_cnt >> 1);
3003 frb_cnt |= (frb_cnt >> 2);
3004 frb_cnt |= (frb_cnt >> 4);
3005 frb_cnt |= (frb_cnt >> 8);
3006 frb_cnt |= (frb_cnt >> 16);
3007 frb_cnt++;
3008 }
3009
3010 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
3011 &nexus_lock_attr);
3012 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
3013 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
3014 TAILQ_INIT(&fsw->fsw_linger_head);
3015
3016 (void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
3017 error = nx_advisory_alloc(nx, name,
3018 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
3019 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
3020 if (error != 0) {
3021 fsw_dp_dtor(fsw);
3022 return error;
3023 }
3024
3025 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
3026 if (fsw->fsw_flow_mgr == NULL) {
3027 fsw_dp_dtor(fsw);
3028 return ENOMEM;	/* error is still 0 here; report the failure */
3029 }
3030
3031 flow_mgr_setup_host_flow(fsw->fsw_flow_mgr, fsw);
3032
3033 /* generic name; will be customized upon ifattach */
3034 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
3035 FSW_REAP_THREADNAME, name, "");
3036
3037 if (kernel_thread_start(fsw_reap_thread_func, fsw,
3038 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
3039 panic_plain("%s: can't create thread", __func__);
3040 /* NOTREACHED */
3041 __builtin_unreachable();
3042 }
3043 /* this must not fail */
3044 VERIFY(fsw->fsw_reap_thread != NULL);
3045
3046 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
3047
3048
3049 return error;
3050 }
3051
3052 void
3053 fsw_dp_dtor(struct nx_flowswitch *fsw)
3054 {
3055 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
3056 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
3057 uint32_t i = 0;
3058
3059 nx_advisory_free(fsw->fsw_nx);
3060
3061 if (fsw->fsw_reap_thread != THREAD_NULL) {
3062 /* signal thread to begin self-termination */
3063 lck_mtx_lock(&fsw->fsw_reap_lock);
3064 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
3065
3066 /*
3067 * And wait for the thread to terminate; we use a
3068 * wait channel other than fsw_reap_flags to
3069 * make it more explicit. In the event the reaper
3070 * thread misses a wakeup, we'll try again once
3071 * every second (except for the first time).
3072 */
3073 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
3074 uint64_t t = 0;
3075
3076 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
3077 clock_absolutetime_interval_to_deadline(t, &t);
3078 ASSERT(t != 0);
3079
3080 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
3081 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
3082 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3083 }
3084 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
3085 THREAD_UNINT, t);
3086 lck_mtx_unlock(&fsw->fsw_reap_lock);
3087 thread_block(THREAD_CONTINUE_NULL);
3088 lck_mtx_lock(&fsw->fsw_reap_lock);
3089 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
3090 }
3091 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
3092 lck_mtx_unlock(&fsw->fsw_reap_lock);
3093 fsw->fsw_reap_thread = THREAD_NULL;
3094 }
3095
3096 /* free any remaining flow entries in the linger list */
3097 fsw_linger_purge(fsw);
3098
3099 if (fsw->fsw_flow_mgr != NULL) {
3100 flow_mgr_teardown_host_flow(fsw->fsw_flow_mgr);
3101 flow_mgr_destroy(fsw->fsw_flow_mgr);
3102 fsw->fsw_flow_mgr = NULL;
3103 }
3104
3105 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
3106 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
3107 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
3108 }
3109
3110 void
3111 fsw_linger_insert(struct flow_entry *fe)
3112 {
3113 struct nx_flowswitch *fsw = fe->fe_fsw;
3114 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3115 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
3116 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
3117 fe->fe_flags, FLOWENTF_BITS);
3118
3119 net_update_uptime();
3120
3121 ASSERT(flow_entry_refcnt(fe) >= 1);
3122 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
3123 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
3124 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
3125 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
3126 ASSERT(fe->fe_linger_wait != 0);
3127 fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
3128 atomic_bitset_32(&fe->fe_flags, FLOWENTF_LINGERING);
3129
3130 lck_mtx_lock_spin(&fsw->fsw_linger_lock);
3131 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
3132 fsw->fsw_linger_cnt++;
3133 VERIFY(fsw->fsw_linger_cnt != 0);
3134 lck_mtx_unlock(&fsw->fsw_linger_lock);
3135
3136 fsw_reap_sched(fsw);
3137 }
3138
3139 static void
3140 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
3141 struct flow_entry *fe)
3142 {
3143 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3144 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
3145 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
3146 fe->fe_flags, FLOWENTF_BITS);
3147
3148 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
3149 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
3150 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
3151 atomic_bitclear_32(&fe->fe_flags, FLOWENTF_LINGERING);
3152
3153 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
3154 flow_entry_release(&fe);
3155 }
3156
3157 static void
3158 fsw_linger_remove(struct flow_entry *fe)
3159 {
3160 struct nx_flowswitch *fsw = fe->fe_fsw;
3161
3162 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
3163
3164 fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
3165 VERIFY(fsw->fsw_linger_cnt != 0);
3166 fsw->fsw_linger_cnt--;
3167 }
3168
3169 void
3170 fsw_linger_purge(struct nx_flowswitch *fsw)
3171 {
3172 struct flow_entry *fe, *tfe;
3173
3174 lck_mtx_lock(&fsw->fsw_linger_lock);
3175 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
3176 fsw_linger_remove(fe);
3177 }
3178 ASSERT(fsw->fsw_linger_cnt == 0);
3179 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
3180 lck_mtx_unlock(&fsw->fsw_linger_lock);
3181 }
3182
3183 void
3184 fsw_reap_sched(struct nx_flowswitch *fsw)
3185 {
3186 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
3187 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
3188 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
3189 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
3190 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3191 }
3192 lck_mtx_unlock(&fsw->fsw_reap_lock);
3193 }
3194
3195 __attribute__((noreturn))
3196 static void
3197 fsw_reap_thread_func(void *v, wait_result_t w)
3198 {
3199 #pragma unused(w)
3200 struct nx_flowswitch *fsw = v;
3201
3202 ASSERT(fsw->fsw_reap_thread == current_thread());
3203 thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
3204
3205 net_update_uptime();
3206
3207 lck_mtx_lock(&fsw->fsw_reap_lock);
3208 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
3209 (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
3210 lck_mtx_unlock(&fsw->fsw_reap_lock);
3211 thread_block_parameter(fsw_reap_thread_cont, fsw);
3212 /* NOTREACHED */
3213 __builtin_unreachable();
3214 }
3215
3216 __attribute__((noreturn))
3217 static void
3218 fsw_reap_thread_cont(void *v, wait_result_t wres)
3219 {
3220 struct nx_flowswitch *fsw = v;
3221 boolean_t low;
3222 uint64_t t = 0;
3223
3224 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
3225
3226 lck_mtx_lock(&fsw->fsw_reap_lock);
3227 if (__improbable(wres == THREAD_INTERRUPTED ||
3228 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
3229 goto terminate;
3230 }
3231
3232 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
3233 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
3234 lck_mtx_unlock(&fsw->fsw_reap_lock);
3235
3236 net_update_uptime();
3237
3238 /* prevent detach from happening while we're here */
3239 if (!fsw_detach_barrier_add(fsw)) {
3240 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
3241 t = 0;
3242 } else {
3243 uint32_t fe_nonviable, fe_freed, fe_aborted;
3244 uint32_t fr_freed, fr_resid = 0;
3245 struct ifnet *ifp = fsw->fsw_ifp;
3246 uint64_t i = FSW_REAP_IVAL;
3247 uint64_t now = _net_uptime;
3248 uint64_t last;
3249
3250 ASSERT(fsw->fsw_ifp != NULL);
3251
3252 /*
3253 * Pass 1: process any deferred {withdrawn,nonviable} requests.
3254 */
3255 fe_nonviable = fsw_process_deferred(fsw);
3256
3257 /*
3258 * Pass 2: remove any expired lingering flows.
3259 */
3260 fe_freed = fsw_process_linger(fsw, &fe_aborted);
3261
3262 /*
3263 * Pass 3: prune idle flow routes.
3264 */
3265 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
3266 ifp, &fr_resid);
3267
3268 /*
3269 * Pass 4: prune the flow table.
3270 */
3272 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
3273
3274 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
3275 "fe_aborted %u fr_freed %u/%u",
3276 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
3277 (fe_nonviable + fsw->fsw_pending_nonviable),
3278 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
3279 (fe_freed + fr_resid));
3280
3281 /* see if VM memory level is critical */
3282 low = skmem_lowmem_check();
3283
3284 /*
3285 * If things appear to be idle, we can prune away cached
3286 * objects that have fallen out of the working sets (this
3287 * is different than purging). Every once in a while, we
3288 * also purge the caches. Note that this is done across
3289 * all flowswitch instances, and so we limit this to no
3290 * more than once every FSW_REAP_SK_THRES seconds.
3291 */
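/*
 * atomic_test_set_64() below is a compare-and-swap: of all reaper
 * threads (across flowswitch instances) that observed the same
 * `last' value, at most one advances fsw_reap_last to `now' and
 * gets to purge the caches.
 */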
3292 atomic_get_64(last, &fsw_reap_last);
3293 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
3294 atomic_test_set_64(&fsw_reap_last, last, now)) {
3295 fsw_purge_cache(fsw, low);
3296
3297 /* increase sleep interval if idle */
3298 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
3299 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
3300 i <<= 3;
3301 }
3302 } else if (last == 0) {
3303 atomic_set_64(&fsw_reap_last, now);
3304 }
3305
3306 /*
3307 * Additionally, run thru the list of channels and prune
3308 * or purge away cached objects on "idle" channels. This
3309 * check is rate limited to no more than once every
3310 * FSW_DRAIN_CH_THRES seconds.
3311 */
3312 last = fsw->fsw_drain_channel_chk_last;
3313 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
3314 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
3315 fsw->fsw_flow_mgr->fm_name);
3316
3317 fsw->fsw_drain_channel_chk_last = now;
3318 fsw_drain_channels(fsw, now, low);
3319 } else if (__improbable(last == 0)) {
3320 fsw->fsw_drain_channel_chk_last = now;
3321 }
3322
3323 /*
3324 * Finally, invoke the interface's reap callback to
3325 * tell it to prune or purge away cached objects if
3326 * it is idle. This check is rate limited to no more
3327 * than once every FSW_REAP_IF_THRES seconds.
3328 */
3329 last = fsw->fsw_drain_netif_chk_last;
3330 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
3331 ASSERT(fsw->fsw_nifna != NULL);
3332
3333 if (ifp->if_na_ops != NULL &&
3334 ifp->if_na_ops->ni_reap != NULL) {
3335 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
3336 fsw->fsw_flow_mgr->fm_name);
3337 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
3338 FSW_REAP_IF_THRES, low);
3339 }
3340
3341 fsw->fsw_drain_netif_chk_last = now;
3342 } else if (__improbable(last == 0)) {
3343 fsw->fsw_drain_netif_chk_last = now;
3344 }
3345
3346 /* emit periodic interface stats ktrace */
3347 last = fsw->fsw_reap_last;
3348 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
3349 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
3350 ifp->if_data.ifi_ibytes * 8,
3351 ifp->if_data.ifi_opackets,
3352 ifp->if_data.ifi_obytes * 8);
3353
3354 fsw->fsw_reap_last = now;
3355 } else if (__improbable(last == 0)) {
3356 fsw->fsw_reap_last = now;
3357 }
3358
3359 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
3360 clock_absolutetime_interval_to_deadline(t, &t);
3361 ASSERT(t != 0);
3362
3363 /* allow any pending detach to proceed */
3364 fsw_detach_barrier_remove(fsw);
3365 }
3366
3367 lck_mtx_lock(&fsw->fsw_reap_lock);
3368 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
3369 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
3370 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
3371 THREAD_UNINT, t);
3372 lck_mtx_unlock(&fsw->fsw_reap_lock);
3373 thread_block_parameter(fsw_reap_thread_cont, fsw);
3374 /* NOTREACHED */
3375 __builtin_unreachable();
3376 } else {
3377 terminate:
3378 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
3379 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
3380 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
3381 /*
3382 * And signal any thread waiting for us to terminate;
3383 * we use a wait channel other than fsw_reap_flags to
3384 * make it more explicit.
3385 */
3386 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
3387 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
3388 }
3389 lck_mtx_unlock(&fsw->fsw_reap_lock);
3390
3391 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
3392
3393 /* for the extra refcnt from kernel_thread_start() */
3394 thread_deallocate(current_thread());
3395 /* this is the end */
3396 thread_terminate(current_thread());
3397 /* NOTREACHED */
3398 __builtin_unreachable();
3399 }
3400
3401 /* must never get here */
3402 VERIFY(0);
3403 /* NOTREACHED */
3404 __builtin_unreachable();
3405 }
3406
3407 static void
3408 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
3409 {
3410 struct kern_nexus *nx = fsw->fsw_nx;
3411
3412 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
3413 FSW_RLOCK(fsw);
3414
3415 /* uncrustify doesn't handle C blocks properly */
3416 /* BEGIN IGNORE CODESTYLE */
3417 nx_port_foreach(nx, ^(nexus_port_t p) {
3418 struct nexus_adapter *na = nx_port_get_na(nx, p);
3419 if (na == NULL || na->na_work_ts == 0 ||
3420 (now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
3421 return;
3422 }
3423
3424 /*
3425 * If NA has been inactive for some time (twice the drain
3426 * threshold), we clear the work timestamp to temporarily skip
3427 * this channel until it's active again. Purging cached objects
3428 * can be expensive since we'd need to allocate and construct
3429 * them again, so we do it only when necessary.
3430 */
3431 boolean_t purge;
3432 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
3433 na->na_work_ts = 0;
3434 purge = TRUE;
3435 } else {
3436 purge = FALSE;
3437 }
3438
3439 na_drain(na, purge); /* purge/prune caches */
3440 });
3441 /* END IGNORE CODESTYLE */
3442
3443 FSW_RUNLOCK(fsw);
3444 }
3445
static void
fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
{
#pragma unused(fsw)
    uint64_t o = atomic_add_64_ov(&fsw_want_purge, 1);
    uint32_t p = fsw_flow_purge_thresh;
    boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));

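    /*
     * Escalate from a prune to a full purge on every
     * fsw_flow_purge_thresh-th pass, or unconditionally when the
     * system is under memory pressure ("low").  Note that fsw is
     * referenced only by the debug logging below, hence the unused
     * pragma for builds where such logging is compiled out.
     */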
    SK_DF(SK_VERB_FLOW, "%s: %s caches",
        fsw->fsw_flow_mgr->fm_name,
        (purge ? "purge" : "prune"));

    skmem_cache_reap_now(sk_fo_cache, purge);
    skmem_cache_reap_now(sk_fe_cache, purge);
    skmem_cache_reap_now(sk_fab_cache, purge);
    skmem_cache_reap_now(flow_route_cache, purge);
    skmem_cache_reap_now(flow_stats_cache, purge);
    eventhandler_reap_caches(purge);
    netns_reap_caches(purge);
    skmem_reap_caches(purge);
    necp_client_reap_caches(purge);

    if (if_is_fsw_transport_netagent_enabled() && purge) {
        mbuf_drain(FALSE);
    }
}

static void
fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
    /* When the interface is in low power mode, the flow is nonviable */
    if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
        atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
        atomic_add_32(&fsw->fsw_pending_nonviable, 1);
    }
}

static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
    struct flow_entry_dead sfed __sk_aligned(8);
    struct flow_mgr *fm = fsw->fsw_flow_mgr;
    struct flow_entry_dead *fed, *tfed;
    LIST_HEAD(, flow_entry_dead) fed_head =
        LIST_HEAD_INITIALIZER(fed_head);
    uint32_t i, nonviable = 0;
    boolean_t lowpowermode = FALSE;

    bzero(&sfed, sizeof(sfed));

    /*
     * Flows become nonviable when the interface enters low power
     * mode; comparing generation counts ensures this fires only
     * once per transition (edge trigger).
     */
    if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
        fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
        lowpowermode = TRUE;
        fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
    }

    /*
     * Scan through the flow entry tree, and commit any pending
     * withdraw or nonviable requests.  We may need to push stats
     * and/or unassign the nexus from NECP, but we cannot do that
     * while holding the locks; build a temporary list of those
     * entries.
     */
    for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
        struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
        struct flow_owner *fo;

        /*
         * Grab the lock at all costs when handling low power mode.
         */
        if (__probable(!lowpowermode)) {
            if (!FOB_TRY_LOCK(fob)) {
                continue;
            }
        } else {
            FOB_LOCK(fob);
        }

        FOB_LOCK_ASSERT_HELD(fob);
        RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
            struct flow_entry *fe;

            RB_FOREACH(fe, flow_entry_id_tree,
                &fo->fo_flow_entry_id_head) {
                /* try first as reader; skip if we can't */
                if (__improbable(lowpowermode)) {
                    fsw_flow_handle_low_power(fsw, fe);
                }
                if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
                    atomic_bitclear_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
                    flow_namespace_half_close(&fe->fe_port_reservation);
                }

                /* if not withdrawn/nonviable, skip */
                if (!fe->fe_want_withdraw &&
                    !fe->fe_want_nonviable) {
                    continue;
                }
                /*
                 * Here we're holding the lock as writer;
                 * don't spend too much time, as we're
                 * blocking the data path now.
                 */
                ASSERT(!uuid_is_null(fe->fe_uuid));
                /* only need the flow UUID and booleans */
                uuid_copy(sfed.fed_uuid, fe->fe_uuid);
                sfed.fed_want_clonotify =
                    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
                sfed.fed_want_nonviable = fe->fe_want_nonviable;
                flow_entry_teardown(fo, fe);

                /* queue it up; processed outside the flow bucket lock */
                fed = flow_entry_dead_alloc(Z_WAITOK);
                ASSERT(fed != NULL);
                *fed = sfed;
                LIST_INSERT_HEAD(&fed_head, fed, fed_link);
            }
        }
        FOB_UNLOCK(fob);
    }

    /*
     * These nonviable flows are no longer useful, since we've lost
     * the source IP address; in the event the client monitors the
     * viability of the flow, explicitly mark it as nonviable so
     * that a new flow can be created.
     */
    LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
        LIST_REMOVE(fed, fed_link);
        ASSERT(fsw->fsw_agent_session != NULL);

        /* if the flow was closed early, let NECP know */
        if (fed->fed_want_clonotify) {
            necp_client_early_close(fed->fed_uuid);
        }

        /* if nonviable, unassign nexus attributes */
        if (fed->fed_want_nonviable) {
            (void) netagent_assign_nexus(fsw->fsw_agent_session,
                fed->fed_uuid, NULL, 0);
        }

        flow_entry_dead_free(fed);
        ++nonviable;
    }
    ASSERT(LIST_EMPTY(&fed_head));

    return nonviable;
}

static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
    struct flow_entry_linger_head linger_head =
        TAILQ_HEAD_INITIALIZER(linger_head);
    struct flow_entry *fe, *tfe;
    uint64_t now = _net_uptime;
    uint32_t i = 0, cnt = 0, freed = 0;

    ASSERT(fsw->fsw_ifp != NULL);
    ASSERT(abort != NULL);
    *abort = 0;

    /*
     * We don't want to contend with the datapath, so move
     * everything that's in the linger list into a local list.
     * This allows us to generate RSTs or free the flow entries
     * outside the lock.  Any flow entry remaining in the local
     * list will get re-added to the head of the linger list,
     * in front of any new ones added since then.
     */
    lck_mtx_lock(&fsw->fsw_linger_lock);
    TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
    ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
    cnt = fsw->fsw_linger_cnt;
    fsw->fsw_linger_cnt = 0;
    lck_mtx_unlock(&fsw->fsw_linger_lock);

    TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
        ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
        ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
        ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

        /*
         * See if this is a TCP flow that needs to generate
         * an RST to the remote peer (if it hasn't already).
         */
        if (flow_track_tcp_want_abort(fe)) {
            VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
            ASSERT(!uuid_is_null(fe->fe_uuid));
            fsw_flow_abort_tcp(fsw, fe, NULL);
            (*abort)++;
            SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
            SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
                "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
                sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
                FLOWENTF_BITS);
        }

        /*
         * If the flow has expired, remove it from the list and
         * free it; otherwise leave it around in the linger list.
         */
        if (fe->fe_linger_expire <= now) {
            freed++;
            fsw_linger_remove_internal(&linger_head, fe);
            fe = NULL;
        }
        ++i;
    }
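    /*
     * Every entry moved onto the local list must have been visited
     * exactly once, and we cannot have freed more entries than we
     * dequeued.
     */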
    VERIFY(i == cnt && cnt >= freed);

    /*
     * Add any remaining ones back into the linger list.
     */
    lck_mtx_lock(&fsw->fsw_linger_lock);
    if (!TAILQ_EMPTY(&linger_head)) {
        ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
        TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
        ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
        TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
        fsw->fsw_linger_cnt += (cnt - freed);
    }
    ASSERT(TAILQ_EMPTY(&linger_head));
    lck_mtx_unlock(&fsw->fsw_linger_lock);

    return freed;
}

/* Send an RST for a given TCP flow; use @pkt as a template if given */
void
fsw_flow_abort_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
    struct flow_track *src, *dst;
    struct ip *ip;
    struct ip6_hdr *ip6;
    struct tcphdr *th;
    uint16_t len, tlen;
    struct mbuf *m;
    uint8_t ipver;

    /* guaranteed by caller */
    ASSERT(fsw->fsw_ifp != NULL);

    src = &fe->fe_ltrack;
    dst = &fe->fe_rtrack;

    if (pkt != NULL) {
        ipver = pkt->pkt_flow_ip_ver;
    } else {
        ipver = fe->fe_key.fk_ipver;
    }

    tlen = sizeof(struct tcphdr);
    if (ipver == IPVERSION) {
        len = sizeof(struct ip) + tlen;
    } else {
        ASSERT(ipver == IPV6_VERSION);
        len = sizeof(struct ip6_hdr) + tlen;
    }

    m = m_gethdr(M_WAITOK, MT_HEADER);
    VERIFY(m != NULL);

    m->m_pkthdr.pkt_proto = IPPROTO_TCP;
    m->m_data += max_linkhdr;    /* 32-bit aligned */
    m->m_pkthdr.len = m->m_len = len;

    /* zero out for checksum */
    bzero(m->m_data, len);

    if (ipver == IPVERSION) {
        ip = mtod(m, struct ip *);

        /* IP header fields included in the TCP checksum */
        ip->ip_p = IPPROTO_TCP;
        ip->ip_len = htons(tlen);
        if (pkt == NULL) {
            ip->ip_src = fe->fe_key.fk_src4;
            ip->ip_dst = fe->fe_key.fk_dst4;
        } else {
            ip->ip_src = pkt->pkt_flow_ipv4_src;
            ip->ip_dst = pkt->pkt_flow_ipv4_dst;
        }

        th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
    } else {
        ip6 = mtod(m, struct ip6_hdr *);

        /* IP header fields included in the TCP checksum */
        ip6->ip6_nxt = IPPROTO_TCP;
        ip6->ip6_plen = htons(tlen);
        if (pkt == NULL) {
            ip6->ip6_src = fe->fe_key.fk_src6;
            ip6->ip6_dst = fe->fe_key.fk_dst6;
        } else {
            ip6->ip6_src = pkt->pkt_flow_ipv6_src;
            ip6->ip6_dst = pkt->pkt_flow_ipv6_dst;
        }

        th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
    }

    /*
     * TCP header (fabricate a pure RST).
     */
    if (pkt == NULL) {
        th->th_sport = fe->fe_key.fk_sport;
        th->th_dport = fe->fe_key.fk_dport;
        th->th_seq = htonl(src->fse_seqlo);    /* peer's last ACK */
        th->th_ack = 0;
        th->th_flags = TH_RST;
    } else {
        th->th_sport = pkt->pkt_flow_tcp_src;
        th->th_dport = pkt->pkt_flow_tcp_dst;
        th->th_seq = pkt->pkt_flow_tcp_seq;
        th->th_ack = pkt->pkt_flow_tcp_ack;
        th->th_flags = pkt->pkt_flow_tcp_flags;
    }
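    /* th_off is expressed in units of 32-bit words, hence tlen >> 2 */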
    th->th_off = (tlen >> 2);
    th->th_win = 0;

    FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

    if (ipver == IPVERSION) {
        struct ip_out_args ipoa;
        struct route ro;

        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
        ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
            IPOAF_BOUND_SRCADDR);
        ipoa.ipoa_sotc = SO_TC_UNSPEC;
        ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

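        /*
         * Compute the TCP checksum while the rest of the IP header
         * is still zeroed: only the pseudo-header inputs (protocol,
         * length, source and destination addresses) have been filled
         * in above, so a plain in_cksum() over the whole mbuf yields
         * the TCP checksum with the pseudo-header included.
         */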
        /* TCP checksum */
        th->th_sum = in_cksum(m, len);

        ip->ip_v = IPVERSION;
        ip->ip_hl = sizeof(*ip) >> 2;
        ip->ip_tos = 0;
        /*
         * ip_output() expects ip_len and ip_off to be in host order.
         */
        ip->ip_len = len;
        ip->ip_off = IP_DF;
        ip->ip_ttl = (uint8_t)ip_defttl;
        ip->ip_sum = 0;

        bzero(&ro, sizeof(ro));
        (void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
        ROUTE_RELEASE(&ro);
    } else {
        struct ip6_out_args ip6oa;
        struct route_in6 ro6;

        bzero(&ip6oa, sizeof(ip6oa));
        ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
        ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
            IP6OAF_BOUND_SRCADDR);
        ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
        ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

        /* TCP checksum */
        th->th_sum = in6_cksum(m, IPPROTO_TCP,
            sizeof(struct ip6_hdr), tlen);

        ip6->ip6_vfc |= IPV6_VERSION;
        ip6->ip6_hlim = IPV6_DEFHLIM;

        ip6_output_setsrcifscope(m, fsw->fsw_ifp->if_index, NULL);
        ip6_output_setdstifscope(m, fsw->fsw_ifp->if_index, NULL);

        bzero(&ro6, sizeof(ro6));
        (void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
            NULL, NULL, &ip6oa);
        ROUTE_RELEASE(&ro6);
    }
}

void
fsw_flow_abort_quic(struct flow_entry *fe, uint8_t *token)
{
    struct quic_stateless_reset {
        uint8_t ssr_header[30];
        uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
    };
    struct nx_flowswitch *fsw = fe->fe_fsw;
    struct ip *ip;
    struct ip6_hdr *ip6;
    struct udphdr *uh;
    struct quic_stateless_reset *qssr;
    uint16_t len, l3hlen, ulen;
    struct mbuf *m;
    unsigned int one = 1;
    int error;

    /* guaranteed by caller */
    ASSERT(fsw->fsw_ifp != NULL);

    /* skip an all-zero token; treat it as "no token available" */
    bool is_zero_token = true;
    for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
        if (token[i] != 0) {
            is_zero_token = false;
            break;
        }
    }
    if (is_zero_token) {
        return;
    }

    ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
    if (fe->fe_key.fk_ipver == IPVERSION) {
        l3hlen = sizeof(struct ip);
    } else {
        ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
        l3hlen = sizeof(struct ip6_hdr);
    }

    len = l3hlen + ulen;

    error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
    if (error != 0) {
        return;
    }
    VERIFY(m != NULL);

    m->m_pkthdr.pkt_proto = IPPROTO_UDP;
    m->m_data += max_linkhdr;    /* 32-bit aligned */
    m->m_pkthdr.len = m->m_len = len;

    /* zero out for checksum */
    bzero(m->m_data, len);

    if (fe->fe_key.fk_ipver == IPVERSION) {
        ip = mtod(m, struct ip *);
        ip->ip_p = IPPROTO_UDP;
        ip->ip_len = htons(ulen);
        ip->ip_src = fe->fe_key.fk_src4;
        ip->ip_dst = fe->fe_key.fk_dst4;
        uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
    } else {
        ip6 = mtod(m, struct ip6_hdr *);
        ip6->ip6_nxt = IPPROTO_UDP;
        ip6->ip6_plen = htons(ulen);
        ip6->ip6_src = fe->fe_key.fk_src6;
        ip6->ip6_dst = fe->fe_key.fk_dst6;
        uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
    }

    /* UDP header */
    uh->uh_sport = fe->fe_key.fk_sport;
    uh->uh_dport = fe->fe_key.fk_dport;
    uh->uh_ulen = htons(ulen);

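    /*
     * A stateless reset (RFC 9000, Section 10.3) must be
     * indistinguishable from a regular short-header packet: the
     * first byte has the header-form bit (0x80) cleared and the
     * fixed bit (0x40) set, the remainder of the header is random,
     * and the reset token occupies the final bytes of the datagram.
     */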
    /* QUIC stateless reset */
    qssr = (struct quic_stateless_reset *)(uh + 1);
    read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
    qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
    memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);

    FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

    if (fe->fe_key.fk_ipver == IPVERSION) {
        struct ip_out_args ipoa;
        struct route ro;

        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
        ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
            IPOAF_BOUND_SRCADDR);
        ipoa.ipoa_sotc = SO_TC_UNSPEC;
        ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

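        /*
         * If the computed checksum is zero, transmit it as 0xffff;
         * an on-the-wire UDP checksum of zero means "no checksum"
         * (RFC 768).
         */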
        uh->uh_sum = in_cksum(m, len);
        if (uh->uh_sum == 0) {
            uh->uh_sum = 0xffff;
        }

        ip->ip_v = IPVERSION;
        ip->ip_hl = sizeof(*ip) >> 2;
        ip->ip_tos = 0;
        /*
         * ip_output() expects ip_len and ip_off to be in host order.
         */
        ip->ip_len = len;
        ip->ip_off = IP_DF;
        ip->ip_ttl = (uint8_t)ip_defttl;
        ip->ip_sum = 0;

        bzero(&ro, sizeof(ro));
        (void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
        ROUTE_RELEASE(&ro);
    } else {
        struct ip6_out_args ip6oa;
        struct route_in6 ro6;

        bzero(&ip6oa, sizeof(ip6oa));
        ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
        ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
            IP6OAF_BOUND_SRCADDR);
        ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
        ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

        uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
            ulen);
        if (uh->uh_sum == 0) {
            uh->uh_sum = 0xffff;
        }

        ip6->ip6_vfc |= IPV6_VERSION;
        ip6->ip6_hlim = IPV6_DEFHLIM;
        ip6_output_setsrcifscope(m, fsw->fsw_ifp->if_index, NULL);
        ip6_output_setdstifscope(m, fsw->fsw_ifp->if_index, NULL);

        bzero(&ro6, sizeof(ro6));
        (void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
            NULL, NULL, &ip6oa);
        ROUTE_RELEASE(&ro6);
    }
}

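/*
 * Per-traffic-class packet and byte accounting on the interface.
 * The inbound variant classifies a single packet via its packet
 * handle; the outbound variant takes pre-aggregated packet and
 * byte counts for the given service class.
 */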
__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
{
    switch (__packet_get_traffic_class(ph)) {
    case PKT_TC_BE:
        ifp->if_tc.ifi_ibepackets++;
        ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
        break;
    case PKT_TC_BK:
        ifp->if_tc.ifi_ibkpackets++;
        ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
        break;
    case PKT_TC_VI:
        ifp->if_tc.ifi_ivipackets++;
        ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
        break;
    case PKT_TC_VO:
        ifp->if_tc.ifi_ivopackets++;
        ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
        break;
    default:
        break;
    }
}

__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
    uint32_t cnt, uint32_t len)
{
    switch (svc) {
    case PKT_TC_BE:
        ifp->if_tc.ifi_obepackets += cnt;
        ifp->if_tc.ifi_obebytes += len;
        break;
    case PKT_TC_BK:
        ifp->if_tc.ifi_obkpackets += cnt;
        ifp->if_tc.ifi_obkbytes += len;
        break;
    case PKT_TC_VI:
        ifp->if_tc.ifi_ovipackets += cnt;
        ifp->if_tc.ifi_ovibytes += len;
        break;
    case PKT_TC_VO:
        ifp->if_tc.ifi_ovopackets += cnt;
        ifp->if_tc.ifi_ovobytes += len;
        break;
    default:
        break;
    }
}
