1 /*
2 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54 /*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <sys/kdebug.h>
94 #include <sys/sdt.h>
95 #include <net/bpf.h>
96 #include <net/if_ports_used.h>
97 #include <net/pktap.h>
98 #include <net/pktsched/pktsched_netem.h>
99 #include <netinet/tcp.h>
100 #include <netinet/udp.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103
extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX 256
#define FSW_ZONE_NAME "skywalk.nx.fsw"

/* timestamp of the last reaper pass, and a pending global purge request */
static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

/*
 * Tunables below are exposed as RW sysctls on DEVELOPMENT/DEBUG kernels
 * (see the SYSCTL_UINT declarations further down); on RELEASE kernels
 * they stay at their compile-time defaults.
 */
#define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ 13 /* some mersenne prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

/*
 * Reap interval clamped to at least 1 second; the _THRES values below
 * are derived thresholds (in reap intervals, << 5 == x32) for skmem,
 * interface and channel housekeeping.
 */
#define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES 1

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
156 /*
157 * IP reassembly
158 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
159 * enable/disable the reassembly routine regardless of whether the
160 * transport netagent is enabled or not.
161 *
162 * 'fsw_ip_reass' is a tri-state:
163 * 0 means force IP reassembly off
164 * 1 means force IP reassembly on
165 * 2 means don't force the value, use what's appropriate for this flowswitch
166 */
167 #define FSW_IP_REASS_FORCE_OFF 0
168 #define FSW_IP_REASS_FORCE_ON 1
169 #define FSW_IP_REASS_NO_FORCE 2
170
171 uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
172
173 static int
174 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
175 {
176 #pragma unused(oidp, arg1, arg2)
177 unsigned int new_value;
178 int changed;
179 int error;
180
181 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
182 &new_value, &changed);
183 if (error == 0 && changed != 0) {
184 if (new_value > FSW_IP_REASS_NO_FORCE) {
185 return EINVAL;
186 }
187 fsw_ip_reass = new_value;
188 }
189 return error;
190 }
191
192 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
193 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
194 0, 0, fsw_ip_reass_sysctl, "IU",
195 "adjust flowswitch IP reassembly");
196
#if (DEVELOPMENT || DEBUG)
/*
 * Error-injection support: each bit of the fsw_inject_error sysctl
 * enables one numbered injection point (_en) scattered through this
 * file; _SK_INJECT_ERROR overwrites a value and bumps the injection
 * stat when its bit is set.
 */
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
    _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
    &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

/* variant that only invokes the callback (_f); no value is overwritten */
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
		SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
		if ((_f) != NULL) \
			(_f)(__VA_ARGS__); \
	} \
} while (0)

/* debug-only knobs for the table/bucket sizes and reap cadence above */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
#else
/* RELEASE kernels: error injection compiles away entirely */
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */
230
231 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
232 struct flow_entry *);
233 static void fsw_reap_thread_func(void *, wait_result_t);
234 static void fsw_reap_thread_cont(void *, wait_result_t);
235 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
236 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
237 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
238 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
239
240 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
241 struct __kern_packet *);
242
243 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
244 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
245 uint32_t, uint32_t);
246
247 static int __fsw_dp_inited = 0;
248
/*
 * One-time initialization of the flowswitch datapath: compile-time
 * sanity checks on the reserved nexus port layout, then flow manager
 * and flow module setup.  Must be balanced by fsw_dp_uninit().
 * Always returns 0.
 */
int
fsw_dp_init(void)
{
	/* dev and host vp ports must occupy the lowest, reserved slots */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	/* double-initialization is a programming error */
	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}
266
267 void
fsw_dp_uninit(void)268 fsw_dp_uninit(void)
269 {
270 if (__fsw_dp_inited) {
271 flow_fini();
272 flow_mgr_fini();
273
274 __fsw_dp_inited = 0;
275 }
276 }
277
/*
 * Free every packet in pktq back to its packet pool.  The fsw argument
 * is unused; it is kept so the signature matches the dp_drop_pktq()
 * macro below, which passes it through.
 */
static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}
283
/*
 * Drop (account and free) every packet in pktq.  CAUTION: expands to a
 * bare `return;` when the queue is empty, so this macro may only be
 * used inside functions returning void.
 */
#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
		ASSERT(_len == 0); \
		return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)
295
/*
 * Feed the packets batched on a flow entry (fe_rx_pktq on input,
 * fe_tx_pktq on output) to pktap, attributed to the flow's owning
 * process and effective process.  Host-port traffic is normally
 * tapped in the DLIL input path instead; the only exception made here
 * is inbound TCP, which is tapped early (before RX aggregation).
 */
SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	/* map the flow key's IP version to an address family for pktap */
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	/* snapshot owner/effective-owner identity recorded on the flow */
	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		/*
		 * NOTE(review): the protocol argument is hard-coded to
		 * IPPROTO_TCP on this path even for non-host-port flows of
		 * other protocols -- confirm pktap only consumes it for the
		 * tap_early (TCP) case.
		 */
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}
358
359 #if (DEVELOPMENT || DEBUG)
360 static void
_fsw_error35_handler(int step,struct flow_route * fr,struct __kern_packet * pkt,int * ret)361 _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
362 int *ret)
363 {
364 static boolean_t _err35_flag_modified = FALSE;
365
366 switch (step) {
367 case 1:
368 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
369 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
370 fr->fr_flags &= ~FLOWRTF_RESOLVED;
371 _err35_flag_modified = TRUE;
372 }
373 break;
374
375 case 2:
376 if (!_err35_flag_modified) {
377 return;
378 }
379 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
380 m_freem(pkt->pkt_mbuf);
381 pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
382 pkt->pkt_mbuf = NULL;
383 }
384 *ret = EJUSTRETURN;
385 fr->fr_flags |= FLOWRTF_RESOLVED;
386 _err35_flag_modified = FALSE;
387 break;
388
389 default:
390 VERIFY(0);
391 /* not reached */
392 }
393 }
394
395 static void
_fsw_error36_handler(int step,struct flow_route * fr,int * ret)396 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
397 {
398 static boolean_t _err36_flag_modified = FALSE;
399
400 switch (step) {
401 case 1:
402 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
403 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
404 fr->fr_flags &= ~FLOWRTF_RESOLVED;
405 _err36_flag_modified = TRUE;
406 }
407 break;
408
409 case 2:
410 if (!_err36_flag_modified) {
411 return;
412 }
413 *ret = ENETUNREACH;
414 fr->fr_flags |= FLOWRTF_RESOLVED;
415 _err36_flag_modified = FALSE;
416 break;
417
418 default:
419 VERIFY(0);
420 /* not reached */
421 }
422 }
423 #else /* !DEVELOPMENT && !DEBUG */
424 #define _fsw_error35_handler(...)
425 #define _fsw_error36_handler(...)
426 #endif /* DEVELOPMENT || DEBUG */
427
428 /*
429 * Check if the source packet content can fit into the destination
430 * ring's packet. Returns TRUE if the source packet can fit.
431 * Note: Failures could be caused by misconfigured packet pool sizes,
432 * missing packet size check again MTU or if the source packet is from
433 * a compat netif and the attached mbuf is larger than MTU due to LRO.
434 */
435 static inline boolean_t
validate_pkt_len(struct __kern_packet * spkt,kern_packet_t dph,uint32_t skip_l2hlen,uint32_t l2hlen,uint16_t headroom,uint32_t * copy_len)436 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
437 uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
438 uint32_t *copy_len)
439 {
440 uint32_t tlen = 0;
441 uint32_t splen = spkt->pkt_length - skip_l2hlen;
442
443 if (l2hlen != 0) {
444 VERIFY(skip_l2hlen == 0);
445 tlen += l2hlen;
446 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
447 splen -= ETHER_CRC_LEN;
448 }
449
450 tlen += splen;
451 *copy_len = splen;
452
453 return tlen <= ((__packet_get_buflet_count(dph) *
454 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
455 headroom);
456 }
457
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	/* pick mbuf- vs native-copy verbosity based on the source backing */
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	/* hex-dump at most 128 bytes of the copied destination buffer */
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */
480
481
/*
 * Copy a packet arriving from the device (dev) nexus into a packet
 * allocated from the Rx (destination) pool, stripping the L2 header.
 * Consumes any mbuf attached to spkt on success.  Returns 0, or
 * EINVAL if the content cannot fit the destination buflets.
 */
static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	/* bail out (and count the event) if content can't fit the dst */
	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* L2 header is stripped; destination starts at the IP header */
	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	/* debug aid: optionally append trailer bytes to the payload */
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		/* mbuf-backed source: account traffic class, free the mbuf */
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}
582
/*
 * Run IPv4/IPv6 reassembly on a received fragment, honoring the
 * fsw_ip_reass override sysctl.  Returns the original packet when
 * reassembly is skipped, the reassembled chain when complete, or NULL
 * when the fragment was either consumed (presumably queued awaiting
 * its siblings -- see the XOR assert below) or dropped as invalid.
 */
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		/* no netagent or no flows: pass fragments through untouched */
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	/* locate the L3 header past the headroom and L2 header */
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		/* on success: pkt != NULL if and only if nfrags > 0 */
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}
632
/*
 * Prepare a compat-netif packet (valid metadata plus an attached mbuf
 * only) for classification: copy the leading protocol headers from
 * the mbuf into the packet buffer, then finalize the packet with the
 * mbuf still attached.
 */
SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification.
	 * Ethernet(14) + IPv6 header(40) +
	 * + IPv6 fragment header(8) +
	 * TCP header with options(60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	/* finalize while keeping the mbuf attached for the later full copy */
	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}
669
670 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)671 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
672 {
673 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
674
675 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
676 rx_prepare_packet_mbuf(fsw, pkt);
677 }
678
679 return pkt;
680 }
681
/*
 * Find the flow entry for a packet, input (Rx) or output (Tx).
 * `prev_fe` is an optional hint: when the previous packet's entry
 * carries a full 5-tuple key, a direct key compare avoids the flow
 * table lookup.  On success a retained entry is returned (caller must
 * release); NULL if no flow matches.
 */
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	/* fast path: compare against the previous lookup's 5-tuple key */
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			/* hint missed; restore the key for the table lookup */
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	/* resolve parent/child (demuxed) flows to the exact child entry */
	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENT_PARENT | FLOWENT_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENT_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					/* swap parent ref for the child's */
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				/* child entry failed demux: redo the lookup */
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENT_PARENT)) {
					struct flow_entry *parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					/* child with wrong uuid: redo lookup */
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}
752
753 static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch * fsw,struct __kern_packet * pkt,struct flow_entry * prev_fe)754 rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
755 struct flow_entry *prev_fe)
756 {
757 struct flow_entry *fe;
758
759 fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
760 _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
761 if (fe == NULL) {
762 FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
763 return NULL;
764 }
765
766 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
767 FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
768 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
769 "Rx flow torn down");
770 flow_entry_release(&fe);
771 fe = NULL;
772 }
773
774 return fe;
775 }
776
/*
 * Queue a classified Rx packet (chain) on its flow entry.  The first
 * time the entry's Rx queue goes non-empty, the entry is parked on the
 * batch list `fes`, transferring the caller's reference on fe to that
 * list; for subsequent packets the list already owns a reference, so
 * the caller's extra reference is dropped here.
 */
static inline void
rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
	} else {
		/* list already holds a reference on fe; drop the caller's */
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		flow_entry_release(&fe);
	}
}
796
/*
 * Queue a classified Tx packet on its flow entry, tracking IP fragment
 * continuation state, and park the entry on the batch list `fes` the
 * first time its Tx queue goes non-empty (transferring the caller's
 * reference to the list; later packets drop their extra reference).
 */
static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		/* non-fragment: clear any pending continuation state */
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}
	/* (non-first fragments leave the continuation state untouched) */

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		/* list already holds a reference on fe; drop the caller's */
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}
821
/*
 * Detach up to n_pkts_max packets from channel ring r (slots
 * [ckr_khead, ckr_rhead)) and collect them on pktq; the total byte
 * count is returned via n_bytes.  Packets flagged QUM_F_DROPPED or
 * with zero length are freed and counted as drops.  Ring indices are
 * updated to reflect the consumed slots.
 */
static inline void
fsw_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;

	KPKTQ_INIT(pktq);

	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));

	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		/* injection point 20: pretend the packet was marked dropped */
		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		/* tag packet with its source port for channel-event delivery */
		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
			    vpna->vpna_nx_port, vpna->vpna_gencnt);
		}

		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}

	/* khead advances past consumed slots; ktail is set one slot behind */
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}
866
/*
 * Attach as many packets from pktq as there are free Rx ring slots,
 * finalize each, update ring stats, publish the new tail (behind a
 * memory barrier so slot contents are visible first), and notify the
 * channel.  Packets that do not fit remain on pktq for the caller to
 * deal with.
 */
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	/* injection points 40/41: pretend the ring is full */
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		/* trace the fsw->channel handoff for instrumented packets */
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	membar_sync();

	r->ckr_ktail = idx_end;

	kr_exit(r);

	/* wake up the channel consumer */
	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}
919
920 static void
pkts_to_pktq(struct __kern_packet * pkts[],uint32_t n_pkts,struct pktq * pktq)921 pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
922 {
923 ASSERT(KPKTQ_EMPTY(pktq));
924
925 for (uint32_t i = 0; i < n_pkts; i++) {
926 struct __kern_packet *pkt = pkts[i];
927 ASSERT(pkt->pkt_nextpkt == NULL);
928 KPKTQ_ENQUEUE(pktq, pkt);
929 }
930 }
931
932 /*
933 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
934 */
935 SK_NO_INLINE_ATTRIBUTE
936 static void
convert_native_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_headp,struct mbuf ** m_tailp,uint32_t * cnt,uint32_t * bytes)937 convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
938 struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
939 {
940 uint32_t tot_cnt;
941 unsigned int num_segs = 1;
942 struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head;
943 uint32_t mhead_cnt, mhead_bufsize;
944 uint32_t mhead_waste = 0;
945 uint32_t mcnt = 0, mbytes = 0;
946 uint32_t largest, max_pkt_len;
947 struct __kern_packet *pkt;
948 struct kern_pbufpool *pp;
949
950 tot_cnt = KPKTQ_LEN(pktq);
951 ASSERT(tot_cnt > 0);
952 mhead_cnt = tot_cnt;
953
954 /*
955 * Opportunistically batch-allocate the mbufs based on the largest
956 * packet size we've seen in the recent past. Note that we reset
957 * fe_rx_largest_size below if we notice that we're under-utilizing the
958 * allocated buffers (thus disabling this batch allocation).
959 */
960 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
961 if (__probable(largest != 0)) {
962 if (largest <= MCLBYTES) {
963 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
964 &num_segs, M_NOWAIT, 1, 0);
965 mhead_bufsize = MCLBYTES;
966 } else if (largest <= MBIGCLBYTES) {
967 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
968 &num_segs, M_NOWAIT, 1, 0);
969 mhead_bufsize = MBIGCLBYTES;
970 } else if (largest <= M16KCLBYTES) {
971 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
972 &num_segs, M_NOWAIT, 1, 0);
973 mhead_bufsize = M16KCLBYTES;
974 } else if (largest <= M16KCLBYTES * 2) {
975 num_segs = 2;
976 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
977 &num_segs, M_NOWAIT, 1, 0);
978 mhead_bufsize = M16KCLBYTES * 2;
979 } else {
980 mhead = NULL;
981 mhead_bufsize = mhead_cnt = 0;
982 }
983 } else {
984 mhead = NULL;
985 mhead_bufsize = mhead_cnt = 0;
986 }
987 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
988 uint32_t, mhead_cnt, uint32_t, tot_cnt);
989
990 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
991 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
992
993 KPKTQ_FOREACH(pkt, pktq) {
994 uint32_t tot_len, len;
995 uint16_t pad, llhlen, iphlen;
996 boolean_t do_cksum_rx;
997 struct mbuf *m;
998 int error;
999
1000 llhlen = pkt->pkt_l2_len;
1001 len = pkt->pkt_length;
1002 if (__improbable(len > max_pkt_len || llhlen > len)) {
1003 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1004 struct __kern_packet *, pkt);
1005 FSW_STATS_INC(FSW_STATS_DROP);
1006 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1007 continue;
1008 }
1009 /* begin payload on 32-bit boundary; figure out the padding */
1010 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1011 tot_len = pad + len;
1012
1013 /* remember largest packet size */
1014 if (__improbable(largest < tot_len)) {
1015 largest = MAX(tot_len, MCLBYTES);
1016 }
1017
1018 /*
1019 * If the above batch allocation returned partial
1020 * success, we try a blocking allocation here again.
1021 */
1022 m = mhead;
1023 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1024 ASSERT(mhead != NULL || mhead_cnt == 0);
1025 num_segs = 1;
1026 if (tot_len > M16KCLBYTES) {
1027 num_segs = 0;
1028 }
1029 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1030 &num_segs, &m)) != 0) {
1031 DTRACE_SKYWALK2(bad__len,
1032 struct nx_flowswitch *, fsw,
1033 struct __kern_packet *, pkt);
1034 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1035 FSW_STATS_INC(FSW_STATS_DROP);
1036 continue;
1037 }
1038 } else {
1039 mhead = m->m_nextpkt;
1040 m->m_nextpkt = NULL;
1041 ASSERT(mhead_cnt != 0);
1042 --mhead_cnt;
1043
1044 /* check if we're underutilizing large buffers */
1045 if (__improbable(mhead_bufsize > MCLBYTES &&
1046 tot_len < (mhead_bufsize >> 1))) {
1047 ++mhead_waste;
1048 }
1049 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1050 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1051 }
1052 }
1053 m->m_data += pad;
1054 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
1055
1056 /* don't include IP header from partial sum */
1057 if (__probable((pkt->pkt_qum_qflags &
1058 QUM_F_FLOW_CLASSIFIED) != 0)) {
1059 iphlen = pkt->pkt_flow_ip_hlen;
1060 do_cksum_rx = sk_cksum_rx;
1061 } else {
1062 iphlen = 0;
1063 do_cksum_rx = FALSE;
1064 }
1065
1066 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1067 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1068 llhlen + iphlen);
1069
1070 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1071 if (do_cksum_rx) {
1072 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1073 }
1074 #if DEBUG || DEVELOPMENT
1075 if (__improbable(pkt_trailers > 0)) {
1076 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1077 }
1078 #endif /* DEBUG || DEVELOPMENT */
1079 m_adj(m, llhlen);
1080
1081 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1082 if (__improbable((pkt->pkt_link_flags &
1083 PKT_LINKF_ETHFCS) != 0)) {
1084 m->m_flags |= M_HASFCS;
1085 }
1086 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1087 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1088 }
1089 ASSERT(m->m_nextpkt == NULL);
1090 tail = m;
1091 *tailp = m;
1092 tailp = &m->m_nextpkt;
1093 mcnt++;
1094 mbytes += m_pktlen(m);
1095 }
1096 /* free any leftovers */
1097 if (__improbable(mhead != NULL)) {
1098 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1099 ASSERT(mhead_cnt != 0);
1100 (void) m_freem_list(mhead);
1101 mhead = NULL;
1102 mhead_cnt = 0;
1103 }
1104
1105 /* reset if most packets (>50%) are smaller than our batch buffers */
1106 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1107 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1108 struct flow_entry *, NULL, uint32_t, mhead_waste,
1109 uint32_t, tot_cnt);
1110 largest = 0;
1111 }
1112
1113 if (largest != fsw->fsw_rx_largest_size) {
1114 atomic_set_32(&fsw->fsw_rx_largest_size, largest);
1115 }
1116
1117 pp_free_pktq(pktq);
1118 *m_headp = head;
1119 *m_tailp = tail;
1120 *cnt = mcnt;
1121 *bytes = mbytes;
1122 }
1123
1124 /*
1125 * This function only extracts the mbuf from the packet. The caller frees
1126 * the packet.
1127 */
1128 static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * pkt)1129 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1130 {
1131 struct mbuf *m;
1132 struct pkthdr *mhdr;
1133 uint16_t llhlen;
1134
1135 m = pkt->pkt_mbuf;
1136 ASSERT(m != NULL);
1137
1138 llhlen = pkt->pkt_l2_len;
1139 if (llhlen > pkt->pkt_length) {
1140 m_freem(m);
1141 KPKT_CLEAR_MBUF_DATA(pkt);
1142 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1143 struct __kern_packet *, pkt);
1144 FSW_STATS_INC(FSW_STATS_DROP);
1145 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1146 return NULL;
1147 }
1148 mhdr = &m->m_pkthdr;
1149 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1150 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1151 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1152 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1153 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1154 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1155 }
1156 #if DEBUG || DEVELOPMENT
1157 uint32_t extra = 0;
1158 if (__improbable(pkt_trailers > 0)) {
1159 extra = pkt_add_trailers_mbuf(m, llhlen);
1160 }
1161 #endif /* DEBUG || DEVELOPMENT */
1162 m_adj(m, llhlen);
1163 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1164 KPKT_CLEAR_MBUF_DATA(pkt);
1165 return m;
1166 }
1167
1168 SK_NO_INLINE_ATTRIBUTE
1169 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_head,struct mbuf ** m_tail,uint32_t * cnt,uint32_t * bytes)1170 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1171 struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1172 {
1173 struct __kern_packet *pkt;
1174 struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
1175 uint32_t c = 0, b = 0;
1176
1177 KPKTQ_FOREACH(pkt, pktq) {
1178 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1179 if (__improbable(m == NULL)) {
1180 continue;
1181 }
1182 tail = m;
1183 *tailp = m;
1184 tailp = &m->m_nextpkt;
1185 c++;
1186 b += m_pktlen(m);
1187 }
1188 pp_free_pktq(pktq);
1189 *m_head = head;
1190 *m_tail = tail;
1191 *cnt = c;
1192 *bytes = b;
1193 }
1194
1195 void
fsw_host_sendup(ifnet_t ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes)1196 fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
1197 uint32_t cnt, uint32_t bytes)
1198 {
1199 struct ifnet_stat_increment_param s;
1200
1201 bzero(&s, sizeof(s));
1202 s.packets_in = cnt;
1203 s.bytes_in = bytes;
1204 dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1205 }
1206
1207 void
fsw_host_rx(struct nx_flowswitch * fsw,struct pktq * pktq)1208 fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
1209 {
1210 struct mbuf *m_head = NULL, *m_tail = NULL;
1211 uint32_t cnt = 0, bytes = 0;
1212 boolean_t compat;
1213
1214 ASSERT(!KPKTQ_EMPTY(pktq));
1215
1216 /* All packets in the pktq must have the same type */
1217 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1218 if (compat) {
1219 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1220 &bytes);
1221 } else {
1222 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1223 &bytes);
1224 }
1225 if (__improbable(m_head == NULL)) {
1226 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1227 return;
1228 }
1229 fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1230 }
1231
/*
 * Enqueue as many packets as fit onto the ring; whatever remains in
 * "pktq" afterwards (i.e. what the full ring could not absorb) is
 * counted as a ring-full drop and freed.
 */
void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	/* KPKTQ_LEN() here is only the leftover (unenqueued) packets */
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq);
}
1240
/*
 * Look up the nexus adapter backing a flow entry's port.  Returns NULL
 * (with stats/logging) if the port is the dev/host port, or if the port
 * is no longer valid, has no adapter, is inactive, or has been
 * defuncted.
 */
static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	struct nexus_adapter *na = NULL;
	nexus_port_t port = fe->fe_nx_port;

	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
		SK_ERR("dev or host ports have no NA");
		return NULL;
	}

	if (__improbable(!nx_port_is_valid(nx, port))) {
		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	na = nx_port_get_na(nx, port);
	if (__improbable(na == NULL)) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(!NA_IS_ACTIVE(na))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(nx_port_is_defunct(nx, port))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	return na;
}
1283
1284 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1285 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1286 {
1287 struct nexus_vp_adapter *na = NULL;
1288 struct __kern_channel_ring *r = NULL;
1289
1290 na = VPNA(flow_get_na(fsw, fe));
1291 if (__improbable(na == NULL)) {
1292 return NULL;
1293 }
1294
1295 switch (txrx) {
1296 case NR_RX:
1297 r = &na->vpna_up.na_rx_rings[0];
1298 break;
1299 case NR_TX:
1300 r = &na->vpna_up.na_tx_rings[0];
1301 break;
1302 default:
1303 __builtin_unreachable();
1304 VERIFY(0);
1305 }
1306
1307 if (__improbable(KR_DROP(r))) {
1308 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1309 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %0xllx %s drop mode",
1310 r->ckr_name, SK_KVA(r));
1311 return NULL;
1312 }
1313
1314 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1315
1316 #if (DEVELOPMENT || DEBUG)
1317 if (r != NULL) {
1318 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1319 }
1320 #endif /* DEVELOPMENT || DEBUG */
1321
1322 return r;
1323 }
1324
/* Convenience wrapper: the flow's Rx channel ring (or NULL). */
struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}
1330
/* Convenience wrapper: the flow's Tx channel ring (or NULL). */
static inline struct __kern_channel_ring *
fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_TX);
}
1336
/*
 * Validate the flow's route/source address state.  Returns false if the
 * flow is (or is about to become) nonviable and its packets should be
 * dropped; true if processing may continue.
 */
static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	/*
	 * Re-validate the flow key only when the interface's address
	 * generation count has moved since we last checked.
	 */
	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 *
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			atomic_add_32(&fsw->fsw_pending_nonviable, 1);
			if (atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
				fsw_reap_sched(fsw);
			} else {
				/* lost the CAS: undo our increment */
				atomic_add_32(&fsw->fsw_pending_nonviable, -1);
			}
		}
		if (fr != NULL) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
		    SK_KVA(fe));
		return false;
	}
	return true;
}
1386
/*
 * Rx-side wrapper around dp_flow_route_process() that additionally
 * supports fault injection on DEVELOPMENT/DEBUG builds.
 */
bool
dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	bool okay;
	okay = dp_flow_route_process(fsw, fe);
#if (DEVELOPMENT || DEBUG)
	if (okay) {
		_FSW_INJECT_ERROR(5, okay, false, null_func);
	}
#endif /* DEVELOPMENT || DEBUG */

	return okay;
}
1400
/*
 * Default per-flow Rx processing: validate the route, run flow tracking
 * on every packet in fe_rx_pktq, copy packets that came from a foreign
 * pool into packets allocated from the destination ring's pool, stamp
 * flow metadata, and enqueue the result onto the flow's Rx ring
 * (dropping whatever does not fit).  Host-port flows are diverted to
 * fsw_host_rx() instead.
 */
void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;      /* done src packets */
	struct pktq dropped_pkts;       /* dropped src packets */
	struct pktq transferred_pkts;   /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		/* flow is nonviable; snoop (for pktap) then drop everything */
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		fsw_host_rx(fsw, &fe->fe_rx_pktq);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
		    r->ckr_name, SK_KVA(r));
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp));
	if (cnt > n_pkts) {
		/* multi-buflet packets expected; pre-allocate extra buflets */
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, PP_ALLOC_BFT_ATTACH_BUFFER);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
		err = 0;
		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
		/* keep the remaining-bytes estimate in sync (no underflow) */
		if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
			fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
		} else {
			fe->fe_rx_pktq_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_ERR("flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			/* foreign pool: copy into a dst-pool packet */
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
			/* dpkt already carries one buflet; attach the rest */
			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					/* refill the buflet batch */
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(fe->fe_rx_pktq_bytes,
					    PP_BUF_SIZE_DEF(dpp));
					n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, PP_ALLOC_BFT_ATTACH_BUFFER);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						KPKTQ_ENQUEUE(&dropped_pkts,
						    pkt);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, 0x%llu", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				bnew = (kern_buflet_t)buf_array[buf_array_iter];
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			if (__improbable(err != 0)) {
				/* buflet refill failed above; pkt already dropped */
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			/* source packet consumed; continue with the copy */
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		/* stamp flow metadata onto the outgoing packet */
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
			pkt->pkt_seg_cnt = 1;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(&fe->fe_rx_pktq);
	KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
	KPKTQ_FINI(&transferred_pkts);

	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);

done:
	/* Free unused buflets (buf_cnt is 0 on the early-exit paths) */
	while (buf_cnt > 0) {
		pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
}
1609
/*
 * Run the flow entry's Rx handler over its batched packet queue and
 * schedule the reaper if the flow asked to be withdrawn.
 */
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, fpd, etc.) */
	fe->fe_rx_process(fsw, fe);

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}

	KPKTQ_FINI(&fe->fe_rx_pktq);
}
1628
/*
 * If this packet is (or, on DEV/DEBUG builds, is forced to be) a wake
 * packet, record the match against the port-usage tracker.
 */
static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong the flow switch
	 * as wake packets for the host stack are handled by the host input
	 * function
	 */
#if (DEBUG || DEVELOPMENT)
	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
		/*
		 * This is a one shot command
		 */
		fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;

		pkt->pkt_pflags |= PKT_F_WAKE_PKT;
	}
#endif /* (DEBUG || DEVELOPMENT) */
	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
	}
}
1651
/*
 * Core Rx path (caller holds the flowswitch lock as reader): demux each
 * packet, classify it, batch it onto its flow entry, then run every
 * touched flow's Rx handler.  Packets that cannot be matched to a flow
 * are handed to the host stack; packets that cannot be processed at all
 * are dropped.
 */
static void
_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct __kern_packet *pkt, *tpkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *fe, *prev_fe;
	sa_family_t af;
	struct pktq host_pkts, dropped_pkts;
	int err;

	KPKTQ_INIT(&host_pkts);
	KPKTQ_INIT(&dropped_pkts);

	if (__improbable(FSW_QUIESCED(fsw))) {
		/* flowswitch is shutting down; drop everything */
		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		goto done;
	}
	if (__improbable(fsw->fsw_demux == NULL)) {
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		goto done;
	}

	prev_fe = NULL;
	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
		if (__probable(tpkt)) {
			/* warm the caches for the next packet's data */
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
			/* prefetch L3 and L4 flow structs */
			SK_PREFETCHW(tpkt->pkt_flow, 0);
			SK_PREFETCHW(tpkt->pkt_flow, 128);
		}

		KPKTQ_REMOVE(pktq, pkt);

		pkt = rx_prepare_packet(fsw, pkt);

		af = fsw->fsw_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			/* not ours; punt to the host stack */
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
		if (__improbable(err != 0)) {
			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			/* may be consumed or replaced by the reassembler */
			pkt = rx_process_ip_frag(fsw, pkt);
			if (pkt == NULL) {
				continue;
			}
		}

#if DEVELOPMENT || DEBUG
		trace_pkt_dump_payload(fsw->fsw_ifp, pkt, true);
#endif /* DEVELOPMENT || DEBUG */

		/* prev_fe lets consecutive same-flow packets skip the lookup */
		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
			continue;
		}

		fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;

		dp_rx_process_wake_packet(fsw, pkt);

		/* batch onto the flow; "fes" tracks flows with pending pkts */
		rx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
		rx_flow_process(fsw, fe);
		TAILQ_REMOVE(&fes, fe, fe_rx_link);
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		flow_entry_release(&fe);
	}

	if (!KPKTQ_EMPTY(&host_pkts)) {
		fsw_host_rx(fsw, &host_pkts);
	}

done:
	dp_drop_pktq(fsw, &dropped_pkts);
}
1745
1746 #if (DEVELOPMENT || DEBUG)
/*
 * Queue a packet onto the chosen RPS thread's pending queue.  The
 * caller is responsible for scheduling the thread afterwards (see
 * fsw_rps_thread_schedule()).
 */
static void
fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
    struct __kern_packet *pkt)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	lck_mtx_lock_spin(&frt->frt_lock);
	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
	lck_mtx_unlock(&frt->frt_lock);
}
1757
/*
 * Bump the RPS thread's request counter and wake it if it is idle, so
 * it processes the packets queued via fsw_rps_rx().
 */
static void
fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	ASSERT(frt->frt_thread != THREAD_NULL);
	lck_mtx_lock_spin(&frt->frt_lock);
	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));

	/* the counter lets a running thread notice work queued mid-cycle */
	frt->frt_requests++;
	if (!(frt->frt_flags & FRT_RUNNING)) {
		thread_wakeup((caddr_t)frt);
	}
	lck_mtx_unlock(&frt->frt_lock);
}
1773
1774 __attribute__((noreturn))
1775 static void
fsw_rps_thread_cont(void * v,wait_result_t w)1776 fsw_rps_thread_cont(void *v, wait_result_t w)
1777 {
1778 struct fsw_rps_thread *frt = v;
1779 struct nx_flowswitch *fsw = frt->frt_fsw;
1780
1781 lck_mtx_lock(&frt->frt_lock);
1782 if (__improbable(w == THREAD_INTERRUPTIBLE ||
1783 (frt->frt_flags & FRT_TERMINATING) != 0)) {
1784 goto terminate;
1785 }
1786 if (KPKTQ_EMPTY(&frt->frt_pktq)) {
1787 goto done;
1788 }
1789 frt->frt_flags |= FRT_RUNNING;
1790
1791 for (;;) {
1792 uint32_t requests = frt->frt_requests;
1793 struct pktq pkts;
1794
1795 KPKTQ_INIT(&pkts);
1796 KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
1797 lck_mtx_unlock(&frt->frt_lock);
1798
1799 sk_protect_t protect;
1800 protect = sk_sync_protect();
1801 FSW_RLOCK(fsw);
1802 _fsw_receive_locked(fsw, &pkts);
1803 FSW_RUNLOCK(fsw);
1804 sk_sync_unprotect(protect);
1805
1806 lck_mtx_lock(&frt->frt_lock);
1807 if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
1808 requests == frt->frt_requests) {
1809 frt->frt_requests = 0;
1810 break;
1811 }
1812 }
1813
1814 done:
1815 lck_mtx_unlock(&frt->frt_lock);
1816 if (!(frt->frt_flags & FRT_TERMINATING)) {
1817 frt->frt_flags &= ~FRT_RUNNING;
1818 assert_wait(frt, THREAD_UNINT);
1819 thread_block_parameter(fsw_rps_thread_cont, frt);
1820 __builtin_unreachable();
1821 } else {
1822 terminate:
1823 LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
1824 frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
1825 frt->frt_flags |= FRT_TERMINATED;
1826
1827 if (frt->frt_flags & FRT_TERMINATEBLOCK) {
1828 thread_wakeup((caddr_t)&frt);
1829 }
1830 lck_mtx_unlock(&frt->frt_lock);
1831
1832 SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
1833 frt->frt_idx);
1834
1835 /* for the extra refcnt from kernel_thread_start() */
1836 thread_deallocate(current_thread());
1837 /* this is the end */
1838 thread_terminate(current_thread());
1839 /* NOTREACHED */
1840 __builtin_unreachable();
1841 }
1842
1843 /* must never get here */
1844 VERIFY(0);
1845 /* NOTREACHED */
1846 __builtin_unreachable();
1847 }
1848
/*
 * Entry point for a newly spawned RPS worker thread: name the thread,
 * mark it as a sync-Rx net thread, and park it on its wait channel with
 * fsw_rps_thread_cont() as the continuation.
 */
__attribute__((noreturn))
static void
fsw_rps_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct fsw_rps_thread *frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;

	char thread_name[MAXTHREADNAMESIZE];
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
	    if_name(fsw->fsw_ifp), frt->frt_idx);
	thread_set_thread_name(frt->frt_thread, thread_name);
	SK_D("%s spawned", thread_name);

	net_thread_marks_push(NET_THREAD_SYNC_RX);
	/* sleep until fsw_rps_thread_schedule() wakes us */
	assert_wait(frt, THREAD_UNINT);
	(void) thread_block_parameter(fsw_rps_thread_cont, frt);

	__builtin_unreachable();
}
1870
/*
 * Request termination of RPS thread "i" and wait until it has marked
 * itself FRT_TERMINATED.  Wakes the thread if idle, then sleeps with a
 * deadline and re-checks in a loop.
 */
static void
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	/*
	 * First retry after 1 ms, subsequent retries after 1000 s.
	 * NOTE(review): the 1000 s interval looks unusually long for a
	 * retry; the loop then depends on being woken on
	 * &frt->frt_thread by the terminating thread — confirm the
	 * wakeup channel pairing.
	 */
	uint64_t f = (1 * NSEC_PER_MSEC);
	uint64_t s = (1000 * NSEC_PER_SEC);
	uint32_t c = 0;

	lck_mtx_lock(&frt->frt_lock);
	frt->frt_flags |= FRT_TERMINATING;

	while (!(frt->frt_flags & FRT_TERMINATED)) {
		uint64_t t = 0;
		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		frt->frt_flags |= FRT_TERMINATEBLOCK;
		if (!(frt->frt_flags & FRT_RUNNING)) {
			thread_wakeup_one((caddr_t)frt);
		}
		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
		lck_mtx_unlock(&frt->frt_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&frt->frt_lock);
		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
	}
	ASSERT(frt->frt_flags & FRT_TERMINATED);
	lck_mtx_unlock(&frt->frt_lock);
	frt->frt_thread = THREAD_NULL;
}
1902
/*
 * Initialize RPS thread slot "i" and start its kernel thread.
 * NOTE(review): frt_pktq is initialized after kernel_thread_start();
 * this appears safe because the new thread first blocks in
 * fsw_rps_thread_func() before touching the queue — confirm.
 */
static void
fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
{
	kern_return_t error;
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
	frt->frt_idx = i;
	frt->frt_fsw = fsw;
	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
	ASSERT(!error);
	KPKTQ_INIT(&frt->frt_pktq);
}
1915
1916 int
fsw_rps_set_nthreads(struct nx_flowswitch * fsw,uint32_t n)1917 fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
1918 {
1919 if (n > FSW_RPS_MAX_NTHREADS) {
1920 SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
1921 return EINVAL;
1922 }
1923
1924 FSW_WLOCK(fsw);
1925 if (n < fsw->fsw_rps_nthreads) {
1926 for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
1927 fsw_rps_thread_join(fsw, i);
1928 }
1929 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
1930 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
1931 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1932 ASSERT(n != 0 ^ fsw->fsw_rps_threads == NULL);
1933 } else if (n > fsw->fsw_rps_nthreads) {
1934 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
1935 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
1936 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1937 for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) {
1938 fsw_rps_thread_spawn(fsw, i);
1939 }
1940 }
1941 fsw->fsw_rps_nthreads = n;
1942 FSW_WUNLOCK(fsw);
1943 return 0;
1944 }
1945
1946 static uint32_t
get_rps_id(struct nx_flowswitch * fsw,struct __kern_packet * pkt)1947 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1948 {
1949 sa_family_t af = fsw->fsw_demux(fsw, pkt);
1950 if (__improbable(af == AF_UNSPEC)) {
1951 return 0;
1952 }
1953
1954 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
1955
1956 if (__improbable((pkt->pkt_qum_qflags &
1957 QUM_F_FLOW_CLASSIFIED) == 0)) {
1958 return 0;
1959 }
1960
1961 struct flow_key key;
1962 flow_pkt2key(pkt, true, &key);
1963 key.fk_mask = FKMASK_5TUPLE;
1964
1965 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
1966
1967 return id;
1968 }
1969
#endif /* DEVELOPMENT || DEBUG */
1971
/*
 * Entry point for a batch of received packets into the flowswitch.
 * Holds the flowswitch read lock for the duration of the batch.
 */
void
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	FSW_RLOCK(fsw);
#if (DEVELOPMENT || DEBUG)
	/*
	 * RPS path (DEVELOPMENT/DEBUG builds only): fan packets out to
	 * per-flow worker threads so receive processing can run in
	 * parallel, then wake exactly the threads that received work.
	 */
	if (fsw->fsw_rps_nthreads != 0) {
		struct __kern_packet *pkt, *tpkt;
		bitmap_t map = 0;

		/* one bitmap_t word must cover every possible thread id */
		_CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
			uint32_t id = get_rps_id(fsw, pkt);
			KPKTQ_REMOVE(pktq, pkt);
			fsw_rps_rx(fsw, id, pkt);
			/* remember which worker queues were touched */
			bitmap_set(&map, id);
		}
		for (int i = bitmap_first(&map, 64); i >= 0;
		    i = bitmap_next(&map, i)) {
			fsw_rps_thread_schedule(fsw, i);
		}
	} else
#endif /* DEVELOPMENT || DEBUG */
	{
		/* default path: process the whole batch inline */
		_fsw_receive_locked(fsw, pktq);
	}
	FSW_RUNLOCK(fsw);
}
1999
2000 int
fsw_dev_input_netem_dequeue(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)2001 fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts,
2002 uint32_t n_pkts)
2003 {
2004 #pragma unused(handle)
2005 struct nx_flowswitch *fsw = handle;
2006 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2007 struct pktq pktq;
2008 sk_protect_t protect;
2009 uint32_t i;
2010
2011 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2012
2013 for (i = 0; i < n_pkts; i++) {
2014 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2015 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2016 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2017 }
2018
2019 protect = sk_sync_protect();
2020 KPKTQ_INIT(&pktq);
2021 pkts_to_pktq(kpkts, n_pkts, &pktq);
2022
2023 fsw_receive(fsw, &pktq);
2024 KPKTQ_FINI(&pktq);
2025 sk_sync_unprotect(protect);
2026
2027 return 0;
2028 }
2029
2030 static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch * fsw,struct pktq * q)2031 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2032 {
2033 classq_pkt_t p;
2034 struct netem *ne;
2035 struct __kern_packet *pkt, *tpkt;
2036
2037 ASSERT(fsw->fsw_ifp != NULL);
2038 ne = fsw->fsw_ifp->if_input_netem;
2039 ASSERT(ne != NULL);
2040 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2041 bool pdrop;
2042 KPKTQ_REMOVE(q, pkt);
2043 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2044 netem_enqueue(ne, &p, &pdrop);
2045 }
2046 }
2047
/*
 * Device-adapter receive: walk the singly linked packet chain from the
 * driver, drop non-finalized packets, and flush the rest to the
 * flowswitch in batches of up to fsw_rx_batch packets.  Aggregate
 * counters are optionally reported via "out_stats".
 */
void
fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
    struct nexus_pkt_stats *out_stats)
{
	struct __kern_packet *pkt = pkt_head, *next;
	struct nx_flowswitch *fsw;
	uint32_t n_bytes = 0, n_pkts = 0;
	uint64_t total_pkts = 0, total_bytes = 0;
	struct pktq q;

	KPKTQ_INIT(&q);
	if (__improbable(devna->na_ifp == NULL ||
	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
		/*
		 * NOTE(review): "q" was initialized just above, so
		 * KPKTQ_LEN(&q) is always 0 here; the dropped-packet
		 * count in this message is misleading (the whole
		 * pkt_head chain is freed).
		 */
		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
		pp_free_packet_chain(pkt_head, NULL);
		return;
	}
	while (pkt != NULL) {
		/* hand off driver-stage ktrace to the flowswitch stage */
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		next = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;

		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
			KPKTQ_ENQUEUE(&q, pkt);
			n_bytes += pkt->pkt_length;
		} else {
			/* driver marked it dropped before finalizing */
			DTRACE_SKYWALK1(non__finalized__drop,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
			pp_free_packet_single(pkt);
			pkt = NULL;
		}
		n_pkts = KPKTQ_LEN(&q);
		/* flush a full batch, or whatever remains at end of chain */
		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
				fsw_dev_input_netem_enqueue(fsw, &q);
			} else {
				fsw_receive(fsw, &q);
			}
			total_pkts += n_pkts;
			total_bytes += n_bytes;
			n_pkts = 0;
			n_bytes = 0;
			KPKTQ_FINI(&q);
		}
		pkt = next;
	}
	ASSERT(KPKTQ_LEN(&q) == 0);
	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
	if (out_stats != NULL) {
		out_stats->nps_pkts = total_pkts;
		out_stats->nps_bytes = total_bytes;
	}
	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
}
2106
2107 static int
dp_copy_to_dev_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2108 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2109 struct __kern_packet *dpkt)
2110 {
2111 struct mbuf *m = NULL;
2112 uint16_t bdlen, bdlim, bdoff;
2113 uint8_t *bdaddr;
2114 unsigned int one = 1;
2115 int err = 0;
2116
2117 err = mbuf_allocpacket(MBUF_DONTWAIT,
2118 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2119 #if (DEVELOPMENT || DEBUG)
2120 if (m != NULL) {
2121 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2122 }
2123 #endif /* DEVELOPMENT || DEBUG */
2124 if (__improbable(m == NULL)) {
2125 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2126 err = ENOBUFS;
2127 goto done;
2128 }
2129
2130 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2131 if (fsw->fsw_frame_headroom > bdlim) {
2132 SK_ERR("not enough space in buffer for headroom");
2133 err = EINVAL;
2134 goto done;
2135 }
2136
2137 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2138 dpkt->pkt_mbuf = m;
2139 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2140
2141 /* packet copy into mbuf */
2142 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2143 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2144 fsw->fsw_frame_headroom, spkt->pkt_length,
2145 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2146 spkt->pkt_csum_tx_start_off);
2147 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2148
2149 /* header copy into dpkt buffer for classification */
2150 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2151 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2152 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2153 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2154 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2155 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2156 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2157
2158 /*
2159 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
2160 * buflet baddr m_data always points to the beginning of packet and
2161 * should represents the same as baddr + headroom
2162 */
2163 ASSERT((uintptr_t)m->m_data ==
2164 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2165
2166 done:
2167 return err;
2168 }
2169
/*
 * Copy a source packet into "dpkt" for transmission over a native
 * (packet-based) interface.  The destination headroom is the sum of
 * the flowswitch frame headroom and the interface TX headroom; the
 * checksum start/stuff offsets are rebased from the source headroom.
 * Returns 0 on success or ERANGE if the combined headroom does not
 * fit in the 8-bit pkt_headroom field.
 */
static int
dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;

	if (headroom > UINT8_MAX) {
		SK_ERR("headroom too large %d", headroom);
		return ERANGE;
	}
	dpkt->pkt_headroom = (uint8_t)headroom;
	/* headroom must stay 8-byte aligned for the copy routine */
	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
	dpkt->pkt_l2_len = 0;
	dpkt->pkt_link_flags = spkt->pkt_link_flags;

	kern_packet_t sph = SK_PTR_ENCODE(spkt,
	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	/* checksum offsets are relative to the start of the payload */
	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));

	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

	return 0;
}
2201
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
    struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
{
	struct proc *p = current_proc();
	struct ifnet *ifp = fsw->fsw_ifp;
	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);

	if (error == ERANGE) {
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
	} else if (error == ENOBUFS) {
		SK_DF(logflags, "%s(%d) packet allocation failure",
		    sk_proc_name_address(p), sk_proc_pid(p));
	} else if (error == 0) {
		ASSERT(dpkt != NULL);
		char *daddr;
		MD_BUFLET_ADDR_ABS(dpkt, daddr);
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
		    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
		    (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom);
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
	} else {
		/*
		 * Fix: the format has three conversions (%s, %d, %d) but
		 * only "error" was passed, leaving the first two reading
		 * garbage varargs.  Supply proc name and pid like the
		 * other branches.
		 */
		SK_DF(logflags, "%s(%d) error %d",
		    sk_proc_name_address(p), sk_proc_pid(p), error);
	}
}
#else
#define dp_copy_to_dev_log(...)
#endif /* SK_LOG */
2239
/*
 * Copy a user/source packet "spkt" into the device-pool packet "dpkt"
 * ahead of enqueue to the interface: copies packet and AQM metadata,
 * then dispatches to the mbuf (compat) or packet (native) copy routine
 * based on the interface's enqueue packet type.  Returns 0 or an errno
 * from the copy routine (ERANGE for oversized native packets).
 */
static int
dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
	struct ifnet *ifp = fsw->fsw_ifp;
	uint32_t dev_pkt_len;
	int err = 0;

	/* source must not carry an attached mbuf or packet */
	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));

	/* warm up the destination buffer before the metadata copies */
	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* Copy AQM metadata */
	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
	_CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
	dpkt->pkt_policy_id = spkt->pkt_policy_id;

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF:
		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
		break;

	case QP_PACKET:
		/* reject packets that cannot fit in the device pool */
		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
		    spkt->pkt_length;
		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
			err = ERANGE;
			goto done;
		}
		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
		break;

	default:
		VERIFY(0);
		__builtin_unreachable();
	}
done:
	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
	return err;
}
2293
/*
 * Detach the mbuf carried by "pkt" (PKT_F_MBUF_DATA), transfer the
 * flow/AQM metadata from the packet into the mbuf pkthdr, free the
 * packet, and return the mbuf.  Ownership of the returned mbuf passes
 * to the caller; "pkt" is consumed.
 */
static struct mbuf *
convert_pkt_to_mbuf(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	ASSERT(pkt->pkt_mbuf != NULL);
	struct mbuf *m = pkt->pkt_mbuf;

	/* pass additional metadata generated from flow parse/lookup */
	_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
	    sizeof(pkt->pkt_flow_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
	    sizeof(pkt->pkt_flowsrc_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
	    sizeof(pkt->pkt_flowsrc_fidx));
	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;

	/* The packet should have a timestamp by the time we get here. */
	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;

	/* carry over only the flag bits shared between both models */
	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
	}
	if ((pkt->pkt_pflags & PKT_F_L4S) != 0) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
	}
	KPKT_CLEAR_MBUF_DATA(pkt);

	/* mbuf has been consumed, release packet as well */
	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
	pp_free_packet_single(pkt);
	return m;
}
2335
2336 static void
convert_pkt_to_mbuf_list(struct __kern_packet * pkt_list,struct mbuf ** head,struct mbuf ** tail,uint32_t * cnt,uint32_t * bytes)2337 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2338 struct mbuf **head, struct mbuf **tail,
2339 uint32_t *cnt, uint32_t *bytes)
2340 {
2341 struct __kern_packet *pkt = pkt_list, *next;
2342 struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL;
2343 uint32_t c = 0, b = 0;
2344
2345 while (pkt != NULL) {
2346 next = pkt->pkt_nextpkt;
2347 pkt->pkt_nextpkt = NULL;
2348 m = convert_pkt_to_mbuf(pkt);
2349 ASSERT(m != NULL);
2350
2351 *m_tailp = m;
2352 m_tailp = &m->m_nextpkt;
2353 c++;
2354 b += m_pktlen(m);
2355 pkt = next;
2356 }
2357 if (head != NULL) {
2358 *head = m_head;
2359 }
2360 if (tail != NULL) {
2361 *tail = m;
2362 }
2363 if (cnt != NULL) {
2364 *cnt = c;
2365 }
2366 if (bytes != NULL) {
2367 *bytes = b;
2368 }
2369 }
2370
/*
 * Enqueue a single packet to the interface's AQM/classq stage,
 * converting to an mbuf first on compat interfaces.  The packet (or
 * converted mbuf) is always consumed.  Returns the enqueue errno.
 */
SK_NO_INLINE_ATTRIBUTE
static int
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt->pkt_flow_token != 0);
	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	/* hand off flowswitch-stage ktrace to the AQM stage */
	if (__improbable(pkt->pkt_trace_id != 0)) {
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: { /* compat interface */
		struct mbuf *m;

		m = convert_pkt_to_mbuf(pkt);
		ASSERT(m != NULL);
		pkt = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
		m = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	case QP_PACKET: { /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
		pkt = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2437
/*
 * Enqueue a whole chain of packets (all from the same flow, hence the
 * same service class) to the interface's AQM/classq stage in a single
 * call, converting the chain to mbufs first on compat interfaces.
 * The chain is always consumed.  Returns the enqueue errno.
 */
static int
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	uint32_t svc;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt_head->pkt_flow_token != 0);

	/*
	 * All packets in the flow should have the same svc.
	 */
	svc = pkt_head->pkt_svc_class;
	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: { /* compat interface */
		struct mbuf *m_head = NULL, *m_tail = NULL;
		uint32_t c = 0, b = 0;

		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
		ASSERT(m_head != NULL && m_tail != NULL);
		/* conversion must preserve the caller-supplied totals */
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		pkt_head = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		m_head = NULL;
		m_tail = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			/* a chain drop accounts for the whole chain */
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	case QP_PACKET: { /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		pkt_head = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2512
2513 /*
2514 * This code path needs to be kept for interfaces without logical link support.
2515 */
2516 static void
classq_enqueue_flow(struct nx_flowswitch * fsw,struct flow_entry * fe,bool chain,uint32_t cnt,uint32_t bytes)2517 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2518 bool chain, uint32_t cnt, uint32_t bytes)
2519 {
2520 bool flowadv_is_set = false;
2521 struct __kern_packet *pkt, *tail, *tpkt;
2522 flowadv_idx_t flow_adv_idx;
2523 bool flowadv_cap;
2524 flowadv_token_t flow_adv_token;
2525 int err;
2526
2527 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2528 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2529
2530 if (chain) {
2531 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2532 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2533 KPKTQ_INIT(&fe->fe_tx_pktq);
2534 if (pkt == NULL) {
2535 return;
2536 }
2537 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2538 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2539 flow_adv_token = pkt->pkt_flow_token;
2540
2541 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2542
2543 /* set flow advisory if needed */
2544 if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
2545 flowadv_cap)) {
2546 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2547 flow_adv_idx, flow_adv_token);
2548 }
2549 } else {
2550 uint32_t c = 0, b = 0;
2551
2552 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2553 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2554
2555 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2556 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2557 flow_adv_token = pkt->pkt_flow_token;
2558
2559 c++;
2560 b += pkt->pkt_length;
2561 err = classq_enqueue_flow_single(fsw, pkt);
2562
2563 /* set flow advisory if needed */
2564 if (__improbable(!flowadv_is_set &&
2565 ((err == EQFULL || err == EQSUSPENDED) &&
2566 flowadv_cap))) {
2567 flowadv_is_set = na_flowadv_set(
2568 flow_get_na(fsw, fe), flow_adv_idx,
2569 flow_adv_token);
2570 }
2571 }
2572 ASSERT(c == cnt);
2573 ASSERT(b == bytes);
2574 }
2575
2576 /* notify flow advisory event */
2577 if (__improbable(flowadv_is_set)) {
2578 struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
2579 if (__probable(r)) {
2580 na_flowadv_event(r);
2581 SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
2582 "%s(%d) notified of flow update",
2583 sk_proc_name_address(current_proc()),
2584 sk_proc_pid(current_proc()));
2585 }
2586 }
2587 }
2588
2589 /*
2590 * Logical link code path
2591 */
2592 static void
classq_qset_enqueue_flow(struct nx_flowswitch * fsw,struct flow_entry * fe,bool chain,uint32_t cnt,uint32_t bytes)2593 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2594 bool chain, uint32_t cnt, uint32_t bytes)
2595 {
2596 struct __kern_packet *pkt, *tail;
2597 flowadv_idx_t flow_adv_idx;
2598 bool flowadv_is_set = false;
2599 bool flowadv_cap;
2600 flowadv_token_t flow_adv_token;
2601 uint32_t flowctl = 0, dropped = 0;
2602 int err;
2603
2604 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2605 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2606
2607 /*
2608 * Not supporting chains for now
2609 */
2610 VERIFY(!chain);
2611 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2612 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2613 KPKTQ_INIT(&fe->fe_tx_pktq);
2614 if (pkt == NULL) {
2615 return;
2616 }
2617 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2618 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2619 flow_adv_token = pkt->pkt_flow_token;
2620
2621 err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
2622 &flowctl, &dropped);
2623
2624 if (__improbable(err != 0)) {
2625 /* set flow advisory if needed */
2626 if (flowctl > 0 && flowadv_cap) {
2627 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2628 flow_adv_idx, flow_adv_token);
2629
2630 /* notify flow advisory event */
2631 if (flowadv_is_set) {
2632 struct __kern_channel_ring *r =
2633 fsw_flow_get_tx_ring(fsw, fe);
2634 if (__probable(r)) {
2635 na_flowadv_event(r);
2636 SK_DF(SK_VERB_FLOW_ADVISORY |
2637 SK_VERB_TX,
2638 "%s(%d) notified of flow update",
2639 sk_proc_name_address(current_proc()),
2640 sk_proc_pid(current_proc()));
2641 }
2642 }
2643 }
2644 if (dropped > 0) {
2645 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
2646 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2647 dropped);
2648 }
2649 }
2650 }
2651
2652 static void
tx_finalize_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)2653 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2654 {
2655 #pragma unused(fsw)
2656 /* finalize here; no more changes to buflets after classq */
2657 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2658 kern_packet_t ph = SK_PTR_ENCODE(pkt,
2659 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2660 int err = __packet_finalize(ph);
2661 VERIFY(err == 0);
2662 }
2663 }
2664
/*
 * Resolve and frame the packets queued on fe->fe_tx_pktq for TX.
 * Returns false if the flow's route is not viable (caller drops the
 * queue); returns true otherwise, with unresolvable packets already
 * removed and freed from the queue.
 */
static bool
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	int err;

	ASSERT(fr != NULL);

	if (__improbable(!dp_flow_route_process(fsw, fe))) {
		return false;
	}
	/* re-evaluate the logical-link queue set if dynamically selected */
	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
		flow_qset_select_dynamic(fsw, fe, TRUE);
	}

	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
	    _fsw_error35_handler, 1, fr, NULL, NULL);
	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
	    _fsw_error36_handler, 1, fr, NULL);

	/*
	 * See if we need to resolve the flow route; note the test against
	 * fr_flags here is done without any lock for performance.  Thus
	 * it's possible that we race against the thread performing route
	 * event updates for a packet (which is OK).  In any case we should
	 * not have any assertion on fr_flags value(s) due to the lack of
	 * serialization.
	 */
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		goto frame;
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		err = fsw->fsw_resolve(fsw, fr, pkt);
		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
		/*
		 * If resolver returns EJUSTRETURN then we drop the pkt as the
		 * resolver should have converted the pkt into mbuf (or
		 * detached the attached mbuf from pkt) and added it to the
		 * llinfo queue.  If we do have a cached llinfo, then proceed
		 * to using it even though it may be stale (very unlikely)
		 * while the resolution is in progress.
		 * Otherwise, any other error results in dropping pkt.
		 */
		if (err == EJUSTRETURN) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
			continue;
		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
			/* use existing llinfo */
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
		} else if (err != 0) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
			continue;
		}
	}

frame:
	/* prepend the link-layer frame on every surviving packet */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		if (fsw->fsw_frame != NULL) {
			fsw->fsw_frame(fsw, fr, pkt);
		}
	}

	return true;
}
2736
2737 static void
dp_listener_flow_tx_process(struct nx_flowswitch * fsw,struct flow_entry * fe)2738 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2739 {
2740 #pragma unused(fsw)
2741 struct __kern_packet *pkt, *tpkt;
2742 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2743 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2744 /* listener is only allowed TCP RST */
2745 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
2746 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
2747 flow_track_abort_tcp(fe, NULL, pkt);
2748 } else {
2749 char *addr;
2750 MD_BUFLET_ADDR_ABS(pkt, addr);
2751 SK_ERR("listener flow sends non-RST packet %s",
2752 sk_dump(sk_proc_name_address(current_proc()),
2753 addr, pkt->pkt_length, 128, NULL, 0));
2754 }
2755 pp_free_packet_single(pkt);
2756 }
2757 }
2758
2759 static void
fsw_update_timestamps(struct __kern_packet * pkt,volatile uint64_t * fg_ts,volatile uint64_t * rt_ts,ifnet_t ifp)2760 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
2761 volatile uint64_t *rt_ts, ifnet_t ifp)
2762 {
2763 struct timespec now;
2764 uint64_t now_nsec = 0;
2765
2766 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
2767 nanouptime(&now);
2768 net_timernsec(&now, &now_nsec);
2769 pkt->pkt_timestamp = now_nsec;
2770 }
2771 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
2772
2773 /*
2774 * If the packet service class is not background,
2775 * update the timestamps on the interface, as well as
2776 * the ones in nexus-wide advisory to indicate recent
2777 * activity on a foreground flow.
2778 */
2779 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
2780 ifp->if_fg_sendts = (uint32_t)_net_uptime;
2781 if (fg_ts != NULL) {
2782 *fg_ts = _net_uptime;
2783 }
2784 }
2785 if (pkt->pkt_pflags & PKT_F_REALTIME) {
2786 ifp->if_rt_sendts = (uint32_t)_net_uptime;
2787 if (rt_ts != NULL) {
2788 *rt_ts = _net_uptime;
2789 }
2790 }
2791 }
2792
2793 /*
2794 * TODO:
2795 * We can check the flow entry as well to only allow chain enqueue
2796 * on flows matching a certain criteria.
2797 */
2798 static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch * fsw,struct flow_entry * fe)2799 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, struct flow_entry *fe)
2800 {
2801 #pragma unused(fe)
2802 return fsw_chain_enqueue != 0 &&
2803 fsw->fsw_ifp->if_output_netem == NULL &&
2804 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
2805 fe->fe_qset == NULL;
2806 }
2807
/*
 * Main per-flow TX processing: runs flow tracking on each queued
 * packet, stamps flow/AQM metadata, finalizes the packets, then hands
 * the queue to either the logical-link qset path or the classq path.
 * Packets that fail route processing or flow tracking are dropped.
 */
void
dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct pktq dropped_pkts;
	bool chain;
	uint32_t cnt = 0, bytes = 0;
	volatile struct sk_nexusadv *nxadv = NULL;
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;

	KPKTQ_INIT(&dropped_pkts);
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	/* listener flows take a dedicated (RST-only) path */
	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
		dp_listener_flow_tx_process(fsw, fe);
		return;
	}
	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
		SK_RDERR(5, "Tx route bad");
		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&fe->fe_tx_pktq));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
		goto done;
	}
	chain = fsw_chain_enqueue_enabled(fsw, fe);
	if (chain) {
		/* advisory timestamps are only maintained on the chain path */
		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
		if (nxadv != NULL) {
			fg_ts = &nxadv->nxadv_fg_sendts;
			rt_ts = &nxadv->nxadv_rt_sendts;
		}
	}
	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		int err = flow_pkt_track(fe, pkt, false);
		if (__improbable(err != 0)) {
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;

		/* set AQM related values for outgoing packet */
		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
		} else {
			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
		}
		_UUID_CLEAR(pkt->pkt_flow_id);
		pkt->pkt_flow_token = fe->fe_flowid;
		pkt->pkt_pflags |= PKT_F_FLOW_ID;
		pkt->pkt_qset_idx = qset_idx;
		/*
		 * The same code is exercised per packet for the non-chain case
		 * (see ifnet_enqueue_ifclassq()).  It's replicated here to avoid
		 * re-walking the chain later.
		 */
		if (chain) {
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
		}
		/* mark packet tos/svc_class */
		fsw_qos_mark(fsw, fe, pkt);

		tx_finalize_packet(fsw, pkt);
		bytes += pkt->pkt_length;
		cnt++;
	}

	/* snoop after it's finalized */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, false);
	}
	if (fe->fe_qset != NULL) {
		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
	} else {
		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
	}
done:
	dp_drop_pktq(fsw, &dropped_pkts);
}
2894
2895 static struct flow_entry *
tx_process_continuous_ip_frag(struct nx_flowswitch * fsw,struct flow_entry * prev_fe,struct __kern_packet * pkt)2896 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
2897 struct flow_entry *prev_fe, struct __kern_packet *pkt)
2898 {
2899 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
2900
2901 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
2902 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
2903 SK_ERR("%s(%d) invalid zero fragment id",
2904 sk_proc_name_address(current_proc()),
2905 sk_proc_pid(current_proc()));
2906 return NULL;
2907 }
2908
2909 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
2910 "%s(%d) continuation frag, id %u",
2911 sk_proc_name_address(current_proc()),
2912 sk_proc_pid(current_proc()),
2913 pkt->pkt_flow_ip_frag_id);
2914 if (__improbable(prev_fe == NULL ||
2915 !prev_fe->fe_tx_is_cont_frag)) {
2916 SK_ERR("%s(%d) unexpected continuation frag",
2917 sk_proc_name_address(current_proc()),
2918 sk_proc_pid(current_proc()),
2919 pkt->pkt_flow_ip_frag_id);
2920 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2921 return NULL;
2922 }
2923 if (__improbable(pkt->pkt_flow_ip_frag_id !=
2924 prev_fe->fe_tx_frag_id)) {
2925 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
2926 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
2927 sk_proc_name_address(current_proc()),
2928 sk_proc_pid(current_proc()),
2929 pkt->pkt_flow_ip_frag_id,
2930 prev_fe->fe_tx_frag_id);
2931 return NULL;
2932 }
2933
2934 return prev_fe;
2935 }
2936
/*
 * Look up the flow entry for a classified Tx packet.
 *
 * Returns a retained flow entry on success, or NULL when no flow
 * matches or the matched flow is already torn down.  On a flow-id
 * mismatch the retained entry is released first; flow_entry_release()
 * presumably clears the pointer so NULL is returned in that case --
 * confirm against its implementation.  prev_fe is a lookup hint from
 * the previous packet of the same batch.
 */
static struct flow_entry *
tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;

	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
	if (__improbable(fe == NULL)) {
		goto done;
	}

	/* flow exists but is being torn down; don't use it */
	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		SK_RDERR(5, "Tx flow torn down");
		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
		flow_entry_release(&fe);
		goto done;
	}

	/* DEVELOPMENT/DEBUG hook: force a flow-id mismatch below */
	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
	    null_func);

	/* the packet's flow id must agree with the entry it mapped to */
	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
		uuid_string_t flow_id_str, pkt_id_str;
		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
		SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
		flow_entry_release(&fe);
		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
	}

done:
	return fe;
}
2970
2971 static inline void
tx_flow_process(struct nx_flowswitch * fsw,struct flow_entry * fe)2972 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2973 {
2974 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
2975 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
2976
2977 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
2978 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
2979
2980 /* flow related processing (default, agg, etc.) */
2981 fe->fe_tx_process(fsw, fe);
2982
2983 KPKTQ_FINI(&fe->fe_tx_pktq);
2984 }
2985
#if SK_LOG
/*
 * Debug-log a Tx packet: dumps up to 128 bytes of the packet's first
 * buflet along with the current process name/pid and a caller tag.
 * Compiles away to nothing when SK_LOG is disabled.
 */
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	/* pkt_buf presumably points at the first buflet's data -- per macro name */
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
	    pkt->pkt_length, 128, NULL, 0));
}
#else /* !SK_LOG */
#define dp_tx_log_pkt(...)
#endif /* !SK_LOG */
2999
/*
 * Tx datapath: process one batch of packets originating from a user
 * channel.  Each source packet is copied into a packet allocated from
 * the device pool, demuxed to an address family, classified, and
 * matched to a flow entry; packets of the same flow are batched onto
 * that entry's Tx queue.  Each batched flow is then run through its
 * Tx processor and the interface is kicked via netif_transmit().
 * Packets failing any stage accumulate on dropped_pkts and are
 * dropped in one shot at the end.
 */
static void
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *fe, *prev_fe;
	struct pktq dropped_pkts, dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;

	int err;
	KPKTQ_INIT(&dpktq);
	KPKTQ_INIT(&dropped_pkts);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* flowswitch is detaching: nothing can be delivered */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		goto done;
	}
	/*
	 * fsw_ifp should still be valid at this point. If fsw is detached
	 * after fsw_lock is released, this ifp will remain valid and
	 * netif_transmit() will behave properly even if the ifp is in
	 * detached state.
	 */
	ifp = fsw->fsw_ifp;

	/* batch allocate enough packets */
	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
	    NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	/* error injection: simulate a full allocation failure */
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	if (__improbable(err == ENOMEM)) {
		/* nothing was allocated; drop the entire input queue */
		ASSERT(KPKTQ_EMPTY(&dpktq));
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
		goto done;
	} else if (__improbable(err == EAGAIN)) {
		/* partial allocation; account the shortfall as drops */
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
		FSW_STATS_ADD(FSW_STATS_DROP,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
	}

	/* process only as many source packets as we got device packets */
	n_pkts = KPKTQ_LEN(&dpktq);
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		if (n_pkts == 0) {
			break;
		}
		--n_pkts;

		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		/* copy source packet into the freshly allocated device pkt */
		err = dp_copy_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* determine address family (v4/v6) for classification */
		af = fsw_ip_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/* extract the flow tuple from the packet headers */
		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			continue;
		}

		/*
		 * Non-first IP fragments have no transport header; they
		 * can only ride on the flow of the preceding fragment.
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag &&
		    !pkt->pkt_flow_ip_is_first_frag)) {
			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
			if (__probable(fe != NULL)) {
				flow_entry_retain(fe);
				goto flow_batch;
			} else {
				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
				continue;
			}
		}

		/* normal path: full flow-table lookup (returns retained fe) */
		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
			prev_fe = NULL;
			continue;
		}
flow_batch:
		/* queue packet on the flow; fe added to fes on first pkt */
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* second pass: run each batched flow's Tx processor */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		/* reset per-batch fragment continuation state */
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
		n_flows++;
	}

done:
	FSW_RUNLOCK(fsw);
	/* n_flows > 0 implies ifp was assigned before any early goto */
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
	}
	dp_drop_pktq(fsw, &dropped_pkts);
	KPKTQ_FINI(&dropped_pkts);
	KPKTQ_FINI(&dpktq);
}
3139
3140 static inline void
fsw_dev_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)3141 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3142 struct proc *p)
3143 {
3144 #pragma unused(p)
3145 uint32_t total_pkts = 0, total_bytes = 0;
3146
3147 for (;;) {
3148 struct pktq pktq;
3149 KPKTQ_INIT(&pktq);
3150 uint32_t n_bytes;
3151 fsw_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
3152 if (n_bytes == 0) {
3153 break;
3154 }
3155 total_pkts += KPKTQ_LEN(&pktq);
3156 total_bytes += n_bytes;
3157
3158 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
3159 fsw_receive(fsw, &pktq);
3160 } else {
3161 fsw_dev_input_netem_enqueue(fsw, &pktq);
3162 }
3163 KPKTQ_FINI(&pktq);
3164 }
3165
3166 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
3167 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
3168 uint32_t, total_bytes);
3169
3170 /* compute mitigation rate for delivered traffic */
3171 if (__probable(r->ckr_netif_mit_stats != NULL)) {
3172 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
3173 }
3174 }
3175
/*
 * Drain a user (Tx) ring: repeatedly dequeue batches and push them
 * through the Tx datapath until the ring is empty, then update the
 * ring's stats.
 */
static inline void
fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct proc *p)
{
#pragma unused(p)
	/*
	 * Monotonic id used to correlate Tx ktrace events per batch.
	 * NOTE(review): incremented without synchronization -- presumably
	 * serialized by the caller's sync protection; confirm.
	 */
	static packet_trace_id_t trace_id = 0;
	uint32_t total_pkts = 0, total_bytes = 0;

	for (;;) {
		struct pktq pktq;
		KPKTQ_INIT(&pktq);
		uint32_t n_bytes;
		fsw_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes);
		if (n_bytes == 0) {
			/* ring drained */
			break;
		}
		total_pkts += KPKTQ_LEN(&pktq);
		total_bytes += n_bytes;

		/* tag only the first packet of the batch for tracing */
		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, KPKTQ_FIRST(&pktq)->pkt_trace_id);

		dp_tx_pktq(fsw, &pktq);
		/* Tx path copies into device packets; originals freed here */
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}

	kr_update_stats(r, total_pkts, total_bytes);

	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
	    uint32_t, total_bytes);
}
3209
3210 void
fsw_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)3211 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3212 struct proc *p)
3213 {
3214 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
3215
3216 ASSERT(sk_is_sync_protected());
3217 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
3218 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
3219
3220 if (vpna->vpna_nx_port == FSW_VP_DEV) {
3221 fsw_dev_ring_flush(fsw, r, p);
3222 } else {
3223 fsw_user_ring_flush(fsw, r, p);
3224 }
3225 }
3226
3227 int
fsw_dp_ctor(struct nx_flowswitch * fsw)3228 fsw_dp_ctor(struct nx_flowswitch *fsw)
3229 {
3230 uint32_t fe_cnt = fsw_fe_table_size;
3231 uint32_t fob_cnt = fsw_flow_owner_buckets;
3232 uint32_t frb_cnt = fsw_flow_route_buckets;
3233 uint32_t frib_cnt = fsw_flow_route_id_buckets;
3234 struct kern_nexus *nx = fsw->fsw_nx;
3235 char name[64];
3236 int error = 0;
3237
3238 /* just in case */
3239 if (fe_cnt == 0) {
3240 fe_cnt = NX_FSW_FE_TABLESZ;
3241 ASSERT(fe_cnt != 0);
3242 }
3243 if (fob_cnt == 0) {
3244 fob_cnt = NX_FSW_FOB_HASHSZ;
3245 ASSERT(fob_cnt != 0);
3246 }
3247 if (frb_cnt == 0) {
3248 frb_cnt = NX_FSW_FRB_HASHSZ;
3249 ASSERT(frb_cnt != 0);
3250 }
3251 if (frib_cnt == 0) {
3252 frib_cnt = NX_FSW_FRIB_HASHSZ;
3253 ASSERT(frib_cnt != 0);
3254 }
3255
3256 /* make sure fe_cnt is a power of two, else round up */
3257 if ((fe_cnt & (fe_cnt - 1)) != 0) {
3258 fe_cnt--;
3259 fe_cnt |= (fe_cnt >> 1);
3260 fe_cnt |= (fe_cnt >> 2);
3261 fe_cnt |= (fe_cnt >> 4);
3262 fe_cnt |= (fe_cnt >> 8);
3263 fe_cnt |= (fe_cnt >> 16);
3264 fe_cnt++;
3265 }
3266
3267 /* make sure frb_cnt is a power of two, else round up */
3268 if ((frb_cnt & (frb_cnt - 1)) != 0) {
3269 frb_cnt--;
3270 frb_cnt |= (frb_cnt >> 1);
3271 frb_cnt |= (frb_cnt >> 2);
3272 frb_cnt |= (frb_cnt >> 4);
3273 frb_cnt |= (frb_cnt >> 8);
3274 frb_cnt |= (frb_cnt >> 16);
3275 frb_cnt++;
3276 }
3277
3278 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
3279 &nexus_lock_attr);
3280 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
3281 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
3282 TAILQ_INIT(&fsw->fsw_linger_head);
3283
3284 (void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
3285 error = nx_advisory_alloc(nx, name,
3286 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
3287 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
3288 if (error != 0) {
3289 fsw_dp_dtor(fsw);
3290 return error;
3291 }
3292
3293 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
3294 if (fsw->fsw_flow_mgr == NULL) {
3295 fsw_dp_dtor(fsw);
3296 return error;
3297 }
3298
3299 /* generic name; will be customized upon ifattach */
3300 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
3301 FSW_REAP_THREADNAME, name, "");
3302
3303 if (kernel_thread_start(fsw_reap_thread_func, fsw,
3304 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
3305 panic_plain("%s: can't create thread", __func__);
3306 /* NOTREACHED */
3307 __builtin_unreachable();
3308 }
3309 /* this must not fail */
3310 VERIFY(fsw->fsw_reap_thread != NULL);
3311
3312 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
3313
3314
3315 return error;
3316 }
3317
3318 void
fsw_dp_dtor(struct nx_flowswitch * fsw)3319 fsw_dp_dtor(struct nx_flowswitch *fsw)
3320 {
3321 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
3322 uint64_t s = (1000 * NSEC_PER_SEC); /* 1 sec */
3323 uint32_t i = 0;
3324
3325 #if (DEVELOPMENT || DEBUG)
3326 if (fsw->fsw_rps_threads != NULL) {
3327 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
3328 fsw_rps_thread_join(fsw, i);
3329 }
3330 kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads);
3331 }
3332 #endif /* !DEVELOPMENT && !DEBUG */
3333
3334 nx_advisory_free(fsw->fsw_nx);
3335
3336 if (fsw->fsw_reap_thread != THREAD_NULL) {
3337 /* signal thread to begin self-termination */
3338 lck_mtx_lock(&fsw->fsw_reap_lock);
3339 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
3340
3341 /*
3342 * And wait for thread to terminate; use another
3343 * wait channel here other than fsw_reap_flags to
3344 * make it more explicit. In the event the reaper
3345 * thread misses a wakeup, we'll try again once
3346 * every second (except for the first time).
3347 */
3348 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
3349 uint64_t t = 0;
3350
3351 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
3352 clock_absolutetime_interval_to_deadline(t, &t);
3353 ASSERT(t != 0);
3354
3355 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
3356 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
3357 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3358 }
3359 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
3360 THREAD_UNINT, t);
3361 lck_mtx_unlock(&fsw->fsw_reap_lock);
3362 thread_block(THREAD_CONTINUE_NULL);
3363 lck_mtx_lock(&fsw->fsw_reap_lock);
3364 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
3365 }
3366 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
3367 lck_mtx_unlock(&fsw->fsw_reap_lock);
3368 fsw->fsw_reap_thread = THREAD_NULL;
3369 }
3370
3371 /* free any remaining flow entries in the linger list */
3372 fsw_linger_purge(fsw);
3373
3374 if (fsw->fsw_flow_mgr != NULL) {
3375 flow_mgr_destroy(fsw->fsw_flow_mgr);
3376 fsw->fsw_flow_mgr = NULL;
3377 }
3378
3379
3380 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
3381 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
3382 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
3383 }
3384
/*
 * Place a torn-down, destroyed flow entry on the flowswitch's linger
 * list so the reaper can generate any needed TCP aborts and free it
 * after its linger period expires.  The caller's reference on fe is
 * donated to the linger list.
 */
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	/* refresh _net_uptime before computing the expiry below */
	net_update_uptime();

	/* entry must be fully torn down but not yet lingering */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
	atomic_bitset_32(&fe->fe_flags, FLOWENTF_LINGERING);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	/* wake the reaper so it picks up the new linger entry */
	fsw_reap_sched(fsw);
}
3413
/*
 * Unlink a lingering flow entry from the given list, clear its
 * LINGERING flag and drop the list's reference (which may free the
 * entry).  The caller owns the list and any counter that tracks it;
 * this helper does not touch fsw_linger_cnt.
 */
static void
fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
	atomic_bitclear_32(&fe->fe_flags, FLOWENTF_LINGERING);

	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
	/* may be the last reference; fe is invalid after this */
	flow_entry_release(&fe);
}
3431
/*
 * Remove a flow entry from the flowswitch's own linger list and update
 * the list count.  Caller must hold fsw_linger_lock.
 */
static void
fsw_linger_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);

	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
	/* count must have been non-zero before this removal */
	VERIFY(fsw->fsw_linger_cnt != 0);
	fsw->fsw_linger_cnt--;
}
3443
3444 void
fsw_linger_purge(struct nx_flowswitch * fsw)3445 fsw_linger_purge(struct nx_flowswitch *fsw)
3446 {
3447 struct flow_entry *fe, *tfe;
3448
3449 lck_mtx_lock(&fsw->fsw_linger_lock);
3450 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
3451 fsw_linger_remove(fe);
3452 }
3453 ASSERT(fsw->fsw_linger_cnt == 0);
3454 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
3455 lck_mtx_unlock(&fsw->fsw_linger_lock);
3456 }
3457
3458 void
fsw_reap_sched(struct nx_flowswitch * fsw)3459 fsw_reap_sched(struct nx_flowswitch *fsw)
3460 {
3461 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
3462 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
3463 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
3464 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
3465 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
3466 }
3467 lck_mtx_unlock(&fsw->fsw_reap_lock);
3468 }
3469
__attribute__((noreturn))
/*
 * Entry point of the per-flowswitch reaper thread.  Sets the thread
 * name, parks on fsw_reap_flags and hands control to the continuation
 * (fsw_reap_thread_cont), which runs all subsequent wakeups.  Never
 * returns.
 */
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	/* block with a continuation; wakeups resume in fsw_reap_thread_cont */
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}
3490
3491 __attribute__((noreturn))
3492 static void
fsw_reap_thread_cont(void * v,wait_result_t wres)3493 fsw_reap_thread_cont(void *v, wait_result_t wres)
3494 {
3495 struct nx_flowswitch *fsw = v;
3496 boolean_t low;
3497 uint64_t t = 0;
3498
3499 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
3500
3501 lck_mtx_lock(&fsw->fsw_reap_lock);
3502 if (__improbable(wres == THREAD_INTERRUPTED ||
3503 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
3504 goto terminate;
3505 }
3506
3507 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
3508 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
3509 lck_mtx_unlock(&fsw->fsw_reap_lock);
3510
3511 net_update_uptime();
3512
3513 /* prevent detach from happening while we're here */
3514 if (!fsw_detach_barrier_add(fsw)) {
3515 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
3516 t = 0;
3517 } else {
3518 uint32_t fe_nonviable, fe_freed, fe_aborted;
3519 uint32_t fr_freed, fr_resid = 0;
3520 struct ifnet *ifp = fsw->fsw_ifp;
3521 uint64_t i = FSW_REAP_IVAL;
3522 uint64_t now = _net_uptime;
3523 uint64_t last;
3524
3525 ASSERT(fsw->fsw_ifp != NULL);
3526
3527 /*
3528 * Pass 1: process any deferred {withdrawn,nonviable} requests.
3529 */
3530 fe_nonviable = fsw_process_deferred(fsw);
3531
3532 /*
3533 * Pass 2: remove any expired lingering flows.
3534 */
3535 fe_freed = fsw_process_linger(fsw, &fe_aborted);
3536
3537 /*
3538 * Pass 3: prune idle flow routes.
3539 */
3540 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
3541 ifp, &fr_resid);
3542
3543 /*
3544 * Pass 4: prune flow table
3545 *
3546 */
3547 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
3548
3549 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
3550 "fe_aborted %u fr_freed %u/%u",
3551 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
3552 (fe_nonviable + fsw->fsw_pending_nonviable),
3553 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
3554 (fe_freed + fr_resid));
3555
3556 /* see if VM memory level is critical */
3557 low = skmem_lowmem_check();
3558
3559 /*
3560 * If things appear to be idle, we can prune away cached
3561 * object that have fallen out of the working sets (this
3562 * is different than purging). Every once in a while, we
3563 * also purge the caches. Note that this is done across
3564 * all flowswitch instances, and so we limit this to no
3565 * more than once every FSW_REAP_SK_THRES seconds.
3566 */
3567 atomic_get_64(last, &fsw_reap_last);
3568 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
3569 atomic_test_set_64(&fsw_reap_last, last, now)) {
3570 fsw_purge_cache(fsw, low);
3571
3572 /* increase sleep interval if idle */
3573 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
3574 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
3575 i <<= 3;
3576 }
3577 } else if (last == 0) {
3578 atomic_set_64(&fsw_reap_last, now);
3579 }
3580
3581 /*
3582 * Additionally, run thru the list of channels and prune
3583 * or purge away cached objects on "idle" channels. This
3584 * check is rate limited to no more than once every
3585 * FSW_DRAIN_CH_THRES seconds.
3586 */
3587 last = fsw->fsw_drain_channel_chk_last;
3588 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
3589 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
3590 fsw->fsw_flow_mgr->fm_name);
3591
3592 fsw->fsw_drain_channel_chk_last = now;
3593 fsw_drain_channels(fsw, now, low);
3594 } else if (__improbable(last == 0)) {
3595 fsw->fsw_drain_channel_chk_last = now;
3596 }
3597
3598 /*
3599 * Finally, invoke the interface's reap callback to
3600 * tell it to prune or purge away cached objects if
3601 * it is idle. This check is rate limited to no more
3602 * than once every FSW_REAP_IF_THRES seconds.
3603 */
3604 last = fsw->fsw_drain_netif_chk_last;
3605 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
3606 ASSERT(fsw->fsw_nifna != NULL);
3607
3608 if (ifp->if_na_ops != NULL &&
3609 ifp->if_na_ops->ni_reap != NULL) {
3610 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
3611 fsw->fsw_flow_mgr->fm_name);
3612 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
3613 FSW_REAP_IF_THRES, low);
3614 }
3615
3616 fsw->fsw_drain_netif_chk_last = now;
3617 } else if (__improbable(last == 0)) {
3618 fsw->fsw_drain_netif_chk_last = now;
3619 }
3620
3621 /* emit periodic interface stats ktrace */
3622 last = fsw->fsw_reap_last;
3623 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
3624 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
3625 ifp->if_data.ifi_ibytes * 8,
3626 ifp->if_data.ifi_opackets,
3627 ifp->if_data.ifi_obytes * 8);
3628
3629 fsw->fsw_reap_last = now;
3630 } else if (__improbable(last == 0)) {
3631 fsw->fsw_reap_last = now;
3632 }
3633
3634 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
3635 clock_absolutetime_interval_to_deadline(t, &t);
3636 ASSERT(t != 0);
3637
3638 /* allow any pending detach to proceed */
3639 fsw_detach_barrier_remove(fsw);
3640 }
3641
3642 lck_mtx_lock(&fsw->fsw_reap_lock);
3643 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
3644 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
3645 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
3646 THREAD_UNINT, t);
3647 lck_mtx_unlock(&fsw->fsw_reap_lock);
3648 thread_block_parameter(fsw_reap_thread_cont, fsw);
3649 /* NOTREACHED */
3650 __builtin_unreachable();
3651 } else {
3652 terminate:
3653 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
3654 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
3655 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
3656 /*
3657 * And signal any thread waiting for us to terminate;
3658 * wait channel here other than fsw_reap_flags to make
3659 * it more explicit.
3660 */
3661 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
3662 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
3663 }
3664 lck_mtx_unlock(&fsw->fsw_reap_lock);
3665
3666 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
3667
3668 /* for the extra refcnt from kernel_thread_start() */
3669 thread_deallocate(current_thread());
3670 /* this is the end */
3671 thread_terminate(current_thread());
3672 /* NOTREACHED */
3673 __builtin_unreachable();
3674 }
3675
3676 /* must never get here */
3677 VERIFY(0);
3678 /* NOTREACHED */
3679 __builtin_unreachable();
3680 }
3681
/*
 * Walk every nexus port and prune (or, when memory is low or the
 * channel has been idle long enough, purge) the cached objects of
 * channels that have done no work within FSW_DRAIN_CH_THRES.
 */
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nexus_adapter *na = nx_port_get_na(nx, p);
		/* skip empty ports and recently active channels */
		if (na == NULL || na->na_work_ts == 0 ||
		    (now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again. Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		boolean_t purge;
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge); /* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}
3720
3721 static void
fsw_purge_cache(struct nx_flowswitch * fsw,boolean_t low)3722 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
3723 {
3724 #pragma unused(fsw)
3725 uint64_t o = atomic_add_64_ov(&fsw_want_purge, 1);
3726 uint32_t p = fsw_flow_purge_thresh;
3727 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
3728
3729 SK_DF(SK_VERB_FLOW, "%s: %s caches",
3730 fsw->fsw_flow_mgr->fm_name,
3731 (purge ? "purge" : "prune"));
3732
3733 skmem_cache_reap_now(sk_fo_cache, purge);
3734 skmem_cache_reap_now(sk_fe_cache, purge);
3735 skmem_cache_reap_now(sk_fab_cache, purge);
3736 skmem_cache_reap_now(flow_route_cache, purge);
3737 skmem_cache_reap_now(flow_stats_cache, purge);
3738 eventhandler_reap_caches(purge);
3739 netns_reap_caches(purge);
3740 skmem_reap_caches(purge);
3741
3742 if (if_is_fsw_transport_netagent_enabled() && purge) {
3743 mbuf_drain(FALSE);
3744 }
3745 }
3746
/*
 * Mark a flow as wanting to become nonviable because the interface
 * entered low power mode.  The atomic 0->1 test-and-set ensures the
 * pending-nonviable counter is bumped exactly once per flow; already
 * nonviable flows are left alone.
 */
static void
fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	/* When the interface is in low power mode, the flow is nonviable */
	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    atomic_test_set_32(&fe->fe_want_nonviable, 0, 1)) {
		/* we won the 0->1 transition: account it as pending */
		atomic_add_32(&fsw->fsw_pending_nonviable, 1);
	}
}
3756
/*
 * Reaper pass 1: walk the flow owner table and commit any deferred
 * withdraw/nonviable requests by tearing the flows down.  Work that
 * cannot be done under the bucket locks (NECP notification, nexus
 * unassignment) is staged on a local list and performed afterwards.
 * Returns the number of flows processed from that list.
 */
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests. We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			/* best-effort otherwise: skip contended buckets */
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				/* commit a deferred TCP half-close */
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					atomic_bitclear_32(&fe->fe_flags, FLOWENTF_HALF_CLOSED);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				/*
				 * NOTE(review): assigns the raw flag mask,
				 * not a 0/1 value -- assumes fed_want_clonotify
				 * is wide enough to hold the bit; confirm.
				 */
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* do this outside the flow bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}
3872
/*
 * Drain the flowswitch's linger list: for each lingering (torn-down)
 * flow entry, emit a TCP RST to the peer if one is still owed, and
 * free entries whose linger period has expired.  Unexpired entries
 * are returned to the front of the linger list.
 *
 * @param fsw    flowswitch whose linger list is processed; its
 *               fsw_linger_lock is taken only briefly at entry/exit.
 * @param abort  out-param: number of flows for which a RST was sent.
 * @return       number of flow entries freed this pass.
 */
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = _net_uptime;
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock. Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	/* remember how many we took; the invariant check below relies on it */
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		/* only fully torn-down, destroyed entries may linger */
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS);
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;	/* freed above; don't touch again */
		}
		++i;
	}
	/* we must have visited exactly what we dequeued, freeing at most that */
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		/*
		 * Append any entries added to fsw_linger_head since we
		 * dropped the lock onto our local list, then move the whole
		 * list back — this puts the survivors in front of the new
		 * arrivals, preserving expiry order.
		 */
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}
3951
3952 __attribute__((always_inline))
3953 static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet * ifp,kern_packet_t ph)3954 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
3955 {
3956 switch (__packet_get_traffic_class(ph)) {
3957 case PKT_TC_BE:
3958 ifp->if_tc.ifi_ibepackets++;
3959 ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3960 break;
3961 case PKT_TC_BK:
3962 ifp->if_tc.ifi_ibkpackets++;
3963 ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3964 break;
3965 case PKT_TC_VI:
3966 ifp->if_tc.ifi_ivipackets++;
3967 ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3968 break;
3969 case PKT_TC_VO:
3970 ifp->if_tc.ifi_ivopackets++;
3971 ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
3972 break;
3973 default:
3974 break;
3975 }
3976 }
3977
3978 __attribute__((always_inline))
3979 static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet * ifp,uint32_t svc,uint32_t cnt,uint32_t len)3980 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
3981 uint32_t cnt, uint32_t len)
3982 {
3983 switch (svc) {
3984 case PKT_TC_BE:
3985 ifp->if_tc.ifi_obepackets += cnt;
3986 ifp->if_tc.ifi_obebytes += len;
3987 break;
3988 case PKT_TC_BK:
3989 ifp->if_tc.ifi_obkpackets += cnt;
3990 ifp->if_tc.ifi_obkbytes += len;
3991 break;
3992 case PKT_TC_VI:
3993 ifp->if_tc.ifi_ovipackets += cnt;
3994 ifp->if_tc.ifi_ovibytes += len;
3995 break;
3996 case PKT_TC_VO:
3997 ifp->if_tc.ifi_ovopackets += cnt;
3998 ifp->if_tc.ifi_ovobytes += len;
3999 break;
4000 default:
4001 break;
4002 }
4003 }
4004