1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54 /*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <kern/uipc_domain.h>
94 #include <sys/kdebug.h>
95 #include <sys/sdt.h>
96 #include <net/bpf.h>
97 #include <net/if_ports_used.h>
98 #include <net/pktap.h>
99 #include <net/droptap.h>
100 #include <net/pktsched/pktsched_netem.h>
101 #include <netinet/tcp.h>
102 #include <netinet/udp.h>
103 #include <netinet/ip.h>
104 #include <netinet/ip6.h>
105 #include <netinet/in_var.h>
106
107 extern kern_return_t thread_terminate(thread_t);
108
109 #define FSW_ZONE_MAX 256
110 #define FSW_ZONE_NAME "skywalk.nx.fsw"
111
112 static uint64_t fsw_reap_last __sk_aligned(8);
113 static uint64_t fsw_want_purge __sk_aligned(8);
114
115 #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
116 static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
117
118 #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
119 static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
120
121 #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
122 static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
123
124 #define NX_FSW_FRIB_HASHSZ 13 /* some mersenne prime */
125 static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
126
127 #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
128 static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
129
130 #define NX_FSW_RX_STALL_THRES 0 /* seconds (0 = disable) */
131 static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;
132
133 #define NX_FSW_RX_STALL_DEFUNCT 1 /* defunct Rx-stalled channel (0 = disable) */
134 static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;
135
136 #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
137 static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
138
139 #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
140 #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
141 #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
142 #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
143 #define FSW_IFSTATS_THRES 1
144
#define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
146 uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
147
148 #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
149
150 uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
151 uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
152 uint32_t fsw_gso_batch = 8;
153 #if (DEVELOPMENT || DEBUG)
154 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
155 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
156 "flowswitch Rx batch size");
157 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
158 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
159 "flowswitch Tx batch size");
160 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
162 "flowswitch GSO batch size");
163 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
164 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
165 "flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* DEVELOPMENT || DEBUG */
167
168 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
169 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
170 "flowswitch RX aggregation for tcp flows (enable/disable)");
171 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
172 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
173 "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
174 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
175 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
176 "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
177
178 /*
179 * IP reassembly
180 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
181 * enable/disable the reassembly routine regardless of whether the
182 * transport netagent is enabled or not.
183 *
184 * 'fsw_ip_reass' is a tri-state:
185 * 0 means force IP reassembly off
186 * 1 means force IP reassembly on
187 * 2 means don't force the value, use what's appropriate for this flowswitch
188 */
189 #define FSW_IP_REASS_FORCE_OFF 0
190 #define FSW_IP_REASS_FORCE_ON 1
191 #define FSW_IP_REASS_AUTO 2
192
193 uint32_t fsw_ip_reass = FSW_IP_REASS_AUTO;
194
195 static int
196 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
197 {
198 #pragma unused(oidp, arg1, arg2)
199 unsigned int new_value;
200 int changed;
201 int error;
202
203 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
204 &new_value, &changed);
205 if (error == 0 && changed != 0) {
206 if (new_value > FSW_IP_REASS_AUTO) {
207 return EINVAL;
208 }
209 fsw_ip_reass = new_value;
210 }
211 return error;
212 }
213
214 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
215 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
216 0, 0, fsw_ip_reass_sysctl, "IU",
217 "adjust flowswitch IP reassembly");
218
219 #if (DEVELOPMENT || DEBUG)
220 static uint64_t _fsw_inject_error = 0;
221 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
222 _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
223 &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
224
225 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
226 if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
227 SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
228 if ((_f) != NULL) \
229 (_f)(__VA_ARGS__); \
230 } \
231 } while (0)
232
233 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
234 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
235 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
236 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
237 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
238 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
239 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
240 flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
241 &fsw_flow_route_id_buckets, 0, "");
242 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
243 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
244 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
245 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
246 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
247 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
248 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
249 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
250 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
252 #else
253 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
254 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
255 #endif /* !DEVELOPMENT && !DEBUG */
256
257 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
258 struct flow_entry *);
259 static void fsw_reap_thread_func(void *, wait_result_t);
260 static void fsw_reap_thread_cont(void *, wait_result_t);
261 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
262 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
263 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
264 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
265 static void fsw_process_rxstrc(struct nx_flowswitch *);
266
267 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
268 struct __kern_packet *);
269
270 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
271 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
272 uint32_t, uint32_t);
273
274 static int __fsw_dp_inited = 0;
275
int
fsw_dp_init(void)
{
	/*
	 * One-time initialization of the flowswitch datapath.  The
	 * static_asserts pin the fixed nexus port layout: DEV is port 0,
	 * HOST is port 1, and both sit below the first user-visible port.
	 */
	static_assert(FSW_VP_DEV == 0);
	static_assert(FSW_VP_HOST == 1);
	static_assert((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	static_assert((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	/* must not be initialized twice */
	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}
293
294 void
fsw_dp_uninit(void)295 fsw_dp_uninit(void)
296 {
297 if (__fsw_dp_inited) {
298 flow_fini();
299 flow_mgr_fini();
300
301 __fsw_dp_inited = 0;
302 }
303 }
304
/*
 * Free every packet in a queue back to its packet pool.  The
 * flowswitch argument is unused; it exists so this function matches
 * the call shape used by the dp_drop_pktq() macro.
 */
static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}
310
/*
 * dp_drop_pktq: drop an entire packet queue.  Updates the flowswitch
 * drop statistics and, when a droptap is attached
 * (droptap_total_tap_count != 0), reports each packet via
 * droptap_{output,input}_packet() (selected by `outgoing') before the
 * queue is freed.  No-op on an empty queue.
 */
#define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
		ASSERT(_len == 0); \
		break; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	if (__probable(droptap_total_tap_count == 0)) { \
		dp_free_pktq(fsw, pktq); \
		break; \
	} \
	drop_func_t dropfunc; \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *kpkt = KPKTQ_FIRST(pktq); \
	struct __kern_packet *next_pkt; \
	for (; kpkt != NULL; kpkt = next_pkt) { \
		next_pkt = kpkt->pkt_nextpkt; \
		dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags, \
		    fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, \
		    0, 0); \
	} \
	dp_free_pktq(fsw, pktq); \
} while (0)

/*
 * dp_drop_pkt_single_nofree: account for and droptap-report a single
 * dropped packet WITHOUT freeing it; the caller keeps ownership of
 * the packet.
 */
#define dp_drop_pkt_single_nofree(fsw, pkt, outgoing, _reason, _flags) do { \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
	FSW_STATS_ADD(FSW_STATS_DROP, 1); \
	if (__probable(droptap_total_tap_count == 0)) { \
		break; \
	} \
	drop_func_t dropfunc; \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
} while (0)

/*
 * dp_drop_pkt_single: account for, droptap-report, and free a single
 * dropped packet.
 */
#define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do { \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
	FSW_STATS_ADD(FSW_STATS_DROP, 1); \
	if (__probable(droptap_total_tap_count == 0)) { \
		pp_free_packet_single(pkt); \
		break; \
	} \
	drop_func_t dropfunc; \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
	pp_free_packet_single(pkt); \
} while (0)

/*
 * dp_drop_pkt_chain: droptap-report and free a chain of packets linked
 * through pkt_nextpkt.  Unlike the variants above, this takes no
 * flowswitch argument (NULL ifp is reported to droptap) and does not
 * touch the drop statistics; callers account separately.
 */
#define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do { \
	if (__probable(droptap_total_tap_count == 0)) { \
		pp_free_packet_chain(pkt, NULL); \
		break; \
	} \
	drop_func_t dropfunc; \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *next_pkt; \
	for (; pkt != NULL; pkt = next_pkt) { \
		next_pkt = pkt->pkt_nextpkt; \
		dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
		    NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL, \
		    0, 0); \
	} \
	pp_free_packet_chain(pkt, NULL); \
} while (0)
379
380
/*
 * Mirror the packets of a flow to pktap, tagged with the flow's owning
 * (pid/proc_name) and effective (epid/eproc_name) process identity.
 * Host-bound (FSW_VP_HOST) traffic is normally tapped later in the
 * dlil input path; the one exception handled here is inbound TCP,
 * which is tapped "early" so it can be observed before Rx aggregation.
 * Flows that are neither IPv4 nor IPv6 are ignored.
 */
SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
    bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	const char *__null_terminated eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	/* map the flow's IP version to an address family for pktap */
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	/* snapshot process identity recorded on the flow entry */
	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
		    fe->fe_proc_name, sizeof(fe->fe_proc_name));
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
		    fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
	}
	if (input) {
		/* early-tapped packets are marked as socket-level traffic */
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}
442
443 #if (DEVELOPMENT || DEBUG)
/*
 * Error-injection helper #35 (DEVELOPMENT/DEBUG kernels only, driven
 * by the fsw_inject_error sysctl).  Step 1 artificially clears
 * FLOWRTF_RESOLVED on a fully-resolved flow route; step 2 then
 * restores the flag, drops any attached mbuf, and forces the caller
 * to bail out with EJUSTRETURN.
 */
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	/* remembers whether step 1 actually modified the route flags */
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		/* release the mbuf before faking the error return */
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
478
/*
 * Error-injection helper #36 (DEVELOPMENT/DEBUG kernels only).  Same
 * two-step pattern as _fsw_error35_handler(), but step 2 forces an
 * ENETUNREACH error instead of EJUSTRETURN and does not touch the
 * packet.
 */
static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	/* remembers whether step 1 actually modified the route flags */
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
507 #else /* !DEVELOPMENT && !DEBUG */
508 #define _fsw_error35_handler(...)
509 #define _fsw_error36_handler(...)
510 #endif /* DEVELOPMENT || DEBUG */
511
512 /*
513 * Check if the source packet content can fit into the destination
514 * ring's packet. Returns TRUE if the source packet can fit.
515 * Note: Failures could be caused by misconfigured packet pool sizes,
 * missing packet size check against MTU or if the source packet is from
517 * a compat netif and the attached mbuf is larger than MTU due to LRO.
518 */
519 static inline boolean_t
validate_pkt_len(struct __kern_packet * spkt,kern_packet_t dph,uint32_t skip_l2hlen,uint32_t l2hlen,uint16_t headroom,uint32_t * copy_len)520 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
521 uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
522 uint32_t *copy_len)
523 {
524 uint32_t tlen = 0;
525 uint32_t splen = spkt->pkt_length - skip_l2hlen;
526
527 if (l2hlen != 0) {
528 VERIFY(skip_l2hlen == 0);
529 tlen += l2hlen;
530 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
531 splen -= ETHER_CRC_LEN;
532 }
533
534 tlen += splen;
535 *copy_len = splen;
536
537 return tlen <= ((__packet_get_buflet_count(dph) *
538 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
539 headroom);
540 }
541
542 #if SK_LOG
543 /* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	/* pick mbuf-copy vs packet-copy verbosity based on the source */
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	pkt_len = __packet_get_real_data_length(dpkt);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	/* hex-dump at most 128 bytes of the destination buffer */
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, pkt_len, 128));
}
564 #else
565 #define copy_packet_from_dev_log(...)
566 #endif /* SK_LOG */
567
568
/*
 * Copy one Rx packet from the device (netif) pool into a flowswitch
 * pool packet `dpkt' and finalize it for attachment to the Rx ring.
 * The L2 header is not copied; the destination starts at offset 0.
 * Returns 0 on success, or EINVAL when the source contents cannot fit
 * into the destination packet's buflets.
 */
static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * source and destination nexus don't share the packet pool
	 * sync operation here is to
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* destination carries no headroom and no L2 header */
	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		/* account the traffic class before the mbuf is released */
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_freem(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}
669
/*
 * Run an Rx packet through IP reassembly when enabled.  Returns the
 * packet unchanged when reassembly is off, the (possibly reassembled)
 * packet from the fragment manager, or NULL when no complete datagram
 * is available yet or the fragment was bad and freed.
 */
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	/* honor the kern.skywalk.flowswitch.ip_reass override */
	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		/* auto: skip reassembly when no flows could consume it */
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	/* locate the L3 header within the packet buffer */
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *ip = l3_hdr;
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
	} else {
		struct ip6_hdr *ip6_hdr = l3_hdr;
		struct ip6_frag *__single ip6_frag =
		    (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));

		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		/* pkt and nfrags must agree: pkt != NULL iff nfrags > 0 */
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}
721
/*
 * Prepare an mbuf-backed (compat netif) Rx packet for classification
 * by copying the leading protocol headers from the attached mbuf into
 * the packet buffer, then finalizing the packet with the mbuf still
 * attached.
 */
SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification.
	 * Ethernet(14) + IPv6 header(40) +
	 * + IPv6 fragment header(8) +
	 * TCP header with options(60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	/* finalize while keeping the mbuf attached for the full copy later */
	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}
758
759 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)760 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
761 {
762 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
763
764 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
765 rx_prepare_packet_mbuf(fsw, pkt);
766 }
767
768 return pkt;
769 }
770
/*
 * Find the flow entry matching a classified packet, for either the Rx
 * (input == true) or Tx direction.  `prev_fe' is an optional hint from
 * the caller's previous lookup; when its 5-tuple key matches, the flow
 * table search is skipped.  Parent/child (demuxed) flows are resolved
 * to the appropriate child where possible.  Returns a retained flow
 * entry (caller must release), or NULL when no flow matches.
 */
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *__single fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	/* fast path: compare the hint against the full 5-tuple */
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			/* hint missed; restore the key for the table search */
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	/* resolve parent/child (demuxed) flows to the right child */
	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				/* child whose demux doesn't match: search again */
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					struct flow_entry *__single parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					/* wrong child for this flow id: search again */
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe %p",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name(current_proc()),
	    fk2str(&key, fkbuf, sizeof(fkbuf)), SK_KVA(fe));

	return fe;
}
840
/*
 * Decide whether a flow-classified packet that matched a 2-tuple
 * listener flow is actually destined to this host.  Destinations that
 * are inherently local-ish (zeronet, loopback, link-local, multicast,
 * broadcast, well-known relay/anycast ranges) are accepted outright;
 * otherwise the destination must match a configured interface address.
 * Returns false when nothing matches — in forwarding setups the packet
 * may instead be destined to a client device behind this node.
 */
SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	/* caller guarantees the packet has been flow-classified */
	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		/* special/local IPv4 destination classes: always accept */
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact match with another
				 * interface
				 */
			}
		}

		/*
		 * Accept an address match unless it is on a different
		 * tunnel-style interface (IPSEC/UTUN) with forwarding off,
		 * in which case the strict check drops the packet.
		 * Note the rwlock is released on both paths below.
		 */
		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface. Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				/* directed broadcast or net-broadcast match */
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		struct in6_ifaddrhashhead *ia6_hash_head;

		/* special/local IPv6 destination classes: always accept */
		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		/* XXX -fbounds-safety: external dependency on ip6_input.c */
		ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
		    in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
		ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];

		TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
			    ia6->ia_ifp->if_index, ifp->if_index)) {
				/* skip addresses not yet usable (DAD) or CLAT46 */
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact match with another
				 * interface
				 */
			}
		}
		/* same strict-check logic as the IPv4 path above */
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address
	 * of the packet does not match any interface
	 * address, it maybe destined to the client device
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}
992
/*
 * Look up the Rx flow entry for pkt.  Returns a retained flow entry on
 * success; returns NULL (after releasing any reference taken) when no
 * flow is found, when a 2-tuple listener match fails the destination
 * address check, or when the flow has been torn down.  prev_fe is a
 * hint carried over from the previous lookup.
 */
static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *__single fe;

	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
	/* error-injection hook (DEVELOPMENT/DEBUG): force a lookup miss */
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		return NULL;
	}

	/*
	 * A 2-tuple (local address/port only) listener match still needs
	 * the packet's destination vetted against local interface
	 * addresses before the listener may see it.
	 */
	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
	    fe->fe_flags & FLOWENTF_LISTENER) &&
	    !pkt_is_for_listener(fe, pkt)) {
		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
		flow_entry_release(&fe);
		return NULL;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down");
		flow_entry_release(&fe);
		return NULL;
	}

	/*
	 * NOTE(review): this branch has no explicit "return NULL"; it
	 * relies on flow_entry_release() clearing fe so the final return
	 * yields NULL — confirm flow_entry_release() NULLs its argument.
	 */
	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
		FSW_STATS_INC(FSW_STATS_RX_DISABLED);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx not allowed for this flow");
		flow_entry_release(&fe);
	}
	return fe;
}
1030
/*
 * Enqueue an Rx packet (chain) onto fe's Rx queue and, if this caller
 * is the designated worker for the flow entry and the queue was empty,
 * add fe to the caller's flow entry list for processing.  Callers that
 * are not the worker (or find the queue non-empty) only enqueue and
 * drop their flow entry reference.  tid identifies the calling thread.
 */
static inline void
rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt, uint64_t tid)
{
	/*
	 * Among threads working on the same fe, the first thread that reaches here
	 * will be responsible for processing all the packets until a point when
	 * it does not see new packets in fe_rx_pktq. Other threads only
	 * enqueue their packets but do not add the flow entry to their flow entry list.
	 */
	lck_mtx_lock(&fe->fe_rx_pktq_lock);

	/* first arrival claims the worker role; contention is counted */
	if (fe->fe_rx_worker_tid == 0) {
		fe->fe_rx_worker_tid = tid;
	} else if (__improbable(fe->fe_rx_worker_tid != tid)) {
		STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
	}

	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		/* worker with an empty queue: fe joins the work list */
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
	} else {
		/* non-worker (or queue already populated): just enqueue */
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		flow_entry_release(&fe);
	}
}
1066
1067 static void
tx_flow_batch_packet(struct flow_entry_list * fes,struct flow_entry * fe,struct __kern_packet * pkt)1068 tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
1069 struct __kern_packet *pkt)
1070 {
1071 /* record frag continuation */
1072 if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
1073 ASSERT(pkt->pkt_flow_ip_is_frag);
1074 fe->fe_tx_is_cont_frag = true;
1075 fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
1076 } else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
1077 fe->fe_tx_is_cont_frag = false;
1078 fe->fe_tx_frag_id = 0;
1079 }
1080
1081 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
1082 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
1083 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
1084 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1085 } else {
1086 ASSERT(!TAILQ_EMPTY(fes));
1087 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1088 flow_entry_release(&fe);
1089 }
1090 }
1091
/*
 * Dequeue up to n_pkts_max packets from the Rx channel ring r — slots
 * in [ckr_khead, ckr_rhead) — into pktq.  Packets marked QUM_F_DROPPED
 * (possibly via error injection) or with zero length are freed and
 * counted as drops.  On return, *n_bytes holds the total bytes moved
 * and the ring's khead/ktail are advanced past the consumed slots.
 */
static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		/* take ownership of the packet away from the slot */
		KR_SLOT_DETACH_METADATA(r, ksd);

		/* error-injection hook: pretend the packet was dropped */
		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	/* publish consumed slots back to the ring */
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}
1127
1128 /*
1129 * This is only for estimating how many packets each GSO packet will need.
1130 * The number does not need to be exact because any leftover packets allocated
1131 * will be freed.
1132 */
1133 static uint32_t
estimate_gso_pkts(struct __kern_packet * pkt)1134 estimate_gso_pkts(struct __kern_packet *pkt)
1135 {
1136 packet_tso_flags_t tso_flags;
1137 uint16_t mss;
1138 uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1139
1140 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1141 mss = pkt->pkt_proto_seg_sz;
1142
1143 if (tso_flags == PACKET_TSO_IPV4) {
1144 total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1145 } else if (tso_flags == PACKET_TSO_IPV6) {
1146 total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1147 }
1148 if (total_hlen != 0 && mss != 0) {
1149 total_len = pkt->pkt_length;
1150 n_pkts = (uint32_t)
1151 (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1152 }
1153 DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1154 uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1155 uint32_t, n_pkts);
1156 return n_pkts;
1157 }
1158
1159 /*
1160 * This function retrieves a chain of packets of the same type only
1161 * (GSO or non-GSO).
1162 */
1163 static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,uint32_t n_pkts_max,struct pktq * pktq,uint32_t * n_bytes,uint32_t * gso_pkts_estimate)1164 fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1165 struct __kern_channel_ring *r, uint32_t n_pkts_max,
1166 struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1167 {
1168 uint32_t n_pkts = 0;
1169 slot_idx_t idx, idx_end;
1170 idx = r->ckr_khead;
1171 idx_end = r->ckr_rhead;
1172 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1173 boolean_t gso_enabled, gso_required;
1174 uint32_t gso_pkts;
1175
1176 gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1177 ASSERT(KPKTQ_EMPTY(pktq));
1178 *n_bytes = 0;
1179 for (; n_pkts < n_pkts_max &&
1180 (!gso_enabled || fsw_gso_batch == 0 ||
1181 *gso_pkts_estimate < fsw_gso_batch) &&
1182 idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1183 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1184 struct __kern_packet *pkt = ksd->sd_pkt;
1185
1186 ASSERT(pkt->pkt_nextpkt == NULL);
1187
1188 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1189 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1190 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1191 || (pkt->pkt_length == 0)) {
1192 KR_SLOT_DETACH_METADATA(r, ksd);
1193 FSW_STATS_INC(FSW_STATS_DROP);
1194 pp_free_packet_single(pkt);
1195 continue;
1196 }
1197 if (gso_enabled) {
1198 gso_pkts = estimate_gso_pkts(pkt);
1199
1200 /*
1201 * We use the first packet to determine what
1202 * type the subsequent ones need to be (GSO or
1203 * non-GSO).
1204 */
1205 if (n_pkts == 0) {
1206 gso_required = (gso_pkts != 0);
1207 } else {
1208 if (gso_required != (gso_pkts != 0)) {
1209 break;
1210 }
1211 }
1212 *gso_pkts_estimate += gso_pkts;
1213 }
1214 KR_SLOT_DETACH_METADATA(r, ksd);
1215 if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1216 __packet_set_tx_nx_port(SK_PKT2PH(pkt),
1217 vpna->vpna_nx_port, vpna->vpna_gencnt);
1218 }
1219 n_pkts++;
1220 *n_bytes += pkt->pkt_length;
1221 KPKTQ_ENQUEUE(pktq, pkt);
1222 }
1223 r->ckr_khead = idx;
1224 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1225 DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1226 ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1227 uint32_t, *gso_pkts_estimate);
1228 }
1229
/*
 * Attach as many packets from pktq as fit into free slots of the Rx
 * channel ring r, finalize them, publish the new tail, and notify the
 * channel.  Packets that do not fit remain in pktq for the caller to
 * dispose of (see fsw_ring_enqueue_tail_drop()).
 */
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	/* error-injection hooks: simulate a full ring */
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		/* ktrace: flowswitch Rx stage ends, channel stage begins */
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	/* wake up the channel consumer */
	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}
1282
1283 static void
pkts_to_pktq(struct __kern_packet ** __counted_by (n_pkts)pkts,uint32_t n_pkts,struct pktq * pktq)1284 pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1285 {
1286 ASSERT(KPKTQ_EMPTY(pktq));
1287
1288 for (uint32_t i = 0; i < n_pkts; i++) {
1289 struct __kern_packet *__single pkt = pkts[i];
1290 ASSERT(pkt->pkt_nextpkt == NULL);
1291 KPKTQ_ENQUEUE(pktq, pkt);
1292 }
1293 }
1294
1295 /*
1296 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1297 */
1298 SK_NO_INLINE_ATTRIBUTE
1299 static void
convert_native_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_headp,struct mbuf ** m_tailp,uint32_t * cnt,uint32_t * bytes)1300 convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1301 struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1302 {
1303 uint32_t tot_cnt;
1304 unsigned int num_segs = 1;
1305 struct mbuf *__single mhead, *__single head = NULL;
1306 struct mbuf *__single tail = NULL, **__single tailp = &head;
1307 uint32_t mhead_cnt, mhead_bufsize;
1308 uint32_t mhead_waste = 0;
1309 uint32_t mcnt = 0, mbytes = 0;
1310 uint32_t largest, max_pkt_len;
1311 struct __kern_packet *__single pkt;
1312 struct kern_pbufpool *pp;
1313
1314 tot_cnt = KPKTQ_LEN(pktq);
1315 ASSERT(tot_cnt > 0);
1316 mhead_cnt = tot_cnt;
1317
1318 /*
1319 * Opportunistically batch-allocate the mbufs based on the largest
1320 * packet size we've seen in the recent past. Note that we reset
1321 * fe_rx_largest_size below if we notice that we're under-utilizing the
1322 * allocated buffers (thus disabling this batch allocation).
1323 */
1324 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1325 if (__probable(largest != 0)) {
1326 if (largest <= MCLBYTES) {
1327 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1328 &num_segs, M_NOWAIT, 1, 0);
1329 mhead_bufsize = MCLBYTES;
1330 } else if (largest <= MBIGCLBYTES) {
1331 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1332 &num_segs, M_NOWAIT, 1, 0);
1333 mhead_bufsize = MBIGCLBYTES;
1334 } else if (largest <= M16KCLBYTES) {
1335 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1336 &num_segs, M_NOWAIT, 1, 0);
1337 mhead_bufsize = M16KCLBYTES;
1338 } else if (largest <= M16KCLBYTES * 2) {
1339 num_segs = 2;
1340 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1341 &num_segs, M_NOWAIT, 1, 0);
1342 mhead_bufsize = M16KCLBYTES * 2;
1343 } else {
1344 mhead = NULL;
1345 mhead_bufsize = mhead_cnt = 0;
1346 }
1347 } else {
1348 mhead = NULL;
1349 mhead_bufsize = mhead_cnt = 0;
1350 }
1351 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1352 uint32_t, mhead_cnt, uint32_t, tot_cnt);
1353
1354 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1355 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1356
1357 KPKTQ_FOREACH(pkt, pktq) {
1358 uint32_t tot_len, len;
1359 uint16_t pad, llhlen, iphlen;
1360 boolean_t do_cksum_rx;
1361 struct mbuf *__single m;
1362 int error;
1363
1364 llhlen = pkt->pkt_l2_len;
1365 len = pkt->pkt_length;
1366 if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
1367 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1368 struct __kern_packet *, pkt);
1369 FSW_STATS_INC(FSW_STATS_DROP);
1370 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1371 continue;
1372 }
1373 /* begin payload on 32-bit boundary; figure out the padding */
1374 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1375 tot_len = pad + len;
1376
1377 /* remember largest packet size */
1378 if (__improbable(largest < tot_len)) {
1379 largest = MAX(tot_len, MCLBYTES);
1380 }
1381
1382 /*
1383 * If the above batch allocation returned partial
1384 * success, we try a blocking allocation here again.
1385 */
1386 m = mhead;
1387 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1388 ASSERT(mhead != NULL || mhead_cnt == 0);
1389 num_segs = 1;
1390 if (tot_len > M16KCLBYTES) {
1391 num_segs = 0;
1392 }
1393 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1394 &num_segs, &m)) != 0) {
1395 DTRACE_SKYWALK2(bad__len,
1396 struct nx_flowswitch *, fsw,
1397 struct __kern_packet *, pkt);
1398 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1399 FSW_STATS_INC(FSW_STATS_DROP);
1400 continue;
1401 }
1402 } else {
1403 mhead = m->m_nextpkt;
1404 m->m_nextpkt = NULL;
1405 ASSERT(mhead_cnt != 0);
1406 --mhead_cnt;
1407
1408 /* check if we're underutilizing large buffers */
1409 if (__improbable(mhead_bufsize > MCLBYTES &&
1410 tot_len < (mhead_bufsize >> 1))) {
1411 ++mhead_waste;
1412 }
1413 /*
1414 * Clean up unused mbuf.
1415 * Ony need to do this when we pre-alloc 2x16K mbufs
1416 */
1417 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1418 ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1419 struct mbuf *m_extra = m->m_next;
1420 ASSERT(m_extra != NULL);
1421 ASSERT(m_extra->m_len == 0);
1422 ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1423 m->m_next = NULL;
1424 m_freem(m_extra);
1425 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1426 }
1427 }
1428 m->m_data += pad;
1429 /*
1430 * XXX -fbounds-safety: external dependency
1431 * mtod does not work because m_len is 0
1432 */
1433 m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1434
1435 /* don't include IP header from partial sum */
1436 if (__probable((pkt->pkt_qum_qflags &
1437 QUM_F_FLOW_CLASSIFIED) != 0)) {
1438 iphlen = pkt->pkt_flow_ip_hlen;
1439 do_cksum_rx = sk_cksum_rx;
1440 } else {
1441 iphlen = 0;
1442 do_cksum_rx = FALSE;
1443 }
1444
1445 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1446 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1447 llhlen + iphlen);
1448
1449 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1450 if (do_cksum_rx) {
1451 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1452 }
1453 #if DEBUG || DEVELOPMENT
1454 if (__improbable(pkt_trailers > 0)) {
1455 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1456 }
1457 #endif /* DEBUG || DEVELOPMENT */
1458 m_adj(m, llhlen);
1459
1460 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1461 if (__improbable((pkt->pkt_link_flags &
1462 PKT_LINKF_ETHFCS) != 0)) {
1463 m->m_flags |= M_HASFCS;
1464 }
1465 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1466 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1467 }
1468 ASSERT(m->m_nextpkt == NULL);
1469 tail = m;
1470 *tailp = m;
1471 tailp = &m->m_nextpkt;
1472 mcnt++;
1473 mbytes += m_pktlen(m);
1474 }
1475 /* free any leftovers */
1476 if (__improbable(mhead != NULL)) {
1477 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1478 ASSERT(mhead_cnt != 0);
1479 (void) m_freem_list(mhead);
1480 mhead = NULL;
1481 mhead_cnt = 0;
1482 }
1483
1484 /* reset if most packets (>50%) are smaller than our batch buffers */
1485 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1486 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1487 struct flow_entry *, NULL, uint32_t, mhead_waste,
1488 uint32_t, tot_cnt);
1489 largest = 0;
1490 }
1491
1492 if (largest != fsw->fsw_rx_largest_size) {
1493 os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1494 }
1495
1496 pp_free_pktq(pktq);
1497 *m_headp = head;
1498 *m_tailp = tail;
1499 *cnt = mcnt;
1500 *bytes = mbytes;
1501 }
1502
1503 /*
1504 * This function only extracts the mbuf from the packet. The caller frees
1505 * the packet.
1506 */
1507 static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * pkt)1508 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1509 {
1510 struct mbuf *m;
1511 struct pkthdr *mhdr;
1512 uint16_t llhlen;
1513
1514 m = pkt->pkt_mbuf;
1515 ASSERT(m != NULL);
1516
1517 llhlen = pkt->pkt_l2_len;
1518 if (llhlen > pkt->pkt_length) {
1519 m_freem(m);
1520 KPKT_CLEAR_MBUF_DATA(pkt);
1521 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1522 struct __kern_packet *, pkt);
1523 FSW_STATS_INC(FSW_STATS_DROP);
1524 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1525 return NULL;
1526 }
1527 mhdr = &m->m_pkthdr;
1528 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1529 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1530 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1531 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1532 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1533 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1534 }
1535 #if DEBUG || DEVELOPMENT
1536 uint32_t extra = 0;
1537 if (__improbable(pkt_trailers > 0)) {
1538 extra = pkt_add_trailers_mbuf(m, llhlen);
1539 }
1540 #endif /* DEBUG || DEVELOPMENT */
1541 m_adj(m, llhlen);
1542 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1543 KPKT_CLEAR_MBUF_DATA(pkt);
1544 return m;
1545 }
1546
1547 SK_NO_INLINE_ATTRIBUTE
1548 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_head,struct mbuf ** m_tail,uint32_t * cnt,uint32_t * bytes)1549 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1550 struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1551 {
1552 struct __kern_packet *pkt;
1553 struct mbuf *__single m, *__single head = NULL;
1554 struct mbuf *__single tail = NULL, **__single tailp = &head;
1555 uint32_t c = 0, b = 0;
1556
1557 KPKTQ_FOREACH(pkt, pktq) {
1558 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1559 if (__improbable(m == NULL)) {
1560 continue;
1561 }
1562 tail = m;
1563 *tailp = m;
1564 tailp = &m->m_nextpkt;
1565 c++;
1566 b += m_pktlen(m);
1567 }
1568 pp_free_pktq(pktq);
1569 *m_head = head;
1570 *m_tail = tail;
1571 *cnt = c;
1572 *bytes = b;
1573 }
1574
1575 void
fsw_host_sendup(struct ifnet * ifp,struct mbufq * host_mq)1576 fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq)
1577 {
1578 struct ifnet_stat_increment_param s;
1579
1580 if (mbufq_empty(host_mq)) {
1581 return;
1582 }
1583
1584 bzero(&s, sizeof(s));
1585 s.packets_in = host_mq->count;
1586 s.bytes_in = host_mq->bytes;
1587 dlil_input_handler(ifp, mbufq_first(host_mq), mbufq_last(host_mq), &s, FALSE, NULL);
1588 }
1589
1590 void
fsw_host_rx_cb(struct nx_flowswitch * fsw,struct pktq * pktq)1591 fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq)
1592 {
1593 ifnet_fsw_rx_cb_t __single cb;
1594 void *__single cb_arg;
1595
1596 ASSERT(!KPKTQ_EMPTY(pktq));
1597 if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1598 ASSERT(cb != NULL);
1599 ASSERT(cb_arg != NULL);
1600 (*cb)(cb_arg, pktq);
1601 ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1602 if (KPKTQ_EMPTY(pktq)) {
1603 return;
1604 } else {
1605 DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1606 struct pktq *, pktq);
1607 }
1608 }
1609 }
1610
1611 void
fsw_host_rx_enqueue_mbq(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbufq * host_mq)1612 fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq,
1613 struct mbufq *host_mq)
1614 {
1615 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1616 uint32_t cnt = 0, bytes = 0;
1617 boolean_t compat;
1618
1619 if (KPKTQ_EMPTY(pktq)) {
1620 return;
1621 }
1622
1623 /* All packets in the pktq must have the same type */
1624 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1625 if (compat) {
1626 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1627 &bytes);
1628 } else {
1629 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1630 &bytes);
1631 }
1632 if (__improbable(m_head == NULL)) {
1633 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1634 return;
1635 }
1636
1637 mbufq_enqueue(host_mq, m_head, m_tail, cnt, bytes);
1638 }
1639
/*
 * Enqueue as many packets from pktq onto ring r as fit, then drop
 * whatever remains (ring full), counting the drops and refreshing the
 * Rx-stall detection timestamp.
 */
void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	/*
	 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
	 * This is to ensure we use the timestamp of the earliest enqueue without
	 * a dequeue.
	 */
	if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
		r->ckr_rx_enqueue_ts = net_uptime();
	}
	/* anything still in pktq did not fit into the ring; drop it */
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
	    DROPTAP_FLAG_L2_MISSING);
}
1657
1658 static struct nexus_adapter *
flow_get_na(struct nx_flowswitch * fsw,struct flow_entry * fe)1659 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1660 {
1661 struct kern_nexus *nx = fsw->fsw_nx;
1662 struct nexus_adapter *na = NULL;
1663 nexus_port_t port = fe->fe_nx_port;
1664
1665 if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1666 SK_ERR("dev or host ports have no NA");
1667 return NULL;
1668 }
1669
1670 if (__improbable(!nx_port_is_valid(nx, port))) {
1671 SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1672 if_name(fsw->fsw_ifp), port);
1673 return NULL;
1674 }
1675
1676 na = nx_port_get_na(nx, port);
1677 if (__improbable(na == NULL)) {
1678 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1679 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1680 if_name(fsw->fsw_ifp), port);
1681 return NULL;
1682 }
1683
1684 if (__improbable(!NA_IS_ACTIVE(na))) {
1685 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1686 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1687 if_name(fsw->fsw_ifp), port);
1688 return NULL;
1689 }
1690
1691 if (__improbable(nx_port_is_defunct(nx, port))) {
1692 FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1693 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1694 if_name(fsw->fsw_ifp), port);
1695 return NULL;
1696 }
1697
1698 return na;
1699 }
1700
1701 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1702 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1703 {
1704 struct nexus_vp_adapter *na = NULL;
1705 struct __kern_channel_ring *__single r = NULL;
1706
1707 na = VPNA(flow_get_na(fsw, fe));
1708 if (__improbable(na == NULL)) {
1709 return NULL;
1710 }
1711
1712 switch (txrx) {
1713 case NR_RX:
1714 r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1715 break;
1716 case NR_TX:
1717 r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1718 break;
1719 default:
1720 __builtin_unreachable();
1721 VERIFY(0);
1722 }
1723
1724 if (__improbable(KR_DROP(r))) {
1725 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1726 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %p %s drop mode",
1727 SK_KVA(r), r->ckr_name);
1728 return NULL;
1729 }
1730
1731 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1732
1733 #if (DEVELOPMENT || DEBUG)
1734 if (r != NULL) {
1735 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1736 }
1737 #endif /* DEVELOPMENT || DEBUG */
1738
1739 return r;
1740 }
1741
/*
 * Convenience wrapper: return the Rx ring for fe's nexus port, or
 * NULL if unavailable (see flow_get_ring()).
 */
struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}
1747
/*
 * Validate the flow entry's route/local-address state.  If the flow's
 * source address is no longer configured on the interface, request
 * that the flow be marked nonviable (done later by the reaper, since
 * this path holds no writer lock) and that its flow route be
 * re-configured.  Returns false when the flow is (or is about to be)
 * nonviable and its packets should be dropped; true otherwise.
 */
static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	/*
	 * Only re-validate when the interface's address generation count
	 * has moved past the one cached on the flow entry.
	 */
	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 *
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
				fsw_reap_sched(fsw);
			} else {
				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
			}
		}
		if (fr != NULL) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow %p non-viable",
		    SK_KVA(fe));
		return false;
	}
	return true;
}
1797
1798 bool
dp_flow_rx_route_process(struct nx_flowswitch * fsw,struct flow_entry * fe)1799 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1800 {
1801 bool okay;
1802 okay = dp_flow_route_process(fsw, fe);
1803 #if (DEVELOPMENT || DEBUG)
1804 if (okay) {
1805 _FSW_INJECT_ERROR(5, okay, false, null_func);
1806 }
1807 #endif /* DEVELOPMENT || DEBUG */
1808
1809 return okay;
1810 }
1811
/*
 * Rx datapath for a single user flow: validates the flow's route, then
 * moves each packet from the device pool into the destination channel's
 * packet pool (copying and attaching extra buflets as needed), runs
 * flow tracking, stamps flow/policy metadata, and finally enqueues the
 * batch onto the flow's Rx ring.  Packets are routed to one of four
 * local queues: dpkts (pre-allocated destination packets), disposed_pkts
 * (source packets whose contents were copied out), dropped_pkts
 * (batch-dropped with a single reason), transferred_pkts (ready for the
 * ring).  All cleanup funnels through the "done" label.
 *
 * rx_bytes is the aggregate payload length of rx_pkts and is used only
 * to estimate how many extra buflets to batch-allocate.
 */
void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
    uint32_t flags)
{
#pragma unused(flags)
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;      /* done src packets */
	struct pktq dropped_pkts;       /* dropped src packets */
	struct pktq transferred_pkts;   /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	/* flow's route is gone/nonviable: drop the whole batch */
	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
		line = __LINE__;
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		/*
		 * NOTE(review): early return is safe here — all four local
		 * queues are still empty, so no cleanup is being skipped.
		 */
		fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
		line = __LINE__;
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, rx_pkts, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, %p", n_pkts,
		    r->ckr_name, SK_KVA(r));
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
	if (cnt > n_pkts) {
		/* more buflets needed than one-per-packet: multi-frag pool */
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, false);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s %p",
			    cnt, err, r->ckr_name, SK_KVA(r));
			reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
			line = __LINE__;
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		err = 0;
		KPKTQ_REMOVE(rx_pkts, pkt);
		/* keep the remaining-bytes estimate in sync for re-allocs */
		if (rx_bytes > pkt->pkt_flow_ulen) {
			rx_bytes -= pkt->pkt_flow_ulen;
		} else {
			rx_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/* transfer to dpkt */
		/* copy only if the packet is not already in the dst pool */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			/* XXX Why would dpkt be NULL at this point? */
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
				continue;
			}
			/* first buflet comes with dpkt; attach the rest */
			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				/* batch exhausted: re-estimate and refill */
				if (__improbable(buf_cnt == 0)) {
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
					n_pkts = KPKTQ_LEN(rx_pkts);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, false);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						dp_drop_pkt_single(fsw, pkt, 0,
						    DROP_REASON_FSW_PP_ALLOC_FAILED,
						    DROPTAP_FLAG_L2_MISSING);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, %p", cnt, err,
						    r->ckr_name, SK_KVA(r));
						/* err != 0 here: loop exit drops pkt below */
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				/*
				 * XXX -fbounds-safety: can't avoid using forge
				 * unless we change the signature of
				 * pp_alloc_buflet_batch().
				 */
				bnew = __unsafe_forge_single(kern_buflet_t,
				    buf_array[buf_array_iter]);
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PKT_COPY_FAILED,
				    DROPTAP_FLAG_L2_MISSING);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			/* src packet consumed; continue with the dst copy */
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		/* stamp flow identity and policy metadata for the channel */
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(rx_pkts);

	if (KPKTQ_LEN(&transferred_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
	}
	KPKTQ_FINI(&transferred_pkts);

done:
	/* Free unused buflets */
	/* buf_cnt > 0 implies dpp was assigned (alloc happens after it) */
	while (buf_cnt > 0) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the signature of pp_alloc_buflet_batch().
		 */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    (kern_buflet_t)(buf_array[buf_array_iter])));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
}
2051
/*
 * Drain and process a flow entry's Rx backlog.  Repeatedly grabs the
 * whole pending queue under fe_rx_pktq_lock, releases the lock, and
 * hands the batch to the flow's fe_rx_process handler; loops until the
 * queue is observed empty, at which point the worker claim is released
 * (fe_rx_worker_tid cleared) and the entry is unlinked from the local
 * list — all atomically with the emptiness check, under the same lock,
 * so a concurrent producer cannot strand packets.
 */
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct flow_entry_list *fes, struct mbufq *host_mq)
{
	struct pktq rx_pkts;
	uint32_t rx_bytes;
	uint32_t rx_proc_flags;

	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	KPKTQ_INIT(&rx_pkts);
	for (;;) {
		lck_mtx_lock(&fe->fe_rx_pktq_lock);
		if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
			/* backlog drained: release worker claim and unlink */
			fe->fe_rx_worker_tid = 0;
			TAILQ_REMOVE(fes, fe, fe_rx_link);
			lck_mtx_unlock(&fe->fe_rx_pktq_lock);
			break;
		}
		/* steal the entire pending queue and its accounting */
		KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
		KPKTQ_DISPOSE(&fe->fe_rx_pktq);
		rx_bytes = fe->fe_rx_pktq_bytes;
		rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
		    KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
		/* flow related processing (default, agg, fpd, etc.) */
		fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, host_mq, rx_proc_flags);
	}
	/* fe_rx_process must consume every packet it is given */
	ASSERT(KPKTQ_EMPTY(&rx_pkts));

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}
}
2090
2091 static void
dp_rx_process_low_power_wake(struct nx_flowswitch * fsw,struct flow_entry * fe)2092 dp_rx_process_low_power_wake(struct nx_flowswitch *fsw, struct flow_entry *fe)
2093 {
2094 if (fe->fe_port_reservation == NULL || (fe->fe_flags & FLOWENTF_EXTRL_PORT) != 0) {
2095 return;
2096 }
2097 if (fe->fe_key.fk_proto == IPPROTO_TCP && (fe->fe_flags & FLOWENTF_CONNECTION_IDLE)) {
2098 os_log(wake_packet_log_handle, "dp_rx_process_low_power_wake LPW TCP connection idle");
2099
2100 if (flow_track_tcp_want_abort(fe)) {
2101 os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY | FLOWENTF_WAIT_CLOSE, relaxed);
2102 fe->fe_want_withdraw = 1;
2103 flow_track_abort_tcp(fe, NULL, NULL);
2104 }
2105 } else {
2106 if_exit_lpw(fsw->fsw_ifp, "dp_rx_process_low_power_wake LPW connection not idle");
2107 }
2108 }
2109
2110 static inline void
dp_rx_process_wake_packet(struct nx_flowswitch * fsw,struct flow_entry * fe,struct __kern_packet * pkt)2111 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
2112 {
2113 /*
2114 * We only care about wake packets of flows that belong the flow switch
2115 * as wake packets for the host stack are handled by the host input
2116 * function
2117 */
2118
2119 #if (DEBUG || DEVELOPMENT)
2120 /* For testing only */
2121 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
2122 if (check_wake_pkt(fsw->fsw_ifp, pkt) == true) {
2123 /*
2124 * This is a one shot command
2125 */
2126 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
2127
2128 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
2129 }
2130 }
2131 #endif /* (DEBUG || DEVELOPMENT) */
2132
2133 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2134 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
2135
2136 /*
2137 * When a packet is received in LPW mode for an idle TCP connection, the connection
2138 * is aborted immediately with a RST so the peer drops the connection at once
2139 */
2140 if (if_is_lpw_enabled(fsw->fsw_ifp)) {
2141 pkt->pkt_pflags |= __PKT_F_LPW;
2142 dp_rx_process_low_power_wake(fsw, fe);
2143 }
2144 }
2145 }
2146
/*
 * Core flowswitch Rx entry: demux each incoming packet, classify it,
 * reassemble IP fragments, look up (and batch by) its flow entry, then
 * process each touched flow's queue.  Packets that can't be claimed by
 * a flow (demux/classify failure, no flow entry) fall through to the
 * host stack via host_pkts/host_mq.  Runs under the flowswitch read
 * lock; dropped_pkts is batch-dropped with a single reason at "done".
 */
static void
_fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct __kern_packet *__single pkt, *__single tpkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	sa_family_t af;
	struct pktq host_pkts, dropped_pkts;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;
	int err;
	uint64_t thread_id;
	struct mbufq host_mq;
	struct ifnet *ifp;

	mbufq_init(&host_mq);
	KPKTQ_INIT(&host_pkts);
	KPKTQ_INIT(&dropped_pkts);

	FSW_RLOCK(fsw);

	/* flowswitch is tearing down: drop everything */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		reason = DROP_REASON_FSW_QUIESCED;
		line = __LINE__;
		goto done;
	}
	if (__improbable(fsw->fsw_demux == NULL)) {
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		reason = DROP_REASON_FSW_DEMUX_FAILED;
		line = __LINE__;
		goto done;
	}

	ifp = fsw->fsw_ifp;
	thread_id = thread_tid(current_thread());
	prev_fe = NULL;
	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
		/* warm caches for the next packet while handling this one */
		if (__probable(tpkt)) {
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
			/* prefetch L3 and L4 flow structs */
			SK_PREFETCHW(tpkt->pkt_flow, 0);
			SK_PREFETCHW(tpkt->pkt_flow, 128);
		}

		KPKTQ_REMOVE(pktq, pkt);

		pkt = rx_prepare_packet(fsw, pkt);

		/* not claimable by the flowswitch: punt to the host stack */
		af = fsw->fsw_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
		if (__improbable(err != 0)) {
			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		/* fragment may be held for reassembly (pkt == NULL) */
		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			pkt = rx_process_ip_frag(fsw, pkt);
			if (pkt == NULL) {
				continue;
			}
		}

		/* prev_fe seeds the lookup cache for same-flow bursts */
		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
			continue;
		}

		dp_rx_process_wake_packet(fsw, fe, pkt);

		/* queue onto the flow; fe is tracked on fes for phase 2 */
		rx_flow_batch_packets(&fes, fe, pkt, thread_id);
		prev_fe = fe;
	}

	/* phase 2: drain every flow that received packets above */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
		rx_flow_process(fsw, fe, &fes, &host_mq);
		flow_entry_release(&fe);
	}

	if (!KPKTQ_EMPTY(&host_pkts)) {
		fsw_host_rx_cb(fsw, &host_pkts);
		fsw_host_rx_enqueue_mbq(fsw, &host_pkts, &host_mq);
	}

done:
	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
	FSW_RUNLOCK(fsw);

	/* hand accumulated mbufs to the host stack outside the fsw lock */
	fsw_host_sendup(ifp, &host_mq);
}
2249
2250 #if (DEVELOPMENT || DEBUG)
/*
 * Hand a packet to RPS (receive packet steering) worker thread `id`
 * by appending it to that thread's queue; the worker is woken
 * separately via fsw_rps_thread_schedule().
 */
static void
fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
    struct __kern_packet *pkt)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	lck_mtx_lock_spin(&frt->frt_lock);
	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
	lck_mtx_unlock(&frt->frt_lock);
}
2261
/*
 * Kick RPS worker `id` to process its queued packets.  Bumps the
 * request generation counter (so a running worker notices new work)
 * and wakes the thread only if it is currently parked.
 */
static void
fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];

	ASSERT(frt->frt_thread != THREAD_NULL);
	lck_mtx_lock_spin(&frt->frt_lock);
	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));

	frt->frt_requests++;
	if (!(frt->frt_flags & FRT_RUNNING)) {
		thread_wakeup((caddr_t)frt);
	}
	lck_mtx_unlock(&frt->frt_lock);
}
2277
/*
 * Continuation body of an RPS worker thread.  Each wakeup drains the
 * thread's packet queue through _fsw_receive() until no new schedule
 * requests arrived during a pass (frt_requests unchanged), then parks
 * again on assert_wait()/thread_block_parameter() with itself as the
 * continuation.  Termination requests (FRT_TERMINATING, or an
 * interrupted wait) route to the `terminate` label, which acknowledges
 * with FRT_TERMINATED, wakes any joiner, and self-terminates.
 */
__attribute__((noreturn))
static void
fsw_rps_thread_cont(void *v, wait_result_t w)
{
	struct fsw_rps_thread *__single frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;

	lck_mtx_lock(&frt->frt_lock);
	if (__improbable(w == THREAD_INTERRUPTIBLE ||
	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
		goto terminate;
	}
	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
		goto done;
	}
	frt->frt_flags |= FRT_RUNNING;

	for (;;) {
		/* snapshot the request generation before draining */
		uint32_t requests = frt->frt_requests;
		struct pktq pkts;

		/* take the whole queue; process it outside the lock */
		KPKTQ_INIT(&pkts);
		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
		lck_mtx_unlock(&frt->frt_lock);

		sk_protect_t protect;
		protect = sk_sync_protect();
		_fsw_receive(fsw, &pkts);
		sk_sync_unprotect(protect);

		lck_mtx_lock(&frt->frt_lock);
		/* stop when terminating or no new requests arrived meanwhile */
		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
		    requests == frt->frt_requests) {
			frt->frt_requests = 0;
			break;
		}
	}

done:
	lck_mtx_unlock(&frt->frt_lock);
	if (!(frt->frt_flags & FRT_TERMINATING)) {
		/*
		 * NOTE(review): FRT_RUNNING is cleared here after dropping
		 * frt_lock — presumably tolerated by the schedule path,
		 * which at worst issues a spurious wakeup.
		 */
		frt->frt_flags &= ~FRT_RUNNING;
		assert_wait(frt, THREAD_UNINT);
		thread_block_parameter(fsw_rps_thread_cont, frt);
		__builtin_unreachable();
	} else {
terminate:
		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
		frt->frt_flags |= FRT_TERMINATED;

		/* joiner in fsw_rps_thread_join() may be waiting on &frt */
		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
			thread_wakeup((caddr_t)&frt);
		}
		lck_mtx_unlock(&frt->frt_lock);

		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
		    frt->frt_idx);

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
2350
/*
 * Entry point of a freshly spawned RPS worker thread: names itself
 * after the interface and worker index, marks itself as a Rx sync
 * thread, then parks with fsw_rps_thread_cont() as the continuation;
 * all subsequent work happens in the continuation.
 */
__attribute__((noreturn))
static void
fsw_rps_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct fsw_rps_thread *__single frt = v;
	struct nx_flowswitch *fsw = frt->frt_fsw;
	const char *__null_terminated tname = NULL;

	char thread_name[MAXTHREADNAMESIZE];
	bzero(thread_name, sizeof(thread_name));
	tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
	    if_name(fsw->fsw_ifp), frt->frt_idx);

	thread_set_thread_name(frt->frt_thread, tname);
	SK_D("%s spawned", tname);

	net_thread_marks_push(NET_THREAD_SYNC_RX);
	assert_wait(frt, THREAD_UNINT);
	(void) thread_block_parameter(fsw_rps_thread_cont, frt);

	/* never returns: the continuation takes over */
	__builtin_unreachable();
}
2374
/*
 * Synchronously terminate RPS worker `i`: sets FRT_TERMINATING, wakes
 * the worker if parked, and waits (with a short first deadline, then a
 * long one) until the worker acknowledges with FRT_TERMINATED.
 */
static void
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	uint64_t f = (1 * NSEC_PER_MSEC);       /* first wait: 1 ms */
	uint64_t s = (1000 * NSEC_PER_SEC);     /* subsequent waits */
	uint32_t c = 0;

	lck_mtx_lock(&frt->frt_lock);
	frt->frt_flags |= FRT_TERMINATING;

	while (!(frt->frt_flags & FRT_TERMINATED)) {
		uint64_t t = 0;
		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		/* ask the worker to wake us on &frt when it terminates */
		frt->frt_flags |= FRT_TERMINATEBLOCK;
		if (!(frt->frt_flags & FRT_RUNNING)) {
			thread_wakeup_one((caddr_t)frt);
		}
		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
		lck_mtx_unlock(&frt->frt_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&frt->frt_lock);
		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
	}
	ASSERT(frt->frt_flags & FRT_TERMINATED);
	lck_mtx_unlock(&frt->frt_lock);
	frt->frt_thread = THREAD_NULL;
}
2406
2407 static void
fsw_rps_thread_spawn(struct nx_flowswitch * fsw,uint32_t i)2408 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2409 {
2410 kern_return_t error;
2411 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2412
2413 lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2414 frt->frt_idx = i;
2415 frt->frt_fsw = fsw;
2416 error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2417 ASSERT(!error);
2418 KPKTQ_INIT(&frt->frt_pktq);
2419 }
2420
/*
 * Resize the RPS worker pool to `n` threads (sysctl-driven).  Shrinking
 * joins the excess workers before shrinking the array; growing enlarges
 * the array first, then spawns the new workers.  Runs under the
 * flowswitch write lock so no Rx path can use the array mid-resize.
 *
 * Returns 0 on success, EINVAL if n exceeds FSW_RPS_MAX_NTHREADS.
 */
int
fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
{
	if (n > FSW_RPS_MAX_NTHREADS) {
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
		return EINVAL;
	}

	FSW_WLOCK(fsw);
	if (n < fsw->fsw_rps_nthreads) {
		/* stop the workers being removed before freeing their slots */
		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
	} else if (n > fsw->fsw_rps_nthreads) {
		uint32_t nthreads_old = fsw->fsw_rps_nthreads;

		/* grow the array first so new workers have valid slots */
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
		for (uint32_t i = nthreads_old; i < n; i++) {
			fsw_rps_thread_spawn(fsw, i);
		}
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2450
2451 static uint32_t
get_rps_id(struct nx_flowswitch * fsw,struct __kern_packet * pkt)2452 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2453 {
2454 sa_family_t af = fsw->fsw_demux(fsw, pkt);
2455 if (__improbable(af == AF_UNSPEC)) {
2456 return 0;
2457 }
2458
2459 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2460
2461 if (__improbable((pkt->pkt_qum_qflags &
2462 QUM_F_FLOW_CLASSIFIED) == 0)) {
2463 return 0;
2464 }
2465
2466 struct flow_key key;
2467 flow_pkt2key(pkt, true, &key);
2468 key.fk_mask = FKMASK_5TUPLE;
2469
2470 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2471
2472 return id;
2473 }
2474
2475 #endif /* !DEVELOPMENT && !DEBUG */
2476
/*
 * Public flowswitch Rx entry point.  On DEVELOPMENT/DEBUG kernels with
 * RPS enabled, distributes packets to per-flow worker threads by
 * 5-tuple hash and schedules each worker that received packets;
 * otherwise processes the whole queue inline via _fsw_receive().
 */
void
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
#if (DEVELOPMENT || DEBUG)
	FSW_RLOCK(fsw);
	if (fsw->fsw_rps_nthreads != 0) {
		struct __kern_packet *pkt, *tpkt;
		bitmap_t map = 0;       /* workers that got packets */

		static_assert(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
			uint32_t id = get_rps_id(fsw, pkt);
			KPKTQ_REMOVE(pktq, pkt);
			fsw_rps_rx(fsw, id, pkt);
			bitmap_set(&map, id);
		}
		/* wake only the workers that actually received packets */
		for (int i = bitmap_first(&map, 64); i >= 0;
		    i = bitmap_next(&map, i)) {
			fsw_rps_thread_schedule(fsw, i);
		}
		FSW_RUNLOCK(fsw);
	} else
#endif /* DEVELOPMENT || DEBUG */
	{
#if (DEVELOPMENT || DEBUG)
		FSW_RUNLOCK(fsw);
#endif /* DEVELOPMENT || DEBUG */
		_fsw_receive(fsw, pktq);
	}
}
2507
/*
 * netem (network emulation) dequeue callback: receives a batch of
 * delayed/reordered packets back from the emulator and injects them
 * into the normal flowswitch Rx path.  Always returns 0.
 */
int
fsw_dev_input_netem_dequeue(void *handle,
    pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
{
/* NOTE(review): pragma looks redundant — handle is consumed just below */
#pragma unused(handle)
	struct nx_flowswitch *__single fsw = handle;
	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
	struct pktq pktq;
	sk_protect_t protect;
	uint32_t i;

	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);

	/* unwrap the scheduler containers into bare kernel packets */
	for (i = 0; i < n_pkts; i++) {
		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
	}

	protect = sk_sync_protect();
	KPKTQ_INIT(&pktq);
	pkts_to_pktq(kpkts, n_pkts, &pktq);

	fsw_receive(fsw, &pktq);
	KPKTQ_FINI(&pktq);
	sk_sync_unprotect(protect);

	return 0;
}
2537
2538 static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch * fsw,struct pktq * q)2539 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2540 {
2541 classq_pkt_t p;
2542 struct netem *__single ne;
2543 struct __kern_packet *pkt, *tpkt;
2544
2545 ASSERT(fsw->fsw_ifp != NULL);
2546 ne = fsw->fsw_ifp->if_input_netem;
2547 ASSERT(ne != NULL);
2548 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2549 bool pdrop;
2550 KPKTQ_REMOVE(q, pkt);
2551 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2552 netem_enqueue(ne, &p, &pdrop);
2553 }
2554 }
2555
2556 void
fsw_devna_rx(struct nexus_adapter * devna,struct __kern_packet * pkt_head,struct nexus_pkt_stats * out_stats)2557 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2558 struct nexus_pkt_stats *out_stats)
2559 {
2560 struct __kern_packet *pkt = pkt_head, *next;
2561 struct nx_flowswitch *fsw;
2562 uint32_t n_bytes = 0, n_pkts = 0;
2563 uint64_t total_pkts = 0, total_bytes = 0;
2564 struct pktq q;
2565
2566 KPKTQ_INIT(&q);
2567 if (__improbable(devna->na_ifp == NULL ||
2568 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2569 SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2570 dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2571 return;
2572 }
2573 while (pkt != NULL) {
2574 if (__improbable(pkt->pkt_trace_id != 0)) {
2575 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2576 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2577 }
2578 next = pkt->pkt_nextpkt;
2579 pkt->pkt_nextpkt = NULL;
2580
2581 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2582 KPKTQ_ENQUEUE(&q, pkt);
2583 n_bytes += pkt->pkt_length;
2584 } else {
2585 DTRACE_SKYWALK1(non__finalized__drop,
2586 struct __kern_packet *, pkt);
2587 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2588 dp_drop_pkt_single(fsw, pkt, 0,
2589 DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2590 DROPTAP_FLAG_L2_MISSING);
2591 pkt = NULL;
2592 }
2593 n_pkts = KPKTQ_LEN(&q);
2594 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2595 if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2596 fsw_dev_input_netem_enqueue(fsw, &q);
2597 } else {
2598 fsw_receive(fsw, &q);
2599 }
2600 total_pkts += n_pkts;
2601 total_bytes += n_bytes;
2602 n_pkts = 0;
2603 n_bytes = 0;
2604 KPKTQ_FINI(&q);
2605 }
2606 pkt = next;
2607 }
2608 ASSERT(KPKTQ_LEN(&q) == 0);
2609 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2610 if (out_stats != NULL) {
2611 out_stats->nps_pkts += total_pkts;
2612 out_stats->nps_bytes += total_bytes;
2613 }
2614 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2615 }
2616
2617 static int
dp_copy_to_dev_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2618 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2619 struct __kern_packet *dpkt)
2620 {
2621 struct mbuf *__single m = NULL;
2622 uint32_t bdlen, bdlim, bdoff;
2623 uint8_t *bdaddr;
2624 unsigned int one = 1;
2625 int err = 0;
2626
2627 err = mbuf_allocpacket(MBUF_DONTWAIT,
2628 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2629 #if (DEVELOPMENT || DEBUG)
2630 if (m != NULL) {
2631 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2632 }
2633 #endif /* DEVELOPMENT || DEBUG */
2634 if (__improbable(m == NULL)) {
2635 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2636 err = ENOBUFS;
2637 goto done;
2638 }
2639
2640 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2641 if (fsw->fsw_frame_headroom > bdlim) {
2642 SK_ERR("not enough space in buffer for headroom");
2643 err = EINVAL;
2644 goto done;
2645 }
2646
2647 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2648 dpkt->pkt_mbuf = m;
2649 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2650
2651 /* packet copy into mbuf */
2652 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2653 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2654 fsw->fsw_frame_headroom, spkt->pkt_length,
2655 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2656 spkt->pkt_csum_tx_start_off);
2657 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2658
2659 /* header copy into dpkt buffer for classification */
2660 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2661 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2662 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2663 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2664 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2665 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2666 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2667 if (copy_len < spkt->pkt_length) {
2668 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2669 }
2670
2671 /*
2672 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
2673 * buflet baddr m_data always points to the beginning of packet and
2674 * should represents the same as baddr + headroom
2675 */
2676 ASSERT((uintptr_t)m->m_data ==
2677 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2678
2679 done:
2680 return err;
2681 }
2682
2683 static int
dp_copy_to_dev_pkt(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2684 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2685 struct __kern_packet *dpkt)
2686 {
2687 struct ifnet *ifp = fsw->fsw_ifp;
2688 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2689
2690 if (headroom > UINT8_MAX) {
2691 SK_ERR("headroom too large %d", headroom);
2692 return ERANGE;
2693 }
2694 dpkt->pkt_headroom = (uint8_t)headroom;
2695 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2696 dpkt->pkt_l2_len = 0;
2697 dpkt->pkt_link_flags = spkt->pkt_link_flags;
2698
2699 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2700 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2701 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2702 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2703 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2704 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2705 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2706 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2707 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2708 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2709
2710 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2711
2712 return 0;
2713 }
2714
2715 #if SK_LOG
2716 /* Hoisted out of line to reduce kernel stack footprint */
/* Hoisted out of line to reduce kernel stack footprint */
/*
 * Debug logging for dp_copy_to_dev(): one message per outcome
 * (ERANGE = packet too long, ENOBUFS = allocation failure, 0 = success
 * with an optional hex dump of the copied buffer, other = generic).
 * Compiled only under SK_LOG; a no-op macro otherwise.
 */
SK_LOG_ATTRIBUTE
static void
dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
    struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
{
	struct proc *p = current_proc();
	struct ifnet *ifp = fsw->fsw_ifp;
	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);

	if (error == ERANGE) {
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
	} else if (error == ENOBUFS) {
		SK_DF(logflags, "%s(%d) packet allocation failure",
		    sk_proc_name(p), sk_proc_pid(p));
	} else if (error == 0) {
		ASSERT(dpkt != NULL);
		char *daddr;
		uint32_t pkt_len;

		MD_BUFLET_ADDR_ABS(dpkt, daddr);
		pkt_len = __packet_get_real_data_length(dpkt);
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
		    sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
		    (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom);
		/* dump at most the first 128 bytes of the copied frame */
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, pkt_len, 128));
	} else {
		SK_DF(logflags, "%s(%d) error %d", sk_proc_name(p),
		    sk_proc_pid(p), error);
	}
}
2753 #else
2754 #define dp_copy_to_dev_log(...)
2755 #endif /* SK_LOG */
2756
/*
 * Copy Tx-relevant metadata from spkt to dpkt: the generic quantum and
 * packet metadata, Tx port data, and the AQM (queue management)
 * attribution fields (flow source type/index/id, policy uuids/ids).
 * Payload bytes are copied separately by the dp_copy_to_dev_* helpers.
 */
static void
fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* spkt must not already carry an attached mbuf or packet */
	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));

	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* Copy AQM metadata */
	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
	/* alignment precondition for the 8-byte-wise UUID copy */
	static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0);
	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
	dpkt->pkt_policy_id = spkt->pkt_policy_id;
	dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
}
2781
/*
 * Copy spkt into the device-side packet dpkt: metadata first, then the
 * payload via the mbuf or native-packet helper depending on what the
 * driver's classq enqueues.  For native packets the total length
 * (headrooms + payload) is validated against the destination pool's
 * maximum first.  Returns 0, ERANGE, or an error from the copy helper;
 * the outcome is logged on SK_LOG kernels.
 */
static int
dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
	struct ifnet *ifp = fsw->fsw_ifp;
	uint32_t dev_pkt_len;
	int err = 0;

	fsw_pkt_copy_metadata(spkt, dpkt);
	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF:
		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
		break;

	case QP_PACKET:
		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
		    spkt->pkt_length;
		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
			err = ERANGE;
			goto done;
		}
		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
		break;

	default:
		VERIFY(0);
		__builtin_unreachable();
	}
done:
	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
	return err;
}
2816
/*
 * Copy only the leading bytes (up to 128) of spkt into dpkt — enough
 * for header classification — while setting dpkt's pkt_length to the
 * full original length and marking it PKT_F_TRUNCATED so consumers
 * know the payload is incomplete.  Always returns 0.
 */
static int
dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	uint8_t *sbaddr, *dbaddr;
	uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
	/* 128 bytes is assumed to cover L2+L3+L4 headers */
	uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);

	fsw_pkt_copy_metadata(spkt, dpkt);

	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
	ASSERT(sbaddr != NULL);
	sbaddr += spkt->pkt_headroom;

	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
	ASSERT(dbaddr != NULL);
	dpkt->pkt_headroom = (uint8_t)headroom;
	dbaddr += headroom;

	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
	METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);

	/* packet length is set to the full length */
	dpkt->pkt_length = spkt->pkt_length;
	dpkt->pkt_pflags |= PKT_F_TRUNCATED;
	return 0;
}
2844
/*
 * Convert a packet carrying an attached mbuf (PKT_F_MBUF_DATA) into that
 * mbuf: flow metadata generated during flow parse/lookup is transferred
 * into the mbuf packet header, the mbuf is detached, and the now-empty
 * packet is freed.  Returns the detached mbuf; the input packet must not
 * be used after this call.
 */
static struct mbuf *
convert_pkt_to_mbuf(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	ASSERT(pkt->pkt_mbuf != NULL);
	struct mbuf *m = pkt->pkt_mbuf;

	/* pass additional metadata generated from flow parse/lookup */
	static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token));
	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token));
	static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx));
	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;

	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
	}

	/* The packet should have a timestamp by the time we get here. */
	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;

	/* carry over only the common flag bits from the kernel packet */
	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
	/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
	m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;

	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
	}
	KPKT_CLEAR_MBUF_DATA(pkt);

	/* mbuf has been consumed, release packet as well */
	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
	pp_free_packet_single(pkt);
	return m;
}
2887
2888 static void
convert_pkt_to_mbuf_list(struct __kern_packet * pkt_list,struct mbuf ** head,struct mbuf ** tail,uint32_t * cnt,uint32_t * bytes)2889 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2890 struct mbuf **head, struct mbuf **tail,
2891 uint32_t *cnt, uint32_t *bytes)
2892 {
2893 struct __kern_packet *pkt = pkt_list, *next;
2894 struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2895 struct mbuf *__single m = NULL;
2896 uint32_t c = 0, b = 0;
2897
2898 while (pkt != NULL) {
2899 next = pkt->pkt_nextpkt;
2900 pkt->pkt_nextpkt = NULL;
2901 m = convert_pkt_to_mbuf(pkt);
2902 ASSERT(m != NULL);
2903
2904 *m_tailp = m;
2905 m_tailp = &m->m_nextpkt;
2906 c++;
2907 b += m_pktlen(m);
2908 pkt = next;
2909 }
2910 if (head != NULL) {
2911 *head = m_head;
2912 }
2913 if (tail != NULL) {
2914 *tail = m;
2915 }
2916 if (cnt != NULL) {
2917 *cnt = c;
2918 }
2919 if (bytes != NULL) {
2920 *bytes = b;
2921 }
2922 }
2923
/*
 * Enqueue one packet into the interface AQM (classq).  On compat
 * (QP_MBUF) interfaces the packet is first converted to an mbuf; on
 * native (QP_PACKET) interfaces it is enqueued directly.  The packet
 * (or converted mbuf) is consumed by the enqueue in all cases; the
 * caller must not touch it afterwards.  Returns the enqueue errno.
 */
SK_NO_INLINE_ATTRIBUTE
static int
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt->pkt_flow_token != 0);
	/* account the outbound packet against its traffic class */
	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	if (__improbable(pkt->pkt_trace_id != 0)) {
		/* trace handoff from the flowswitch stage to the AQM stage */
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: { /* compat interface */
		struct mbuf *m;

		m = convert_pkt_to_mbuf(pkt);
		ASSERT(m != NULL);
		pkt = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
		m = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	case QP_PACKET: { /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &pkt_drop);
		pkt = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2990
/*
 * Enqueue a whole chain of packets from one flow into the interface AQM
 * in a single call.  cnt/bytes are caller-supplied totals for the chain.
 * The chain (or its mbuf conversion) is consumed by the enqueue in all
 * cases.  Returns the enqueue errno.
 */
static int
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	uint32_t svc;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt_head->pkt_flow_token != 0);

	/*
	 * All packets in the flow should have the same svc.
	 */
	svc = pkt_head->pkt_svc_class;
	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: { /* compat interface */
		struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
		uint32_t c = 0, b = 0;

		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
		ASSERT(m_head != NULL && m_tail != NULL);
		/* the conversion must account for exactly what we were given */
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		pkt_head = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		m_head = NULL;
		m_tail = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			/* the whole chain was dropped; account for all of it */
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	case QP_PACKET: { /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_head, pkt_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		pkt_head = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
3065
3066 /*
3067 * This code path needs to be kept for interfaces without logical link support.
3068 */
3069 static void
classq_enqueue_flow(struct nx_flowswitch * fsw,struct flow_entry * fe,bool chain,uint32_t cnt,uint32_t bytes)3070 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3071 bool chain, uint32_t cnt, uint32_t bytes)
3072 {
3073 struct __kern_packet *pkt, *tail, *tpkt;
3074 flowadv_idx_t flow_adv_idx;
3075 bool flowadv_cap;
3076 flowadv_token_t flow_adv_token;
3077 int err;
3078
3079 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3080 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3081
3082 if (chain) {
3083 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3084 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3085 KPKTQ_INIT(&fe->fe_tx_pktq);
3086 if (pkt == NULL) {
3087 return;
3088 }
3089 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3090 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3091 flow_adv_token = pkt->pkt_flow_token;
3092
3093 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3094 DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, int, err);
3095 } else {
3096 uint32_t c = 0, b = 0;
3097
3098 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3099 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3100
3101 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3102 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3103 flow_adv_token = pkt->pkt_flow_token;
3104
3105 c++;
3106 b += pkt->pkt_length;
3107 err = classq_enqueue_flow_single(fsw, pkt);
3108 }
3109 ASSERT(c == cnt);
3110 ASSERT(b == bytes);
3111 DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3112 int, err);
3113 }
3114 }
3115
3116 /*
3117 * Logical link code path
3118 */
/*
 * Enqueue a flow's Tx packet queue via its logical-link queue set
 * (fe->fe_qset) instead of the interface-wide classq.  The queue is
 * detached and consumed by netif_qset_enqueue() in all cases; drops
 * reported back are folded into the flowswitch statistics.
 */
static void
classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
    bool chain, uint32_t cnt, uint32_t bytes)
{
	struct __kern_packet *pkt, *tail;
	/* NOTE(review): flow advisory state is captured but unused below */
	flowadv_idx_t flow_adv_idx;
	bool flowadv_cap;
	flowadv_token_t flow_adv_token;
	/* NOTE(review): flowctl is filled in by the enqueue but not acted on */
	uint32_t flowctl = 0, dropped = 0;
	int err;

	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));

	/* detach the whole queue before handing it to the qset */
	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
	KPKTQ_INIT(&fe->fe_tx_pktq);
	if (pkt == NULL) {
		return;
	}
	flow_adv_idx = pkt->pkt_flowsrc_fidx;
	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
	flow_adv_token = pkt->pkt_flow_token;

	err = netif_qset_enqueue(fe->fe_qset, chain, pkt, tail, cnt, bytes,
	    &flowctl, &dropped);

	if (__improbable(err != 0) && dropped > 0) {
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
		STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, dropped);
	}
}
3151
3152 static void
tx_finalize_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)3153 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3154 {
3155 #pragma unused(fsw)
3156 /* finalize here; no more changes to buflets after classq */
3157 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3158 kern_packet_t ph = SK_PTR_ENCODE(pkt,
3159 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3160 int err = __packet_finalize(ph);
3161 VERIFY(err == 0);
3162 }
3163 }
3164
/*
 * Prepare a flow's Tx packets for transmission over its route: run the
 * generic route viability check, optionally pick a dynamic queue set,
 * resolve the route's L2 info for each packet if not yet resolved, and
 * finally frame each surviving packet.  Returns false when the route is
 * not viable and the caller should drop the queued packets; packets that
 * individually fail resolution are dropped here.
 */
static bool
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	int err;

	ASSERT(fr != NULL);

	if (__improbable(!dp_flow_route_process(fsw, fe))) {
		return false;
	}
	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
		flow_qset_select_dynamic(fsw, fe, TRUE);
	}

	/* development/debug fault injection hooks */
	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
	    _fsw_error35_handler, 1, fr, NULL, NULL);
	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
	    _fsw_error36_handler, 1, fr, NULL);

	/*
	 * See if we need to resolve the flow route; note the test against
	 * fr_flags here is done without any lock for performance. Thus
	 * it's possible that we race against the thread performing route
	 * event updates for a packet (which is OK). In any case we should
	 * not have any assertion on fr_flags value(s) due to the lack of
	 * serialization.
	 */
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		goto frame;
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		err = fsw->fsw_resolve(fsw, fr, pkt);
		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
		/*
		 * If resolver returns EJUSTRETURN then we drop the pkt as the
		 * resolver should have converted the pkt into mbuf (or
		 * detached the attached mbuf from pkt) and added it to the
		 * llinfo queue. If we do have a cached llinfo, then proceed
		 * to using it even though it may be stale (very unlikely)
		 * while the resolution is in progress.
		 * Otherwise, any other error results in dropping pkt.
		 */
		if (err == EJUSTRETURN) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
			continue;
		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
			/* use existing llinfo */
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
		} else if (err != 0) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
			continue;
		}
	}

frame:
	/* prepend the L2 frame header, if this interface has a framer */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		if (fsw->fsw_frame != NULL) {
			fsw->fsw_frame(fsw, fr, pkt);
		}
	}

	return true;
}
3237
3238 static void
dp_listener_flow_tx_process(struct nx_flowswitch * fsw,struct flow_entry * fe)3239 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3240 {
3241 #pragma unused(fsw)
3242 struct __kern_packet *pkt, *tpkt;
3243 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3244 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3245 /* listener is only allowed TCP RST */
3246 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3247 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3248 flow_track_abort_tcp(fe, NULL, pkt);
3249 } else {
3250 char *addr;
3251
3252 MD_BUFLET_ADDR_ABS(pkt, addr);
3253 SK_ERR("listener flow sends non-RST packet %s",
3254 sk_dump(sk_proc_name(current_proc()),
3255 addr, __packet_get_real_data_length(pkt), 128));
3256 }
3257 pp_free_packet_single(pkt);
3258 }
3259 }
3260
3261 static void
fsw_update_timestamps(struct __kern_packet * pkt,volatile uint64_t * fg_ts,volatile uint64_t * rt_ts,ifnet_t ifp,uint64_t now)3262 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3263 volatile uint64_t *rt_ts, ifnet_t ifp, uint64_t now)
3264 {
3265 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3266 pkt->pkt_timestamp = now;
3267 }
3268 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3269
3270 /*
3271 * If the packet service class is not background,
3272 * update the timestamps on the interface, as well as
3273 * the ones in nexus-wide advisory to indicate recent
3274 * activity on a foreground flow.
3275 */
3276 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3277 ifp->if_fg_sendts = (uint32_t)net_uptime();
3278 if (fg_ts != NULL) {
3279 *fg_ts = net_uptime();
3280 }
3281 }
3282 if (pkt->pkt_pflags & PKT_F_REALTIME) {
3283 ifp->if_rt_sendts = (uint32_t)net_uptime();
3284 if (rt_ts != NULL) {
3285 *rt_ts = net_uptime();
3286 }
3287 }
3288 }
3289
3290 static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch * fsw)3291 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw)
3292 {
3293 return fsw_chain_enqueue != 0 &&
3294 fsw->fsw_ifp->if_output_netem == NULL &&
3295 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0;
3296 }
3297
/*
 * Per-flow Tx processing: track each packet against its flow state,
 * stamp flow/AQM metadata, finalize, optionally snoop, then enqueue the
 * queue into either the flow's logical-link qset or the interface
 * classq.  Listener flows are diverted to the listener handler and a
 * non-viable route drops the whole queue.
 */
void
dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	struct pktq dropped_pkts;
	bool chain, same_svc = true;
	bool gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
	uint32_t cnt = 0, bytes = 0;
	volatile struct sk_nexusadv *nxadv = NULL;
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;
	/* service class of the first packet; 0 means "not yet seen" */
	uint32_t svc = 0;
	struct timespec now;
	uint64_t now_nsec = 0;

	KPKTQ_INIT(&dropped_pkts);
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
		/* listeners never transmit normal traffic */
		dp_listener_flow_tx_process(fsw, fe);
		return;
	}
	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
		SK_RDERR(5, "Tx route bad");
		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&fe->fe_tx_pktq));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
		line = __LINE__;
		goto done;
	}
	chain = fsw_chain_enqueue_enabled(fsw) && KPKTQ_LEN(&fe->fe_tx_pktq) > 1;
	if (chain) {
		/* timestamps/advisory pointers needed only for chain mode */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
		if (nxadv != NULL) {
			fg_ts = &nxadv->nxadv_fg_sendts;
			rt_ts = &nxadv->nxadv_rt_sendts;
		}
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		int err = 0;
		if (svc == 0) {
			svc = pkt->pkt_svc_class;
		}

		/* run flow state tracking (e.g. TCP state machine) */
		err = flow_pkt_track(fe, pkt, false);
		if (__improbable(err != 0)) {
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;

		/* set AQM related values for outgoing packet */
		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
		} else {
			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
		}
		_UUID_CLEAR(pkt->pkt_flow_id);
		pkt->pkt_flow_token = fe->fe_flowid;
		pkt->pkt_pflags |= PKT_F_FLOW_ID;
		pkt->pkt_qset_idx = qset_idx;
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;

		/*
		 * The same code is exercised per packet for the non-chain case
		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
		 * re-walking the chain later.
		 */
		if (chain && (gso || same_svc)) {
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp, now_nsec);
		}
		/* mark packet tos/svc_class */
		fsw_qos_mark(fsw, fe, pkt);

		tx_finalize_packet(fsw, pkt);
		bytes += pkt->pkt_length;
		cnt++;

		/* chain enqueue requires a uniform service class (unless GSO) */
		same_svc = (same_svc && (svc == pkt->pkt_svc_class));
		/*
		 * we are using the first 4 bytes of flow_id as the AQM flow
		 * identifier.
		 */
		ASSERT(!uuid_is_null(pkt->pkt_flow_id));

		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
		}
	}

	/* snoop after it's finalized */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
	}

	chain = chain && (gso || same_svc);
	if (fe->fe_qset != NULL) {
		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
	} else {
		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
	}
done:
	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
}
3418
3419 static struct flow_entry *
tx_process_continuous_ip_frag(struct nx_flowswitch * fsw,struct flow_entry * prev_fe,struct __kern_packet * pkt)3420 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3421 struct flow_entry *prev_fe, struct __kern_packet *pkt)
3422 {
3423 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3424
3425 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3426 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3427 SK_PERR(current_proc(), "invalid zero fragment id");
3428 return NULL;
3429 }
3430
3431 SK_PDF(SK_VERB_FSW_DP | SK_VERB_TX, current_proc(),
3432 "continuation frag, id %u", pkt->pkt_flow_ip_frag_id);
3433 if (__improbable(prev_fe == NULL ||
3434 !prev_fe->fe_tx_is_cont_frag)) {
3435 SK_PERR(current_proc(), "unexpected continuation frag %u",
3436 pkt->pkt_flow_ip_frag_id);
3437 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3438 return NULL;
3439 }
3440 if (__improbable(pkt->pkt_flow_ip_frag_id !=
3441 prev_fe->fe_tx_frag_id)) {
3442 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3443 SK_PERR(current_proc(), "wrong continuation frag id %u expecting %u",
3444 pkt->pkt_flow_ip_frag_id, prev_fe->fe_tx_frag_id);
3445 return NULL;
3446 }
3447
3448 return prev_fe;
3449 }
3450
/*
 * Look up the flow entry for an outbound packet, reusing prev_fe as a
 * hint.  Returns a retained flow entry on success, or NULL when no flow
 * matches, the flow is torn down / Tx-disabled, or the packet's flow id
 * does not match the entry's UUID.
 */
static struct flow_entry *
tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	struct flow_entry *__single fe;

	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
	if (__improbable(fe == NULL)) {
		goto done;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		SK_RDERR(5, "Tx flow torn down %s",
		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
		flow_entry_release(&fe);
		goto done;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
		/* NOTE(review): the next two log lines look redundant — confirm */
		SK_RDERR(5, "Tx not allowed for this flow");
		SK_RDERR(5, "Tx not allowed for this flow %s",
		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		FSW_STATS_INC(FSW_STATS_TX_DISABLED);
		flow_entry_release(&fe);
		goto done;
	}

	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
	    null_func);

	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
		uuid_string_t flow_id_str, pkt_id_str;
		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
		SK_ERR("pkt flow id %s != flow id %s, %s", pkt_id_str,
		    flow_id_str, fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		/* presumably flow_entry_release() NULLs fe so NULL is returned — verify */
		flow_entry_release(&fe);
		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
	}

done:
	return fe;
}
3496
/*
 * Hand a flow's (non-empty) Tx packet queue to its flow-specific
 * processor and tear down the emptied queue afterwards.
 */
static inline void
tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, etc.) */
	fe->fe_tx_process(fsw, fe, flags);

	KPKTQ_FINI(&fe->fe_tx_pktq);
}
3512
3513 #if SK_LOG
/*
 * SK_LOG-only helper: dump the leading bytes of a packet's buffer along
 * with the current process name/pid and a caller-supplied description.
 */
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	pkt_len = __packet_get_real_data_length(pkt);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
	    128));
}
3526 #else /* !SK_LOG */
3527 #define dp_tx_log_pkt(...)
3528 #endif /* !SK_LOG */
3529
3530 static inline struct ifnet *
fsw_datamov_begin(struct nx_flowswitch * fsw)3531 fsw_datamov_begin(struct nx_flowswitch *fsw)
3532 {
3533 struct ifnet *ifp;
3534
3535 ifp = fsw->fsw_ifp;
3536 if (!ifnet_datamov_begin(ifp)) {
3537 DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3538 return NULL;
3539 }
3540 return ifp;
3541 }
3542
/*
 * Close the data-movement bracket opened by fsw_datamov_begin().
 */
static inline void
fsw_datamov_end(struct nx_flowswitch *fsw)
{
	ifnet_datamov_end(fsw->fsw_ifp);
}
3548
/*
 * Tx datapath entry for a queue of source packets from a user channel:
 * copy each packet into the device pool, demux its address family,
 * classify it to a flow entry (with special handling for continuation
 * IP fragments), batch packets per flow, run per-flow Tx processing,
 * and finally kick the driver via netif_transmit().  Packets that fail
 * any stage are dropped with per-stage statistics; source packets are
 * owned and freed by the caller.
 */
static void
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	struct pktq dropped_pkts, dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	boolean_t do_pacing = FALSE;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	int err;
	KPKTQ_INIT(&dpktq);
	KPKTQ_INIT(&dropped_pkts);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_QUIESCED;
		line = __LINE__;
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}

	/* batch allocate enough packets */
	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
	    NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	if (__improbable(err == ENOMEM)) {
		/* nothing was allocated; drop the entire source queue */
		ASSERT(KPKTQ_EMPTY(&dpktq));
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	} else if (__improbable(err == EAGAIN)) {
		/* partial allocation; the shortfall is accounted as dropped */
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
		FSW_STATS_ADD(FSW_STATS_DROP,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
	}

	n_pkts = KPKTQ_LEN(&dpktq);
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		if (n_pkts == 0) {
			break;
		}
		--n_pkts;

		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		err = dp_copy_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			/*
			 * Copy to dev pool failed, so droptap should capture
			 * the source pkt because dev pkt might not have metadata
			 * or buffer filled out yet. Source pkt is freed by
			 * fsw_user_ring_flush, so defer the free to that.
			 */
			dp_drop_pkt_single_nofree(fsw, spkt, 1,
			    DROP_REASON_FSW_PKT_COPY_FAILED, DROPTAP_FLAG_L2_MISSING);
			/* Free the dev pool packet */
			pp_free_packet_single(pkt);
			continue;
		}

		/* a single paced packet makes the whole transmit paced */
		do_pacing |= __packet_get_tx_timestamp(SK_PKT2PH(pkt)) != 0;
		af = fsw_ip_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/* continuation fragments reuse the previous packet's flow */
		if (__improbable(pkt->pkt_flow_ip_is_frag &&
		    !pkt->pkt_flow_ip_is_first_frag)) {
			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
			if (__probable(fe != NULL)) {
				flow_entry_retain(fe);
				goto flow_batch;
			} else {
				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
				dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
				    DROPTAP_FLAG_L2_MISSING);
				continue;
			}
		}

		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
			    DROPTAP_FLAG_L2_MISSING);
			prev_fe = NULL;
			continue;
		}
flow_batch:
		/* queue the packet on its flow; fe reference moves along */
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* process each batched flow and release its reference */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
		n_flows++;
	}

done:
	FSW_RUNLOCK(fsw);
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}
	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
	KPKTQ_FINI(&dropped_pkts);
	KPKTQ_FINI(&dpktq);
}
3715
3716 static sa_family_t
get_tso_af(struct __kern_packet * pkt)3717 get_tso_af(struct __kern_packet *pkt)
3718 {
3719 packet_tso_flags_t tso_flags;
3720
3721 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3722 if (tso_flags == PACKET_TSO_IPV4) {
3723 return AF_INET;
3724 } else if (tso_flags == PACKET_TSO_IPV6) {
3725 return AF_INET6;
3726 } else {
3727 panic("invalid tso flags: 0x%x\n", tso_flags);
3728 /* NOTREACHED */
3729 __builtin_unreachable();
3730 }
3731 }
3732
3733 static inline void
update_flow_info(struct __kern_packet * pkt,void * iphdr,void * tcphdr,uint16_t payload_sz)3734 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3735 {
3736 struct tcphdr *__single tcp = tcphdr;
3737
3738 DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3739 void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3740 pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3741 pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3742 pkt->pkt_flow_tcp_flags = tcp->th_flags;
3743 pkt->pkt_flow_tcp_seq = tcp->th_seq;
3744 pkt->pkt_flow_ulen = payload_sz;
3745 }
3746
/*
 * Software GSO: segment the oversized TCP packet `orig_pkt` into MSS-sized
 * packets taken from `dev_pktq`.  `first_pkt` is the pre-allocated first
 * segment which already carries the parsed flow metadata (headers were
 * copied to it by the caller).  The finished segments are enqueued on
 * `gso_pktq`; they are NOT yet finalized (framing happens later in
 * dp_flow_tx_process()).  Returns 0 on success or EINVAL on bad arguments.
 */
static int
do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
    struct __kern_packet *first_pkt, struct pktq *dev_pktq,
    struct pktq *gso_pktq)
{
	ifnet_t ifp = fsw->fsw_ifp;
	struct __kern_packet *pkt = first_pkt;
	uint8_t proto = pkt->pkt_flow_ip_proto;
	uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
	uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
	uint16_t total_hlen = ip_hlen + tcp_hlen;
	uint16_t mtu = (uint16_t)ifp->if_mtu;
	uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
	uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
	/* headroom reserved in each segment for L2 framing by the driver */
	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
	kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
	uint8_t *orig_pkt_baddr;
	struct tcphdr *tcp;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t tcp_seq;
	uint16_t ipid;
	uint32_t pseudo_hdr_csum, bufsz;
	uint64_t pkt_tx_timestamp = 0;

	ASSERT(headroom <= UINT8_MAX);
	/* GSO is only defined for TCP flows */
	if (proto != IPPROTO_TCP) {
		SK_ERR("invalid proto: %d", proto);
		DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint8_t, proto);
		return EINVAL;
	}
	/* each segment (headers + one MSS) must fit within the MTU */
	if (mss == 0 || mss > (mtu - total_hlen)) {
		SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
		    mss, mtu, total_hlen);
		DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
		    uint32_t, total_hlen);
		return EINVAL;
	}
	/* each segment must also fit in a single device-pool buflet */
	bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
	if ((headroom + total_hlen + mss) > bufsz) {
		SK_ERR("invalid args: headroom %d, total_hlen %d, "
		    "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
		DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
		    total_hlen, uint16_t, mss, uint32_t, bufsz);
		return EINVAL;
	}
	/* number of segments = ceil(payload / mss) */
	n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);

	ASSERT(pkt->pkt_headroom == headroom);
	ASSERT(pkt->pkt_length == total_len);
	ASSERT(pkt->pkt_l2_len == 0);
	ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
	ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
	/* first_pkt holds only headers so far; clear segmentation markers */
	pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
	pkt->pkt_proto_seg_sz = 0;
	pkt->pkt_csum_flags = 0;
	MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
	orig_pkt_baddr += orig_pkt->pkt_headroom;

	if (af == AF_INET) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the flow metadata definition.
		 */
		ip = __unsafe_forge_bidi_indexable(struct ip *,
		    pkt->pkt_flow_ip_hdr, pkt->pkt_length);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		ipid = ip->ip_id;
		/* pseudo-header sum reused for every segment's TCP checksum */
		pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, 0);
	} else {
		ASSERT(af == AF_INET6);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, 0);
	}
	tcp_seq = ntohl(tcp->th_seq);

	pkt_tx_timestamp = __packet_get_tx_timestamp(orig_ph);

	/*
	 * Walk the original payload in MSS-sized strides; `n` counts the
	 * segment being produced (1-based), `off` is the payload offset
	 * into orig_pkt.  The last segment may carry less than one MSS.
	 */
	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
	    off += payload_sz) {
		uint8_t *baddr, *baddr0;
		uint32_t partial;

		/* first iteration reuses first_pkt; later ones dequeue */
		if (pkt == NULL) {
			n++;
			KPKTQ_DEQUEUE(dev_pktq, pkt);
			ASSERT(pkt != NULL);
		}
		MD_BUFLET_ADDR_ABS(pkt, baddr0);
		baddr = baddr0;
		baddr += headroom;

		/* Copy headers from the original packet */
		if (n != 1) {
			ASSERT(pkt != first_pkt);
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
			pkt->pkt_trace_id = 0;
			ASSERT(pkt->pkt_headroom == headroom);
		} else {
			METADATA_SET_LEN(pkt, 0, 0);
		}
		baddr += total_hlen;

		/* copy tx timestamp from the orignal packet */
		__packet_set_tx_timestamp(SK_PKT2PH(pkt), pkt_tx_timestamp);

		/* Copy/checksum the payload from the original packet */
		if (off + payload_sz > total_len) {
			payload_sz = (uint16_t)(total_len - off);
		}
		/* copy payload and accumulate its 16-bit ones-complement sum */
		pkt_copypkt_sum(orig_ph,
		    (uint16_t)(orig_pkt->pkt_headroom + off),
		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
		    &partial, TRUE);

		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
		    uint16_t, mss, uint32_t, partial);
		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

		/*
		 * Adjust header information and fill in the missing fields.
		 */
		if (af == AF_INET) {
			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);

			/* FIN/PSH belong only on the final segment */
			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			/* CWR only on the first segment; later ones advance seq */
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip, tcp, payload_sz);

			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;

			/* fold TCP header + payload sum + pseudo-header */
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		} else {
			ASSERT(af == AF_INET6);
			ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);

			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		}
		tcp_seq += payload_sz;
		/* account for the headers prepended in front of the payload */
		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
#if (DEVELOPMENT || DEBUG)
		struct __kern_buflet *bft;
		uint32_t blen;
		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
		blen = __buflet_get_data_length(bft);
		if (blen != total_hlen + payload_sz) {
			panic("blen (%d) != total_len + payload_sz (%d)\n",
			    blen, total_hlen + payload_sz);
		}
#endif /* DEVELOPMENT || DEBUG */

		pkt->pkt_length = total_hlen + payload_sz;
		KPKTQ_ENQUEUE(gso_pktq, pkt);
		pkt = NULL;

		/*
		 * Note that at this point the packet is not yet finalized.
		 * The finalization happens in dp_flow_tx_process() after
		 * the framing is done.
		 */
	}
	ASSERT(n == n_pkts);
	ASSERT(off == total_len);
	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
	return 0;
}
3959
3960 static void
tx_flow_enqueue_gso_pktq(struct flow_entry_list * fes,struct flow_entry * fe,struct pktq * gso_pktq)3961 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3962 struct pktq *gso_pktq)
3963 {
3964 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3965 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3966 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3967 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3968 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3969 KPKTQ_INIT(gso_pktq);
3970 } else {
3971 ASSERT(!TAILQ_EMPTY(fes));
3972 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3973 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3974 KPKTQ_INIT(gso_pktq);
3975 flow_entry_release(&fe);
3976 }
3977 }
3978
/*
 * TX datapath for a batch of packets that require software GSO.  For each
 * source packet in `spktq`: allocate device-pool packets, copy and classify
 * its headers, look up its flow, segment it via do_gso(), and enqueue the
 * resulting chains per flow.  Finally process each flow and kick the
 * interface transmit.  `gso_pkts_estimate` is an upper bound on the total
 * number of device packets needed for the whole batch.
 */
static void
dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
    uint32_t gso_pkts_estimate)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	struct pktq dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	int err;

	KPKTQ_INIT(&dpktq);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* drop everything if the flowswitch is being torn down */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
		    DROPTAP_FLAG_L2_MISSING);
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		goto done;
	}
	/* holds off ifnet detach until fsw_datamov_end() */
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		goto done;
	}

	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	/*
	 * Batch allocate enough packets to perform GSO on all
	 * packets in spktq.
	 */
	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * We either get all packets or none. No partial allocations.
	 */
	if (__improbable(err != 0)) {
		if (err == ENOMEM) {
			ASSERT(KPKTQ_EMPTY(&dpktq));
		} else {
			dp_free_pktq(fsw, &dpktq);
		}
		DTRACE_SKYWALK1(gso__no__mem, int, err);
		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
		    __LINE__, DROPTAP_FLAG_L2_MISSING);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool",
		    gso_pkts_estimate);
		goto done;
	}
	/* prev_fe caches the last flow for cheap repeated lookups */
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		/*
		 * Copy only headers to the first packet of the GSO chain.
		 * The headers will be used for classification below.
		 */
		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK2(copy__headers__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt);
			continue;
		}
		af = get_tso_af(pkt);
		ASSERT(af == AF_INET || af == AF_INET6);

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK4(classify__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt,
			    int, err);
			continue;
		}
		/*
		 * GSO cannot be done on a fragment and it's a bug in user
		 * space to mark a fragment as needing GSO.
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK3(is__frag,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			continue;
		}
		/* returns a retained flow entry on success */
		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
			    DROPTAP_FLAG_L2_MISSING);
			DTRACE_SKYWALK3(lookup__failed,
			    struct nx_flowswitch *, fsw,
			    struct __kern_packet *, spkt,
			    struct __kern_packet *, pkt);
			prev_fe = NULL;
			continue;
		}
		/*
		 * Perform GSO on spkt using the flow information
		 * obtained above.
		 */
		struct pktq gso_pktq;
		KPKTQ_INIT(&gso_pktq);
		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
		if (__probable(err == 0)) {
			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
			prev_fe = fe;
		} else {
			DTRACE_SKYWALK1(gso__error, int, err);
			/* TODO: increment error stat */
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			flow_entry_release(&fe);
			prev_fe = NULL;
		}
		KPKTQ_FINI(&gso_pktq);
	}
	/* process each flow's accumulated GSO chains */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		/* Chain-enqueue can be used for GSO chains */
		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		flow_entry_release(&fe);
		n_flows++;
	}
done:
	FSW_RUNLOCK(fsw);
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}

	/*
	 * It's possible for packets to be left in dpktq because
	 * gso_pkts_estimate is only an estimate. The actual number
	 * of packets needed could be less.
	 */
	uint32_t dpktq_len;
	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
		DTRACE_SKYWALK2(leftover__dev__pkts,
		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
		dp_free_pktq(fsw, &dpktq);
	}
	KPKTQ_FINI(&dpktq);
}
4160
4161 static inline void
fsw_dev_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4162 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4163 struct proc *p)
4164 {
4165 #pragma unused(p)
4166 uint32_t total_pkts = 0, total_bytes = 0;
4167
4168 for (;;) {
4169 struct pktq pktq;
4170 KPKTQ_INIT(&pktq);
4171 uint32_t n_bytes;
4172 fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4173 if (n_bytes == 0) {
4174 break;
4175 }
4176 total_pkts += KPKTQ_LEN(&pktq);
4177 total_bytes += n_bytes;
4178
4179 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4180 fsw_receive(fsw, &pktq);
4181 } else {
4182 fsw_dev_input_netem_enqueue(fsw, &pktq);
4183 }
4184 KPKTQ_FINI(&pktq);
4185 }
4186
4187 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4188 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4189 uint32_t, total_bytes);
4190
4191 /* compute mitigation rate for delivered traffic */
4192 if (__probable(r->ckr_netif_mit_stats != NULL)) {
4193 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4194 }
4195 }
4196
4197 static inline void
fsw_user_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4198 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4199 struct proc *p)
4200 {
4201 #pragma unused(p)
4202 static packet_trace_id_t trace_id = 0;
4203 uint32_t total_pkts = 0, total_bytes = 0;
4204
4205 for (;;) {
4206 struct pktq pktq;
4207 KPKTQ_INIT(&pktq);
4208 uint32_t n_bytes;
4209 uint32_t gso_pkts_estimate = 0;
4210
4211 fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4212 &gso_pkts_estimate);
4213 if (n_bytes == 0) {
4214 break;
4215 }
4216 total_pkts += KPKTQ_LEN(&pktq);
4217 total_bytes += n_bytes;
4218
4219 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
4220 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4221 KPKTQ_FIRST(&pktq)->pkt_trace_id);
4222
4223 if (gso_pkts_estimate > 0) {
4224 dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4225 } else {
4226 dp_tx_pktq(fsw, &pktq);
4227 }
4228 dp_free_pktq(fsw, &pktq);
4229 KPKTQ_FINI(&pktq);
4230 }
4231 kr_update_stats(r, total_pkts, total_bytes);
4232
4233 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4234 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4235 uint32_t, total_bytes);
4236 }
4237
4238 void
fsw_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4239 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4240 struct proc *p)
4241 {
4242 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4243
4244 ASSERT(sk_is_sync_protected());
4245 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4246 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4247
4248 if (vpna->vpna_nx_port == FSW_VP_DEV) {
4249 fsw_dev_ring_flush(fsw, r, p);
4250 } else {
4251 fsw_user_ring_flush(fsw, r, p);
4252 }
4253 }
4254
4255 int
fsw_dp_ctor(struct nx_flowswitch * fsw)4256 fsw_dp_ctor(struct nx_flowswitch *fsw)
4257 {
4258 uint32_t fe_cnt = fsw_fe_table_size;
4259 uint32_t fob_cnt = fsw_flow_owner_buckets;
4260 uint32_t frb_cnt = fsw_flow_route_buckets;
4261 uint32_t frib_cnt = fsw_flow_route_id_buckets;
4262 struct kern_nexus *nx = fsw->fsw_nx;
4263 char name[64];
4264 const char *__null_terminated fsw_name = NULL;
4265 int error = 0;
4266
4267 /* just in case */
4268 if (fe_cnt == 0) {
4269 fe_cnt = NX_FSW_FE_TABLESZ;
4270 ASSERT(fe_cnt != 0);
4271 }
4272 if (fob_cnt == 0) {
4273 fob_cnt = NX_FSW_FOB_HASHSZ;
4274 ASSERT(fob_cnt != 0);
4275 }
4276 if (frb_cnt == 0) {
4277 frb_cnt = NX_FSW_FRB_HASHSZ;
4278 ASSERT(frb_cnt != 0);
4279 }
4280 if (frib_cnt == 0) {
4281 frib_cnt = NX_FSW_FRIB_HASHSZ;
4282 ASSERT(frib_cnt != 0);
4283 }
4284
4285 /* make sure fe_cnt is a power of two, else round up */
4286 if ((fe_cnt & (fe_cnt - 1)) != 0) {
4287 fe_cnt--;
4288 fe_cnt |= (fe_cnt >> 1);
4289 fe_cnt |= (fe_cnt >> 2);
4290 fe_cnt |= (fe_cnt >> 4);
4291 fe_cnt |= (fe_cnt >> 8);
4292 fe_cnt |= (fe_cnt >> 16);
4293 fe_cnt++;
4294 }
4295
4296 /* make sure frb_cnt is a power of two, else round up */
4297 if ((frb_cnt & (frb_cnt - 1)) != 0) {
4298 frb_cnt--;
4299 frb_cnt |= (frb_cnt >> 1);
4300 frb_cnt |= (frb_cnt >> 2);
4301 frb_cnt |= (frb_cnt >> 4);
4302 frb_cnt |= (frb_cnt >> 8);
4303 frb_cnt |= (frb_cnt >> 16);
4304 frb_cnt++;
4305 }
4306
4307 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4308 &nexus_lock_attr);
4309 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4310 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4311 TAILQ_INIT(&fsw->fsw_linger_head);
4312 lck_mtx_init(&fsw->fsw_rxstrc_lock, &nexus_lock_group, &nexus_lock_attr);
4313 TAILQ_INIT(&fsw->fsw_rxstrc_head);
4314
4315 fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4316 error = nx_advisory_alloc(nx, fsw_name,
4317 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4318 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4319 if (error != 0) {
4320 fsw_dp_dtor(fsw);
4321 return error;
4322 }
4323
4324 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4325 if (fsw->fsw_flow_mgr == NULL) {
4326 fsw_dp_dtor(fsw);
4327 return error;
4328 }
4329
4330 /* generic name; will be customized upon ifattach */
4331 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4332 FSW_REAP_THREADNAME, name, "");
4333
4334 if (kernel_thread_start(fsw_reap_thread_func, fsw,
4335 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4336 panic_plain("%s: can't create thread", __func__);
4337 /* NOTREACHED */
4338 __builtin_unreachable();
4339 }
4340 /* this must not fail */
4341 VERIFY(fsw->fsw_reap_thread != NULL);
4342
4343 SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw));
4344
4345
4346 return error;
4347 }
4348
4349 void
fsw_dp_dtor(struct nx_flowswitch * fsw)4350 fsw_dp_dtor(struct nx_flowswitch *fsw)
4351 {
4352 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
4353 uint64_t s = (1000 * NSEC_PER_SEC); /* 1 sec */
4354 uint32_t i = 0;
4355
4356 #if (DEVELOPMENT || DEBUG)
4357 if (fsw->fsw_rps_threads != NULL) {
4358 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4359 fsw_rps_thread_join(fsw, i);
4360 }
4361 kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4362 fsw->fsw_rps_threads);
4363 }
4364 #endif /* !DEVELOPMENT && !DEBUG */
4365
4366 nx_advisory_free(fsw->fsw_nx);
4367
4368 if (fsw->fsw_reap_thread != THREAD_NULL) {
4369 /* signal thread to begin self-termination */
4370 lck_mtx_lock(&fsw->fsw_reap_lock);
4371 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4372
4373 /*
4374 * And wait for thread to terminate; use another
4375 * wait channel here other than fsw_reap_flags to
4376 * make it more explicit. In the event the reaper
4377 * thread misses a wakeup, we'll try again once
4378 * every second (except for the first time).
4379 */
4380 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4381 uint64_t t = 0;
4382
4383 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4384 clock_absolutetime_interval_to_deadline(t, &t);
4385 ASSERT(t != 0);
4386
4387 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4388 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4389 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4390 }
4391 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
4392 THREAD_UNINT, t);
4393 lck_mtx_unlock(&fsw->fsw_reap_lock);
4394 thread_block(THREAD_CONTINUE_NULL);
4395 lck_mtx_lock(&fsw->fsw_reap_lock);
4396 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4397 }
4398 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4399 lck_mtx_unlock(&fsw->fsw_reap_lock);
4400 fsw->fsw_reap_thread = THREAD_NULL;
4401 }
4402
4403 /* free any remaining flow entries in the linger list */
4404 fsw_linger_purge(fsw);
4405 fsw_rxstrc_purge(fsw);
4406
4407 if (fsw->fsw_flow_mgr != NULL) {
4408 flow_mgr_destroy(fsw->fsw_flow_mgr);
4409 fsw->fsw_flow_mgr = NULL;
4410 }
4411
4412
4413 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4414 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4415 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4416 }
4417
/*
 * Place a torn-down, destroyed flow entry on the flowswitch's linger
 * list so it survives until its linger wait expires; the reaper thread
 * frees it later.  The caller's reference on `fe` is donated to the
 * linger list.
 */
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	net_update_uptime();

	/* entry must be fully torn down but not yet lingering */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	/* absolute (uptime) expiry; checked by the reaper */
	fe->fe_linger_expire = (net_uptime() + fe->fe_linger_wait);
	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	/* wake the reaper so it can notice the new entry */
	fsw_reap_sched(fsw);
}
4444
/*
 * Unlink `fe` from the given linger list and drop the list's reference.
 * Caller holds the linger lock (or owns the list exclusively) and is
 * responsible for adjusting the per-fsw linger count.
 */
static void
fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
	/* release the reference donated by fsw_linger_insert() */
	flow_entry_release(&fe);
}
4460
/*
 * Remove `fe` from its flowswitch's linger list and decrement the
 * linger count.  The linger lock must be held by the caller.
 */
static void
fsw_linger_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);

	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
	VERIFY(fsw->fsw_linger_cnt != 0);
	fsw->fsw_linger_cnt--;
}
4472
/*
 * Remove and release every flow entry on the flowswitch's linger list,
 * regardless of expiry.  Used during datapath teardown.
 */
void
fsw_linger_purge(struct nx_flowswitch *fsw)
{
	struct flow_entry *fe, *tfe;

	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
		fsw_linger_remove(fe);
	}
	ASSERT(fsw->fsw_linger_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);
}
4486
/*
 * Queue a torn-down AOP-offload flow entry for Rx steering rule cleanup
 * by the reaper thread.  Takes an additional reference on `fe` for the
 * rxstrc list (unlike fsw_linger_insert(), the caller keeps its own).
 */
void
fsw_rxstrc_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	/* entry must be torn down, offloaded, and not already pending */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
	ASSERT(!(fe->fe_flags & FLOWENTF_RXSTRC_PENDING));
	os_atomic_or(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);

	/* reference held by the rxstrc list until removal */
	flow_entry_retain(fe);

	lck_mtx_lock_spin(&fsw->fsw_rxstrc_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_rxstrc_head, fe, fe_rxstrc_link);
	fsw->fsw_rxstrc_cnt++;
	VERIFY(fsw->fsw_rxstrc_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);

	/* wake the reaper so it can process the pending cleanup */
	fsw_reap_sched(fsw);
}
4511
/*
 * Unlink `fe` from the given rxstrc list and drop the list's reference.
 * Caller holds the rxstrc lock (or owns the list exclusively) and is
 * responsible for adjusting the per-fsw rxstrc count.
 */
static void
fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head *rxstrc_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
	ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
	os_atomic_andnot(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);

	TAILQ_REMOVE(rxstrc_head, fe, fe_rxstrc_link);
	/* release the reference taken by fsw_rxstrc_insert() */
	flow_entry_release(&fe);
}
4528
/*
 * Remove `fe` from its flowswitch's rxstrc list and decrement the
 * rxstrc count.  The rxstrc lock must be held by the caller.
 */
static void
fsw_rxstrc_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_rxstrc_lock, LCK_MTX_ASSERT_OWNED);

	fsw_rxstrc_remove_internal(&fsw->fsw_rxstrc_head, fe);
	VERIFY(fsw->fsw_rxstrc_cnt != 0);
	fsw->fsw_rxstrc_cnt--;
}
4540
/*
 * Remove and release every flow entry pending Rx steering rule cleanup.
 * Used during datapath teardown.
 */
void
fsw_rxstrc_purge(struct nx_flowswitch *fsw)
{
	struct flow_entry *fe, *tfe;

	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_rxstrc_head, fe_rxstrc_link, tfe) {
		fsw_rxstrc_remove(fe);
	}
	ASSERT(fsw->fsw_rxstrc_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
}
4554
/*
 * Scan every port of the flowswitch's nexus for channels whose Rx ring
 * shows a stall (packets enqueued but not dequeued for longer than
 * fsw_rx_stall_thresh seconds) and, if fsw_rx_stall_defunct is set,
 * defunct the stalled channel.  A stall is always counted and logged.
 */
static void
fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
{
	struct kern_nexus *nx;
	uint64_t now = net_uptime();

	nx = fsw->fsw_nx;

	/* Walk through all channels and check for Rx stall condition */
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t nxport) {
		struct nexus_adapter *na = nx_port_get_na(nx, nxport);
		uint64_t elapsed, enqueue_ts, dequeue_ts;
		struct __kern_channel_ring *ring;
		struct kern_channel *ch;
		struct proc *p;

		/* skip ports with no adapter, no activity, or no Rx rings */
		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			return;
		}
		ch = (struct kern_channel *)na->na_private;
		if (ch == NULL) {
			return;
		}
		ring = KR_SINGLE(na->na_rx_rings);
		enqueue_ts = ring->ckr_rx_enqueue_ts;
		dequeue_ts = ring->ckr_rx_dequeue_ts;
		/* Elapsed time since last Rx enqueue */
		elapsed = now - enqueue_ts;
		/* stalled: consumer is behind and no enqueue progress */
		if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
			p = proc_find(ch->ch_pid);
			if (p == NULL) {
				return;
			}
			if (fsw_rx_stall_defunct) {
				kern_channel_defunct(p, ch);
			}
			proc_rele(p);
			DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
			    struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
			FSW_STATS_INC(FSW_STATS_RX_STALL);
			SK_ERR("Rx stall detected in proc %s(%d) (%s): "
			    "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
			    "defunct: %s",
			    ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
			    elapsed, now, enqueue_ts, dequeue_ts,
			    fsw_rx_stall_defunct ? "yes" : "no");
		}
	});
	/* END IGNORE CODESTYLE */
}
4607
4608 void
fsw_reap_sched(struct nx_flowswitch * fsw)4609 fsw_reap_sched(struct nx_flowswitch *fsw)
4610 {
4611 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4612 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4613 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4614 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4615 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4616 }
4617 lck_mtx_unlock(&fsw->fsw_reap_lock);
4618 }
4619
/*
 * Entry point of the flowswitch reaper thread: names the thread, then
 * parks on fsw_reap_flags and hands control to the continuation
 * fsw_reap_thread_cont(), which performs all subsequent work each time
 * the thread is woken.  Never returns.
 */
__attribute__((noreturn))
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *__single fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	/*
	 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
	 * checks to ensure source contains the null terminator, by doing a
	 * linear scan of the string.
	 */
	thread_set_thread_name(current_thread(),
	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	/* block until fsw_reap_sched() (or dtor) wakes us */
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}
4646
4647 __attribute__((noreturn))
4648 static void
fsw_reap_thread_cont(void * v,wait_result_t wres)4649 fsw_reap_thread_cont(void *v, wait_result_t wres)
4650 {
4651 struct nx_flowswitch *__single fsw = v;
4652 boolean_t low;
4653 uint64_t t = 0;
4654
4655 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4656
4657 lck_mtx_lock(&fsw->fsw_reap_lock);
4658 if (__improbable(wres == THREAD_INTERRUPTED ||
4659 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4660 goto terminate;
4661 }
4662
4663 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4664 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4665 lck_mtx_unlock(&fsw->fsw_reap_lock);
4666
4667 net_update_uptime();
4668
4669 /* prevent detach from happening while we're here */
4670 if (!fsw_detach_barrier_add(fsw)) {
4671 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4672 t = 0;
4673 } else {
4674 uint32_t fe_nonviable, fe_freed, fe_aborted;
4675 uint32_t fr_freed, fr_resid = 0;
4676 struct ifnet *ifp = fsw->fsw_ifp;
4677 uint64_t i = FSW_REAP_IVAL;
4678 uint64_t now = net_uptime();
4679 uint64_t last;
4680
4681 ASSERT(fsw->fsw_ifp != NULL);
4682
4683 /*
4684 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4685 */
4686 fe_nonviable = fsw_process_deferred(fsw);
4687
4688 /*
4689 * Pass 2: remove any expired lingering flows.
4690 */
4691 fe_freed = fsw_process_linger(fsw, &fe_aborted);
4692
4693 /*
4694 * Pass 3: process any pending Rx steering rule cleanup flows
4695 */
4696 fsw_process_rxstrc(fsw);
4697
4698 /*
4699 * Pass 4: prune idle flow routes.
4700 */
4701 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4702 ifp, &fr_resid);
4703
4704 /*
4705 * Pass 5: prune flow table
4706 *
4707 */
4708 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4709
4710 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4711 "fe_aborted %u fr_freed %u/%u",
4712 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4713 (fe_nonviable + fsw->fsw_pending_nonviable),
4714 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
4715 (fe_freed + fr_resid));
4716
4717 /* see if VM memory level is critical */
4718 low = skmem_lowmem_check();
4719
4720 /*
4721 * If things appear to be idle, we can prune away cached
4722 * object that have fallen out of the working sets (this
4723 * is different than purging). Every once in a while, we
4724 * also purge the caches. Note that this is done across
4725 * all flowswitch instances, and so we limit this to no
4726 * more than once every FSW_REAP_SK_THRES seconds.
4727 */
4728 last = os_atomic_load(&fsw_reap_last, relaxed);
4729 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4730 os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4731 fsw_purge_cache(fsw, low);
4732
4733 /* increase sleep interval if idle */
4734 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4735 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4736 i <<= 3;
4737 }
4738 } else if (last == 0) {
4739 os_atomic_store(&fsw_reap_last, now, release);
4740 }
4741
4742 /*
4743 * Additionally, run thru the list of channels and prune
4744 * or purge away cached objects on "idle" channels. This
4745 * check is rate limited to no more than once every
4746 * FSW_DRAIN_CH_THRES seconds.
4747 */
4748 last = fsw->fsw_drain_channel_chk_last;
4749 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4750 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4751 fsw->fsw_flow_mgr->fm_name);
4752
4753 fsw->fsw_drain_channel_chk_last = now;
4754 fsw_drain_channels(fsw, now, low);
4755 } else if (__improbable(last == 0)) {
4756 fsw->fsw_drain_channel_chk_last = now;
4757 }
4758
4759 /*
4760 * Finally, invoke the interface's reap callback to
4761 * tell it to prune or purge away cached objects if
4762 * it is idle. This check is rate limited to no more
4763 * than once every FSW_REAP_IF_THRES seconds.
4764 */
4765 last = fsw->fsw_drain_netif_chk_last;
4766 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4767 ASSERT(fsw->fsw_nifna != NULL);
4768
4769 if (ifp->if_na_ops != NULL &&
4770 ifp->if_na_ops->ni_reap != NULL) {
4771 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4772 fsw->fsw_flow_mgr->fm_name);
4773 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4774 FSW_REAP_IF_THRES, low);
4775 }
4776
4777 fsw->fsw_drain_netif_chk_last = now;
4778 } else if (__improbable(last == 0)) {
4779 fsw->fsw_drain_netif_chk_last = now;
4780 }
4781
4782 /* emit periodic interface stats ktrace */
4783 last = fsw->fsw_reap_last;
4784 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4785 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4786 ifp->if_data.ifi_ibytes * 8,
4787 ifp->if_data.ifi_opackets,
4788 ifp->if_data.ifi_obytes * 8);
4789
4790 fsw->fsw_reap_last = now;
4791 } else if (__improbable(last == 0)) {
4792 fsw->fsw_reap_last = now;
4793 }
4794
4795 /* Check for Rx stall condition every fsw_rx_stall_thresh seconds */
4796 last = fsw->fsw_rx_stall_chk_last;
4797 if (fsw_rx_stall_thresh != 0) {
4798 if (last != 0 && (now - last) >= fsw_rx_stall_thresh) {
4799 fsw_defunct_rx_stall_channel(fsw);
4800 fsw->fsw_rx_stall_chk_last = now;
4801 } else if (__improbable(last == 0)) {
4802 fsw->fsw_rx_stall_chk_last = now;
4803 }
4804 }
4805
4806 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4807 clock_absolutetime_interval_to_deadline(t, &t);
4808 ASSERT(t != 0);
4809
4810 /* allow any pending detach to proceed */
4811 fsw_detach_barrier_remove(fsw);
4812 }
4813
4814 lck_mtx_lock(&fsw->fsw_reap_lock);
4815 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4816 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4817 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
4818 THREAD_UNINT, t);
4819 lck_mtx_unlock(&fsw->fsw_reap_lock);
4820 thread_block_parameter(fsw_reap_thread_cont, fsw);
4821 /* NOTREACHED */
4822 __builtin_unreachable();
4823 } else {
4824 terminate:
4825 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4826 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4827 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4828 /*
4829 * And signal any thread waiting for us to terminate;
4830 * wait channel here other than fsw_reap_flags to make
4831 * it more explicit.
4832 */
4833 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4834 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4835 }
4836 lck_mtx_unlock(&fsw->fsw_reap_lock);
4837
4838 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4839
4840 /* for the extra refcnt from kernel_thread_start() */
4841 thread_deallocate(current_thread());
4842 /* this is the end */
4843 thread_terminate(current_thread());
4844 /* NOTREACHED */
4845 __builtin_unreachable();
4846 }
4847
4848 /* must never get here */
4849 VERIFY(0);
4850 /* NOTREACHED */
4851 __builtin_unreachable();
4852 }
4853
/*
 * Walk every port of this flowswitch's nexus and prune or purge the
 * cached objects of each active channel adapter, based on how recently
 * the channel did work.  `now' is the caller's net_uptime() snapshot;
 * `low' indicates a critical VM memory level, which forces a purge.
 * Called from the reaper path; the caller rate-limits invocations.
 */
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		boolean_t purge;
		struct nexus_adapter *na = nx_port_get_na(nx, p);

		/* port has no adapter attached; nothing to drain */
		if (na == NULL) {
			DTRACE_SKYWALK1(ch__drain__na__null, struct nexus_adapter *, na);
			return;
		}

		/*
		 * If NA is deactivated, no need to proceed further with channel drain.
		 * Note: fsw_vp_na_activate takes FSW_WLOCK before clearing the
		 * NAF_ACTIVE flag.
		 */
		if ((na->na_flags & NAF_ACTIVE) == 0) {
			DTRACE_SKYWALK1(ch__drain__na__inactive, struct nexus_adapter *, na);
			return;
		}

		/* never saw any work (na_work_ts == 0) or no Rx rings; skip */
		if (na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			DTRACE_SKYWALK1(ch__drain__na__invalid, struct nexus_adapter *, na);
			return;
		}

		/*
		 * If some activity happened in the last FSW_DRAIN_CH_THRES
		 * seconds on this channel, we reclaim memory if the channel
		 * throughput is less than the reap threshold value.
		 */
		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			struct __kern_channel_ring *__single ring;
			channel_ring_stats *stats;
			uint64_t bps;

			ring = KR_SINGLE(na->na_rx_rings);
			stats = &ring->ckr_stats;
			bps = stats->crs_bytes_per_second;

			/* low-throughput active channel: prune only, never purge */
			if (bps < fsw_channel_reap_thresh) {
				purge = FALSE;
				na_drain(na, purge);
			}
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again. Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge); /* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}
4929
4930 static void
fsw_purge_cache(struct nx_flowswitch * fsw,boolean_t low)4931 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4932 {
4933 #pragma unused(fsw)
4934 uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4935 uint32_t p = fsw_flow_purge_thresh;
4936 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4937
4938 SK_DF(SK_VERB_FLOW, "%s: %s caches",
4939 fsw->fsw_flow_mgr->fm_name,
4940 (purge ? "purge" : "prune"));
4941
4942 skmem_cache_reap_now(sk_fo_cache, purge);
4943 skmem_cache_reap_now(sk_fe_cache, purge);
4944 skmem_cache_reap_now(sk_fab_cache, purge);
4945 skmem_cache_reap_now(flow_route_cache, purge);
4946 skmem_cache_reap_now(flow_stats_cache, purge);
4947 netns_reap_caches(purge);
4948 skmem_reap_caches(purge);
4949
4950 #if CONFIG_MBUF_MCACHE
4951 if (if_is_fsw_transport_netagent_enabled() && purge) {
4952 mbuf_drain(FALSE);
4953 }
4954 #endif /* CONFIG_MBUF_MCACHE */
4955 }
4956
4957 static void
fsw_flow_handle_low_power(struct nx_flowswitch * fsw,struct flow_entry * fe)4958 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4959 {
4960 /* When the interface is in low power mode, the flow is nonviable */
4961 if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4962 os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4963 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4964 }
4965 }
4966
/*
 * Reap pass over all flow owners: commit any deferred withdraw or
 * nonviable requests by tearing the flow entries down.  NECP/netagent
 * notifications cannot be issued while the bucket locks are held, so
 * the minimal state needed (UUID plus two booleans) is staged on a
 * local list and processed after the scan.  Returns the number of
 * flows torn down in this pass.
 */
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests. We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			/* contended buckets are retried on the next reap pass */
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				/* commit a deferred half-close exactly once */
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/*
				 * do this outside the flow bucket lock
				 *
				 * NOTE(review): this Z_WAITOK allocation still
				 * runs with the fob lock held; the comment
				 * appears to describe the NECP/netagent work
				 * below rather than the alloc — confirm intent.
				 */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}
5082
/*
 * Reap pass over the lingering (torn-down but not yet freed) flows:
 * generate any pending TCP RSTs, and free entries whose linger time
 * has expired.  Returns the number of entries freed; *abort is set to
 * the number of RSTs generated in this pass.
 */
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = net_uptime();
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock. Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		/* only fully torn-down flows are placed on the linger list */
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "fe \"%s\" [RST]",
			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;
		}
		++i;
	}
	/* every entry taken must have been visited; freed cannot exceed it */
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}
5159
5160 static void
fsw_process_rxstrc(struct nx_flowswitch * fsw)5161 fsw_process_rxstrc(struct nx_flowswitch *fsw)
5162 {
5163 struct flow_entry_rxstrc_head rxstrc_head =
5164 TAILQ_HEAD_INITIALIZER(rxstrc_head);
5165 struct flow_entry *fe, *tfe;
5166
5167 /*
5168 * We don't want to contend with the datapath, so move
5169 * everything that's in the rxstrc list into a local list.
5170 * This allows us to cleanup Rx steering rules or free the flow entry
5171 * outside the lock.
5172 */
5173 lck_mtx_lock(&fsw->fsw_rxstrc_lock);
5174 TAILQ_CONCAT(&rxstrc_head, &fsw->fsw_rxstrc_head, fe_rxstrc_link);
5175 ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
5176 fsw->fsw_rxstrc_cnt = 0;
5177 lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
5178
5179 TAILQ_FOREACH_SAFE(fe, &rxstrc_head, fe_rxstrc_link, tfe) {
5180 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
5181 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
5182 ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
5183 ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
5184
5185 flow_entry_rx_steering_rule_cleanup(fsw, fe);
5186 fsw_rxstrc_remove_internal(&rxstrc_head, fe);
5187 fe = NULL;
5188 }
5189 }
5190
5191 __attribute__((always_inline))
5192 static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet * ifp,kern_packet_t ph)5193 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
5194 {
5195 switch (__packet_get_traffic_class(ph)) {
5196 case PKT_TC_BE:
5197 ifp->if_tc.ifi_ibepackets++;
5198 ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5199 break;
5200 case PKT_TC_BK:
5201 ifp->if_tc.ifi_ibkpackets++;
5202 ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5203 break;
5204 case PKT_TC_VI:
5205 ifp->if_tc.ifi_ivipackets++;
5206 ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5207 break;
5208 case PKT_TC_VO:
5209 ifp->if_tc.ifi_ivopackets++;
5210 ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5211 break;
5212 default:
5213 break;
5214 }
5215 }
5216
5217 __attribute__((always_inline))
5218 static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet * ifp,uint32_t svc,uint32_t cnt,uint32_t len)5219 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5220 uint32_t cnt, uint32_t len)
5221 {
5222 switch (svc) {
5223 case PKT_TC_BE:
5224 ifp->if_tc.ifi_obepackets += cnt;
5225 ifp->if_tc.ifi_obebytes += len;
5226 break;
5227 case PKT_TC_BK:
5228 ifp->if_tc.ifi_obkpackets += cnt;
5229 ifp->if_tc.ifi_obkbytes += len;
5230 break;
5231 case PKT_TC_VI:
5232 ifp->if_tc.ifi_ovipackets += cnt;
5233 ifp->if_tc.ifi_ovibytes += len;
5234 break;
5235 case PKT_TC_VO:
5236 ifp->if_tc.ifi_ovopackets += cnt;
5237 ifp->if_tc.ifi_ovobytes += len;
5238 break;
5239 default:
5240 break;
5241 }
5242 }
5243