1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54 /*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <kern/uipc_domain.h>
94 #include <sys/kdebug.h>
95 #include <sys/sdt.h>
96 #include <net/bpf.h>
97 #include <net/if_ports_used.h>
98 #include <net/pktap.h>
99 #include <net/droptap.h>
100 #include <net/pktsched/pktsched_netem.h>
101 #include <netinet/tcp.h>
102 #include <netinet/udp.h>
103 #include <netinet/ip.h>
104 #include <netinet/ip6.h>
105 #include <netinet/in_var.h>
106
107 extern kern_return_t thread_terminate(thread_t);
108
109 #define FSW_ZONE_MAX 256
110 #define FSW_ZONE_NAME "skywalk.nx.fsw"
111
112 static uint64_t fsw_reap_last __sk_aligned(8);
113 static uint64_t fsw_want_purge __sk_aligned(8);
114
115 #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
116 static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
117
118 #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
119 static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
120
121 #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
122 static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
123
124 #define NX_FSW_FRIB_HASHSZ 13 /* some prime */
125 static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
126
127 #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
128 static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
129
130 #define NX_FSW_RX_STALL_THRES 0 /* seconds (0 = disable) */
131 static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;
132
133 #define NX_FSW_RX_STALL_DEFUNCT 1 /* defunct Rx-stalled channel (0 = disable) */
134 static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;
135
136 #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
137 static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
138
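/*
 * The thresholds below are derived from the reap interval: the "<< 5"
 * values work out to roughly 32 reap intervals (32 seconds with the default
 * 1-second interval), while interface stats are refreshed on every pass.
 */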
139 #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
140 #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
141 #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
142 #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
143 #define FSW_IFSTATS_THRES 1
144
145 #define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
146 uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
147
148 #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
149
150 uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
151 uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
152 uint32_t fsw_gso_batch = 8;
153 #if (DEVELOPMENT || DEBUG)
154 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
155 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
156 "flowswitch Rx batch size");
157 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
158 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
159 "flowswitch Tx batch size");
160 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
162 "flowswitch GSO batch size");
163 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
164 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
165 "flowswitch channel reap threshold throughput (bytes/sec)");
166 #endif /* !DEVELOPMENT && !DEBUG */
167
168 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
169 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
170 "flowswitch RX aggregation for tcp flows (enable/disable)");
171 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
172 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
173 "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
174 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
175 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
176 "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
177
178 /*
179 * IP reassembly
180 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
181 * enable/disable the reassembly routine regardless of whether the
182 * transport netagent is enabled or not.
183 *
184 * 'fsw_ip_reass' is a tri-state:
185 * 0 means force IP reassembly off
186 * 1 means force IP reassembly on
187 * 2 means don't force the value, use what's appropriate for this flowswitch
188 */
189 #define FSW_IP_REASS_FORCE_OFF 0
190 #define FSW_IP_REASS_FORCE_ON 1
191 #define FSW_IP_REASS_AUTO 2
192
193 uint32_t fsw_ip_reass = FSW_IP_REASS_AUTO;
194
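/*
 * Sysctl handler for "kern.skywalk.flowswitch.ip_reass".  Values larger
 * than FSW_IP_REASS_AUTO (2) are rejected with EINVAL; e.g. forcing
 * reassembly on from the shell:
 *
 *	sysctl kern.skywalk.flowswitch.ip_reass=1
 */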
195 static int
196 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
197 {
198 #pragma unused(oidp, arg1, arg2)
199 unsigned int new_value;
200 int changed;
201 int error;
202
203 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
204 &new_value, &changed);
205 if (error == 0 && changed != 0) {
206 if (new_value > FSW_IP_REASS_AUTO) {
207 return EINVAL;
208 }
209 fsw_ip_reass = new_value;
210 }
211 return error;
212 }
213
214 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
215 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
216 0, 0, fsw_ip_reass_sysctl, "IU",
217 "adjust flowswitch IP reassembly");
218
219 #if (DEVELOPMENT || DEBUG)
220 static uint64_t _fsw_inject_error = 0;
221 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
222 _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
223 &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
224
225 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
226 if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
227 SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
228 if ((_f) != NULL) \
229 (_f)(__VA_ARGS__); \
230 } \
231 } while (0)
232
233 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
234 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
235 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
236 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
237 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
238 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
239 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
240 flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
241 &fsw_flow_route_id_buckets, 0, "");
242 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
243 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
244 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
245 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
246 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
247 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
248 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
249 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
250 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
252 #else
253 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
254 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
255 #endif /* !DEVELOPMENT && !DEBUG */
256
257 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
258 struct flow_entry *);
259 static void fsw_reap_thread_func(void *, wait_result_t);
260 static void fsw_reap_thread_cont(void *, wait_result_t);
261 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
262 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
263 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
264 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
265 static void fsw_process_rxstrc(struct nx_flowswitch *);
266
267 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
268 struct __kern_packet *);
269
270 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
271 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
272 uint32_t, uint32_t);
273
274 static int __fsw_dp_inited = 0;
275
276 int
277 fsw_dp_init(void)
278 {
279 static_assert(FSW_VP_DEV == 0);
280 static_assert(FSW_VP_HOST == 1);
281 static_assert((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
282 static_assert((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
283
284 ASSERT(!__fsw_dp_inited);
285
286 flow_mgr_init();
287 flow_init();
288
289 __fsw_dp_inited = 1;
290
291 return 0;
292 }
293
294 void
295 fsw_dp_uninit(void)
296 {
297 if (__fsw_dp_inited) {
298 flow_fini();
299 flow_mgr_fini();
300
301 __fsw_dp_inited = 0;
302 }
303 }
304
305 static void
306 dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
307 {
308 pp_free_pktq(pktq);
309 }
310
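/*
 * Drop helpers.  When a drop tap is attached (droptap_total_tap_count != 0),
 * each dropped packet is first reported to droptap via the input or output
 * path; otherwise the packets are simply freed (the _nofree variant leaves
 * freeing to the caller).  The pktq and single-packet variants also account
 * the drops in FSW_STATS_DROP.
 */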
311 #define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do { \
312 uint32_t _len = KPKTQ_LEN(pktq); \
313 if (KPKTQ_EMPTY(pktq)) { \
314 ASSERT(_len == 0); \
315 break; \
316 } \
317 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
318 FSW_STATS_ADD(FSW_STATS_DROP, _len); \
319 DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
320 if (__probable(droptap_total_tap_count == 0)) { \
321 dp_free_pktq(fsw, pktq); \
322 break; \
323 } \
324 drop_func_t dropfunc; \
325 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
326 struct __kern_packet *kpkt = KPKTQ_FIRST(pktq); \
327 struct __kern_packet *next_pkt; \
328 for (; kpkt != NULL; kpkt = next_pkt) { \
329 next_pkt = kpkt->pkt_nextpkt; \
330 dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags, \
331 fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, \
332 0, 0); \
333 } \
334 dp_free_pktq(fsw, pktq); \
335 } while (0)
336
337 #define dp_drop_pkt_single_nofree(fsw, pkt, outgoing, _reason, _flags) do { \
338 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
339 FSW_STATS_ADD(FSW_STATS_DROP, 1); \
340 if (__probable(droptap_total_tap_count == 0)) { \
341 break; \
342 } \
343 drop_func_t dropfunc; \
344 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
345 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
346 fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
347 } while (0)
348
349 #define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do { \
350 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
351 FSW_STATS_ADD(FSW_STATS_DROP, 1); \
352 if (__probable(droptap_total_tap_count == 0)) { \
353 pp_free_packet_single(pkt); \
354 break; \
355 } \
356 drop_func_t dropfunc; \
357 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
358 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
359 fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
360 pp_free_packet_single(pkt); \
361 } while (0)
362
363 #define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do { \
364 if (__probable(droptap_total_tap_count == 0)) { \
365 pp_free_packet_chain(pkt, NULL); \
366 break; \
367 } \
368 drop_func_t dropfunc; \
369 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
370 struct __kern_packet *next_pkt; \
371 for (; pkt != NULL; pkt = next_pkt) { \
372 next_pkt = pkt->pkt_nextpkt; \
373 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
374 NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL, \
375 0, 0); \
376 } \
377 pp_free_packet_chain(pkt, NULL); \
378 } while (0)
379
380
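/*
 * Mirror a batch of flow packets to pktap.  Packets for the host port are
 * normally tapped in the DLIL input path instead; the exception is inbound
 * TCP, which is tapped here so it is visible before Rx aggregation.
 */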
381 SK_NO_INLINE_ATTRIBUTE
382 void
383 fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
384 bool input)
385 {
386 pid_t pid;
387 char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
388 const char *__null_terminated proc_name = NULL;
389 pid_t epid;
390 char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
391 const char *__null_terminated eproc_name = NULL;
392 sa_family_t af;
393 bool tap_early = false;
394 struct __kern_packet *pkt;
395
396 ASSERT(fe != NULL);
397 ASSERT(fsw->fsw_ifp != NULL);
398
399 if (fe->fe_nx_port == FSW_VP_HOST) {
400 /* allow packets to be tapped before aggregation happens */
401 tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
402 if (!tap_early) {
403 /* all other traffic will be tapped in the dlil input path */
404 return;
405 }
406 }
407 if (fe->fe_key.fk_ipver == IPVERSION) {
408 af = AF_INET;
409 } else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
410 af = AF_INET6;
411 } else {
412 return;
413 }
414
415 pid = fe->fe_pid;
416 if (fe->fe_proc_name[0] != '\0') {
417 proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
418 fe->fe_proc_name, sizeof(fe->fe_proc_name));
419 }
420 epid = fe->fe_epid;
421 if (fe->fe_eproc_name[0] != '\0') {
422 eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
423 fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
424 }
425 if (input) {
426 KPKTQ_FOREACH(pkt, pktq) {
427 pktap_input_packet(fsw->fsw_ifp, af,
428 fsw->fsw_ifp_dlt, pid, proc_name, epid,
429 eproc_name, SK_PKT2PH(pkt), NULL, 0,
430 IPPROTO_TCP, fe->fe_flowid,
431 tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
432 }
433 } else {
434 KPKTQ_FOREACH(pkt, pktq) {
435 pktap_output_packet(fsw->fsw_ifp, af,
436 fsw->fsw_ifp_dlt, pid, proc_name, epid,
437 eproc_name, SK_PKT2PH(pkt), NULL, 0,
438 0, 0, PTH_FLAG_NEXUS_CHAN);
439 }
440 }
441 }
442
443 #if (DEVELOPMENT || DEBUG)
444 static void
445 _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
446 int *ret)
447 {
448 static boolean_t _err35_flag_modified = FALSE;
449
450 switch (step) {
451 case 1:
452 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
453 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
454 fr->fr_flags &= ~FLOWRTF_RESOLVED;
455 _err35_flag_modified = TRUE;
456 }
457 break;
458
459 case 2:
460 if (!_err35_flag_modified) {
461 return;
462 }
463 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
464 m_freem(pkt->pkt_mbuf);
465 pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
466 pkt->pkt_mbuf = NULL;
467 }
468 *ret = EJUSTRETURN;
469 fr->fr_flags |= FLOWRTF_RESOLVED;
470 _err35_flag_modified = FALSE;
471 break;
472
473 default:
474 VERIFY(0);
475 /* not reached */
476 }
477 }
478
479 static void
480 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
481 {
482 static boolean_t _err36_flag_modified = FALSE;
483
484 switch (step) {
485 case 1:
486 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
487 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
488 fr->fr_flags &= ~FLOWRTF_RESOLVED;
489 _err36_flag_modified = TRUE;
490 }
491 break;
492
493 case 2:
494 if (!_err36_flag_modified) {
495 return;
496 }
497 *ret = ENETUNREACH;
498 fr->fr_flags |= FLOWRTF_RESOLVED;
499 _err36_flag_modified = FALSE;
500 break;
501
502 default:
503 VERIFY(0);
504 /* not reached */
505 }
506 }
507 #else /* !DEVELOPMENT && !DEBUG */
508 #define _fsw_error35_handler(...)
509 #define _fsw_error36_handler(...)
510 #endif /* DEVELOPMENT || DEBUG */
511
512 /*
513 * Check if the source packet content can fit into the destination
514 * ring's packet. Returns TRUE if the source packet can fit.
515 * Note: Failures could be caused by misconfigured packet pool sizes,
516  * missing packet size check against the MTU, or if the source packet is from
517 * a compat netif and the attached mbuf is larger than MTU due to LRO.
518 */
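/*
 * Effectively, the check below is:
 *	l2hlen + (spkt->pkt_length - skip_l2hlen [- ETHER_CRC_LEN if FCS])
 *	    <= buflet_count(dph) * PP_BUF_SIZE_DEF(pp) - headroom
 */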
519 static inline boolean_t
520 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
521 uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
522 uint32_t *copy_len)
523 {
524 uint32_t tlen = 0;
525 uint32_t splen = spkt->pkt_length - skip_l2hlen;
526
527 if (l2hlen != 0) {
528 VERIFY(skip_l2hlen == 0);
529 tlen += l2hlen;
530 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
531 splen -= ETHER_CRC_LEN;
532 }
533
534 tlen += splen;
535 *copy_len = splen;
536
537 return tlen <= ((__packet_get_buflet_count(dph) *
538 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
539 headroom);
540 }
541
542 #if SK_LOG
543 /* Hoisted out of line to reduce kernel stack footprint */
544 SK_LOG_ATTRIBUTE
545 static void
546 copy_packet_from_dev_log(struct __kern_packet *spkt,
547 struct __kern_packet *dpkt, struct proc *p)
548 {
549 uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
550 ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
551 SK_VERB_COPY_MBUF : SK_VERB_COPY));
552 char *daddr;
553 uint32_t pkt_len;
554
555 MD_BUFLET_ADDR_ABS(dpkt, daddr);
556 pkt_len = __packet_get_real_data_length(dpkt);
557 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
558 sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
559 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
560 (uint32_t)dpkt->pkt_l2_len);
561 SK_DF(logflags | SK_VERB_DUMP, "%s",
562 sk_dump("buf", daddr, pkt_len, 128));
563 }
564 #else
565 #define copy_packet_from_dev_log(...)
566 #endif /* SK_LOG */
567
568
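/*
 * Copy a packet received on the device ring into a packet allocated from
 * the flowswitch pool (the two nexus don't share a pool).  For
 * flow-classified packets a partial Rx checksum is computed during the copy
 * (skipping the IP header) when sk_cksum_rx is enabled; compat-driver
 * packets are copied out of their attached mbuf instead.
 */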
569 static inline int
570 copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
571 struct __kern_packet *dpkt)
572 {
573 /*
574  * Source and destination nexus don't share the packet pool;
575  * the sync operation here is to:
576 * - alloc packet for the rx(dst) ring
577 * - copy data/metadata from src packet to dst packet
578 * - attach alloc'd packet to rx(dst) ring
579 */
580 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
581 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
582 kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
583 METADATA_SUBTYPE(spkt));
584 boolean_t do_cksum_rx;
585 uint16_t skip_l2h_len = spkt->pkt_l2_len;
586 uint16_t iphlen;
587 uint32_t dlen;
588 int err;
589
590 if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
591 &dlen))) {
592 SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
593 PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
594 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
595 return EINVAL;
596 }
597
598 /* Copy packet metadata */
599 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
600 _PKT_COPY(spkt, dpkt);
601 ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
602 PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
603 ASSERT(dpkt->pkt_mbuf == NULL);
604
605 dpkt->pkt_headroom = 0;
606 dpkt->pkt_l2_len = 0;
607
608 /* don't include IP header from partial sum */
609 if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
610 iphlen = spkt->pkt_flow_ip_hlen;
611 do_cksum_rx = sk_cksum_rx;
612 } else {
613 iphlen = 0;
614 do_cksum_rx = FALSE;
615 }
616
617 /* Copy packet payload */
618 if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
619 (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
620 FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
621 /*
622 * Source packet has truncated contents (just enough for
623  * the classifier) of an mbuf from the compat driver; copy
624  * the entire mbuf contents to the destination packet.
625 */
626 m_adj(spkt->pkt_mbuf, skip_l2h_len);
627 ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
628 fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
629 spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
630 } else {
631 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
632 /*
633 * Source packet has full contents, either from an mbuf
634 * that came up from the compat driver, or because it
635 * originated on the native driver; copy to destination.
636 */
637 fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
638 (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
639 iphlen, 0, FALSE);
640 }
641
642 #if DEBUG || DEVELOPMENT
643 if (__improbable(pkt_trailers > 0)) {
644 dlen += pkt_add_trailers(dph, dlen, iphlen);
645 }
646 #endif /* DEBUG || DEVELOPMENT */
647
648 /* Finalize and attach packet to Rx ring */
649 METADATA_ADJUST_LEN(dpkt, 0, 0);
650 err = __packet_finalize(dph);
651 VERIFY(err == 0);
652
653 copy_packet_from_dev_log(spkt, dpkt, kernproc);
654
655 if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
656 ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
657 mbuf_freem(spkt->pkt_mbuf);
658 KPKT_CLEAR_MBUF_DATA(spkt);
659 } else {
660 fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
661 }
662
663 if (__probable(do_cksum_rx != 0)) {
664 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
665 }
666
667 return 0;
668 }
669
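/*
 * Run IP reassembly on a received fragment, honoring the fsw_ip_reass
 * tri-state (forced off, forced on, or only when the flowswitch has a
 * netagent and active flows).  Returns the original or reassembled packet,
 * or NULL if the fragment was queued awaiting more fragments or was bad
 * and freed.
 */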
670 SK_NO_INLINE_ATTRIBUTE
671 static struct __kern_packet *
672 rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
673 {
674 char *pkt_buf;
675 void *l3_hdr;
676 uint16_t nfrags, tlen;
677 int err = 0;
678
679 switch (fsw_ip_reass) {
680 case FSW_IP_REASS_FORCE_OFF:
681 return pkt;
682 case FSW_IP_REASS_FORCE_ON:
683 break;
684 default:
685 if (!FSW_NETAGENT_ENABLED(fsw) ||
686 flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
687 return pkt;
688 }
689 break;
690 }
691
692 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
693 l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
694
695 ASSERT(fsw->fsw_ipfm != NULL);
696 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
697
698 if (pkt->pkt_flow_ip_ver == IPVERSION) {
699 struct ip *ip = l3_hdr;
700 err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
701 } else {
702 struct ip6_hdr *ip6_hdr = l3_hdr;
703 struct ip6_frag *__single ip6_frag =
704 (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));
705
706 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
707 /* we only handle frag header immediately after v6 header */
708 err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
709 &nfrags, &tlen);
710 }
711 if (__improbable(err != 0)) {
712 /* if we get a bad fragment, free it */
713 pp_free_packet_single(pkt);
714 pkt = NULL;
715 } else {
716 ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
717 }
718
719 return pkt;
720 }
721
722 SK_NO_INLINE_ATTRIBUTE
723 static void
724 rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
725 {
726 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
727 uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
728 kern_packet_t ph = SK_PTR_ENCODE(pkt,
729 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
730 /*
731 * This is the case when the packet is coming in from
732 * compat-netif. This packet only has valid metadata
733 * and an attached mbuf. We need to copy enough data
734 * from the mbuf to the packet buffer for the
735 * classifier. Compat netif packet pool is configured
736 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
737 * which is just enough to hold the protocol headers
738 * for the flowswitch classifier.
739 */
740
741 pkt->pkt_headroom = 0;
742 METADATA_ADJUST_LEN(pkt, 0, 0);
743 /*
744 * Copy the initial 128 bytes of the packet for
745 * classification.
746  * Ethernet (14) + IPv6 header (40) +
747  * IPv6 fragment header (8) +
748  * TCP header with options (60) = 122 bytes.
749 */
750 fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
751 pkt->pkt_headroom, pkt->pkt_mbuf, 0,
752 MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
753 FALSE, 0);
754
755 int err = __packet_finalize_with_mbuf(pkt);
756 VERIFY(err == 0);
757 }
758
759 static struct __kern_packet *
760 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
761 {
762 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
763
764 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
765 rx_prepare_packet_mbuf(fsw, pkt);
766 }
767
768 return pkt;
769 }
770
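/*
 * Find the flow entry for a classified packet.  The previous entry, when it
 * carries a full 5-tuple key, is tried first as a fast path before falling
 * back to the flow manager lookup.  Parent/child entries are then resolved:
 * on Rx by demuxing into the matching child flow, on Tx by matching the
 * packet's flow ID against the parent's child table.
 */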
771 static struct flow_entry *
772 lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
773 bool input, struct flow_entry *prev_fe)
774 {
775 struct flow_key key __sk_aligned(16);
776 struct flow_entry *__single fe = NULL;
777
778 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
779 flow_pkt2key(pkt, input, &key);
780
781 if (__probable(prev_fe != NULL &&
782 prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
783 uint16_t saved_mask = key.fk_mask;
784 key.fk_mask = FKMASK_5TUPLE;
785 if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
786 flow_entry_retain(prev_fe);
787 fe = prev_fe;
788 } else {
789 key.fk_mask = saved_mask;
790 }
791 }
792
793 top:
794 if (__improbable(fe == NULL)) {
795 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
796 }
797
798 if (__improbable(fe != NULL &&
799 (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
800 /* Rx */
801 if (input) {
802 if (fe->fe_flags & FLOWENTF_PARENT) {
803 struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
804 if (child_fe != NULL) {
805 flow_entry_release(&fe);
806 fe = child_fe;
807 }
808 } else {
809 if (!rx_flow_demux_match(fsw, fe, pkt)) {
810 flow_entry_release(&fe);
811 fe = NULL;
812 goto top;
813 }
814 }
815 } else {
816 /* Tx */
817 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
818 if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
819 struct flow_entry *__single parent_fe = fe;
820 fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
821 flow_entry_release(&parent_fe);
822 } else {
823 flow_entry_release(&fe);
824 fe = NULL;
825 goto top;
826 }
827 }
828 }
829 }
830
831 SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
832 SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
833 "%s %s %s \"%s\" fe %p",
834 input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
835 sk_proc_name(current_proc()),
836 fk2str(&key, fkbuf, sizeof(fkbuf)), SK_KVA(fe));
837
838 return fe;
839 }
840
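/*
 * Decide whether a packet that matched only a 2-tuple listener flow is
 * really destined to this host: loopback, link-local, multicast, broadcast
 * and locally assigned addresses qualify (with an exception for addresses
 * owned by strict tunnel interfaces while forwarding is disabled); anything
 * else is likely being forwarded and is not handed to the listener.
 */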
841 SK_NO_INLINE_ATTRIBUTE
842 static bool
843 pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
844 {
845 struct nx_flowswitch *fsw = fe->fe_fsw;
846 struct ifnet *ifp = fsw->fsw_ifp;
847 struct in_ifaddr *ia = NULL;
848 struct in_ifaddr *best_ia = NULL;
849 struct in6_ifaddr *ia6 = NULL;
850 struct in6_ifaddr *best_ia6 = NULL;
851 struct ifnet *match_ifp = NULL;
852 struct __flow *flow = pkt->pkt_flow;
853 bool result = false;
854
855 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
856
857 if (flow->flow_ip_ver == IPVERSION) {
858 if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
859 IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
860 IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
861 IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
862 IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
863 IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
864 INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
865 result = true;
866 goto done;
867 }
868
869 /*
870 * Check for a match in the hash bucket.
871 */
872 lck_rw_lock_shared(&in_ifaddr_rwlock);
873 TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
874 if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
875 best_ia = ia;
876 match_ifp = ia->ia_ifp;
877
878 if (match_ifp == ifp) {
879 break;
880 }
881 /*
882  * Continue the loop in case there's an exact match with another
883 * interface
884 */
885 }
886 }
887
888 if (best_ia != NULL) {
889 if (match_ifp != ifp && ipforwarding == 0 &&
890 (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
891 match_ifp->if_family == IFNET_FAMILY_UTUN)) {
892 /*
893 * Drop when interface address check is strict and forwarding
894 * is disabled
895 */
896 } else {
897 lck_rw_done(&in_ifaddr_rwlock);
898 result = true;
899 goto done;
900 }
901 }
902 lck_rw_done(&in_ifaddr_rwlock);
903
904 if (ifp->if_flags & IFF_BROADCAST) {
905 /*
906 * Check for broadcast addresses.
907 *
908 * Only accept broadcast packets that arrive via the matching
909 * interface. Reception of forwarded directed broadcasts would be
910 * handled via ip_forward() and ether_frameout() with the loopback
911 * into the stack for SIMPLEX interfaces handled by ether_frameout().
912 */
913 struct ifaddr *ifa;
914
915 ifnet_lock_shared(ifp);
916 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
917 if (ifa->ifa_addr->sa_family != AF_INET) {
918 continue;
919 }
920 ia = ifatoia(ifa);
921 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
922 ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
923 ifnet_lock_done(ifp);
924 result = true;
925 goto done;
926 }
927 }
928 ifnet_lock_done(ifp);
929 }
930 } else {
931 struct in6_ifaddrhashhead *ia6_hash_head;
932
933 if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
934 IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
935 IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
936 result = true;
937 goto done;
938 }
939
940 /*
941 * Check for exact addresses in the hash bucket.
942 */
943 lck_rw_lock_shared(&in6_ifaddr_rwlock);
944 /* XXX -fbounds-safety: external dependency on ip6_input.c */
945 ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
946 in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
947 ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];
948
949 TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
950 if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
951 ia6->ia_ifp->if_index, ifp->if_index)) {
952 if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
953 continue;
954 }
955 best_ia6 = ia6;
956 if (ia6->ia_ifp == ifp) {
957 break;
958 }
959 /*
960  * Continue the loop in case there's an exact match with another
961 * interface
962 */
963 }
964 }
965 if (best_ia6 != NULL) {
966 if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
967 (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
968 best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
969 /*
970 * Drop when interface address check is strict and forwarding
971 * is disabled
972 */
973 } else {
974 lck_rw_done(&in6_ifaddr_rwlock);
975 result = true;
976 goto done;
977 }
978 }
979 lck_rw_done(&in6_ifaddr_rwlock);
980 }
981
982 /*
983 * In forwarding mode, if the destination address
984 * of the packet does not match any interface
985  * address, it may be destined to the client device
986 */
987 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
988 "Rx flow does not match interface address");
989 done:
990 return result;
991 }
992
993 static struct flow_entry *
994 rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
995 struct flow_entry *prev_fe)
996 {
997 struct flow_entry *__single fe;
998
999 fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
1000 _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
1001 if (fe == NULL) {
1002 FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
1003 return NULL;
1004 }
1005
1006 if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
1007 fe->fe_flags & FLOWENTF_LISTENER) &&
1008 !pkt_is_for_listener(fe, pkt)) {
1009 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
1010 flow_entry_release(&fe);
1011 return NULL;
1012 }
1013
1014 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
1015 FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
1016 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1017 "Rx flow torn down");
1018 flow_entry_release(&fe);
1019 return NULL;
1020 }
1021
1022 if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
1023 FSW_STATS_INC(FSW_STATS_RX_DISABLED);
1024 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1025 "Rx not allowed for this flow");
1026 flow_entry_release(&fe);
1027 }
1028 return fe;
1029 }
1030
1031 static inline void
1032 rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
1033 struct __kern_packet *pkt, uint64_t tid)
1034 {
1035 /*
1036 * Among threads working on the same fe, the first thread that reaches here
1037 * will be responsible for processing all the packets until a point when
1038 * it does not see new packets in fe_rx_pktq. Other threads only
1039 * enqueue their packets but do not add the flow entry to their flow entry list.
1040 */
1041 lck_mtx_lock(&fe->fe_rx_pktq_lock);
1042
1043 if (fe->fe_rx_worker_tid == 0) {
1044 fe->fe_rx_worker_tid = tid;
1045 } else if (__improbable(fe->fe_rx_worker_tid != tid)) {
1046 STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
1047 }
1048
1049 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1050 fe->fe_rx_frag_count++;
1051 }
1052
1053 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
1054 /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
1055 if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
1056 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
1057 TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
1058 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1059 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1060 } else {
1061 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1062 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1063 flow_entry_release(&fe);
1064 }
1065 }
1066
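/*
 * Queue a Tx packet onto its flow entry's fe_tx_pktq.  The first packet for
 * an empty queue also links the flow entry onto the caller's batch list and
 * keeps the lookup reference; subsequent packets just enqueue and release
 * their extra reference.
 */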
1067 static void
1068 tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
1069 struct __kern_packet *pkt)
1070 {
1071 /* record frag continuation */
1072 if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
1073 ASSERT(pkt->pkt_flow_ip_is_frag);
1074 fe->fe_tx_is_cont_frag = true;
1075 fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
1076 } else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
1077 fe->fe_tx_is_cont_frag = false;
1078 fe->fe_tx_frag_id = 0;
1079 }
1080
1081 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
1082 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
1083 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
1084 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1085 } else {
1086 ASSERT(!TAILQ_EMPTY(fes));
1087 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1088 flow_entry_release(&fe);
1089 }
1090 }
1091
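/*
 * Pull up to n_pkts_max packets off the ring slots in [khead, rhead),
 * detaching them from their slots.  Packets marked QUM_F_DROPPED or with a
 * zero length are dropped here; the rest are collected into pktq and their
 * total byte count reported via n_bytes.
 */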
1092 static inline void
1093 fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
1094 uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
1095 {
1096 uint32_t n_pkts = 0;
1097 slot_idx_t idx, idx_end;
1098 idx = r->ckr_khead;
1099 idx_end = r->ckr_rhead;
1100
1101 ASSERT(KPKTQ_EMPTY(pktq));
1102 *n_bytes = 0;
1103 for (; n_pkts < n_pkts_max && idx != idx_end;
1104 idx = SLOT_NEXT(idx, r->ckr_lim)) {
1105 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1106 struct __kern_packet *pkt = ksd->sd_pkt;
1107
1108 ASSERT(pkt->pkt_nextpkt == NULL);
1109 KR_SLOT_DETACH_METADATA(r, ksd);
1110
1111 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1112 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1113 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1114 || (pkt->pkt_length == 0)) {
1115 FSW_STATS_INC(FSW_STATS_DROP);
1116 pp_free_packet_single(pkt);
1117 continue;
1118 }
1119 n_pkts++;
1120 *n_bytes += pkt->pkt_length;
1121
1122 KPKTQ_ENQUEUE(pktq, pkt);
1123 }
1124 r->ckr_khead = idx;
1125 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1126 }
1127
1128 /*
1129  * This only estimates how many packets each GSO packet will be segmented into.
1130 * The number does not need to be exact because any leftover packets allocated
1131 * will be freed.
1132 */
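/*
 * For example, an IPv4 TSO packet with pkt_length = 28040, total_hlen = 40
 * (20-byte IP + 20-byte TCP) and an MSS of 1400 estimates
 * SK_ROUNDUP(28000, 1400) / 1400 = 20 packets.
 */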
1133 static uint32_t
1134 estimate_gso_pkts(struct __kern_packet *pkt)
1135 {
1136 packet_tso_flags_t tso_flags;
1137 uint16_t mss;
1138 uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1139
1140 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1141 mss = pkt->pkt_proto_seg_sz;
1142
1143 if (tso_flags == PACKET_TSO_IPV4) {
1144 total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1145 } else if (tso_flags == PACKET_TSO_IPV6) {
1146 total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1147 }
1148 if (total_hlen != 0 && mss != 0) {
1149 total_len = pkt->pkt_length;
1150 n_pkts = (uint32_t)
1151 (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1152 }
1153 DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1154 uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1155 uint32_t, n_pkts);
1156 return n_pkts;
1157 }
1158
1159 /*
1160 * This function retrieves a chain of packets of the same type only
1161 * (GSO or non-GSO).
1162 */
1163 static inline void
1164 fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1165 struct __kern_channel_ring *r, uint32_t n_pkts_max,
1166 struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1167 {
1168 uint32_t n_pkts = 0;
1169 slot_idx_t idx, idx_end;
1170 idx = r->ckr_khead;
1171 idx_end = r->ckr_rhead;
1172 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1173 boolean_t gso_enabled, gso_required;
1174 uint32_t gso_pkts;
1175
1176 gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1177 ASSERT(KPKTQ_EMPTY(pktq));
1178 *n_bytes = 0;
1179 for (; n_pkts < n_pkts_max &&
1180 (!gso_enabled || fsw_gso_batch == 0 ||
1181 *gso_pkts_estimate < fsw_gso_batch) &&
1182 idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1183 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1184 struct __kern_packet *pkt = ksd->sd_pkt;
1185
1186 ASSERT(pkt->pkt_nextpkt == NULL);
1187
1188 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1189 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1190 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1191 || (pkt->pkt_length == 0)) {
1192 KR_SLOT_DETACH_METADATA(r, ksd);
1193 FSW_STATS_INC(FSW_STATS_DROP);
1194 pp_free_packet_single(pkt);
1195 continue;
1196 }
1197 if (gso_enabled) {
1198 gso_pkts = estimate_gso_pkts(pkt);
1199
1200 /*
1201 * We use the first packet to determine what
1202 * type the subsequent ones need to be (GSO or
1203 * non-GSO).
1204 */
1205 if (n_pkts == 0) {
1206 gso_required = (gso_pkts != 0);
1207 } else {
1208 if (gso_required != (gso_pkts != 0)) {
1209 break;
1210 }
1211 }
1212 *gso_pkts_estimate += gso_pkts;
1213 }
1214 KR_SLOT_DETACH_METADATA(r, ksd);
1215 if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1216 __packet_set_tx_nx_port(SK_PKT2PH(pkt),
1217 vpna->vpna_nx_port, vpna->vpna_gencnt);
1218 }
1219 n_pkts++;
1220 *n_bytes += pkt->pkt_length;
1221 KPKTQ_ENQUEUE(pktq, pkt);
1222 }
1223 r->ckr_khead = idx;
1224 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1225 DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1226 ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1227 uint32_t, *gso_pkts_estimate);
1228 }
1229
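/*
 * Attach as many packets from pktq as there are free slots in the
 * destination Rx ring, finalize each quantum, publish the new tail after a
 * memory fence, and notify the channel.  Packets that do not fit remain in
 * pktq for the caller (see fsw_ring_enqueue_tail_drop()).
 */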
1230 static void
1231 fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
1232 struct pktq *pktq)
1233 {
1234 #pragma unused(fsw)
1235 struct __kern_packet *pkt;
1236 struct __kern_quantum *kqum;
1237 uint32_t kr_space_avail = 0;
1238 uint32_t n, n_pkts = 0, n_bytes = 0;
1239 slot_idx_t idx = 0, idx_start = 0, idx_end = 0;
1240
1241 kr_enter(r, TRUE);
1242
1243 idx_start = r->ckr_ktail;
1244 kr_space_avail = kr_available_slots_rxring(r);
1245 _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
1246 n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
1247 _FSW_INJECT_ERROR(41, n, 0, null_func);
1248 idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1249
1250 idx = idx_start;
1251 while (idx != idx_end) {
1252 KPKTQ_DEQUEUE(pktq, pkt);
1253 kqum = SK_PTR_ADDR_KQUM(pkt);
1254 kqum->qum_qflags |= QUM_F_FINALIZED;
1255 n_pkts++;
1256 n_bytes += pkt->pkt_length;
1257 KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1258 if (__improbable(pkt->pkt_trace_id != 0)) {
1259 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
1260 KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
1261 }
1262 idx = SLOT_NEXT(idx, r->ckr_lim);
1263 }
1264
1265 kr_update_stats(r, n_pkts, n_bytes);
1266
1267 /*
1268 * ensure slot attachments are visible before updating the
1269 * tail pointer
1270 */
1271 os_atomic_thread_fence(seq_cst);
1272
1273 r->ckr_ktail = idx_end;
1274
1275 kr_exit(r);
1276
1277 r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);
1278
1279 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
1280 r->ckr_name, n_pkts);
1281 }
1282
1283 static void
1284 pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1285 {
1286 ASSERT(KPKTQ_EMPTY(pktq));
1287
1288 for (uint32_t i = 0; i < n_pkts; i++) {
1289 struct __kern_packet *__single pkt = pkts[i];
1290 ASSERT(pkt->pkt_nextpkt == NULL);
1291 KPKTQ_ENQUEUE(pktq, pkt);
1292 }
1293 }
1294
1295 /*
1296 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1297 */
1298 SK_NO_INLINE_ATTRIBUTE
1299 static void
1300 convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1301 struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1302 {
1303 uint32_t tot_cnt;
1304 unsigned int num_segs = 1;
1305 struct mbuf *__single mhead, *__single head = NULL;
1306 struct mbuf *__single tail = NULL, **__single tailp = &head;
1307 uint32_t mhead_cnt, mhead_bufsize;
1308 uint32_t mhead_waste = 0;
1309 uint32_t mcnt = 0, mbytes = 0;
1310 uint32_t largest, max_pkt_len;
1311 struct __kern_packet *__single pkt;
1312 struct kern_pbufpool *pp;
1313
1314 tot_cnt = KPKTQ_LEN(pktq);
1315 ASSERT(tot_cnt > 0);
1316 mhead_cnt = tot_cnt;
1317
1318 /*
1319 * Opportunistically batch-allocate the mbufs based on the largest
1320 * packet size we've seen in the recent past. Note that we reset
1321  * fsw_rx_largest_size below if we notice that we're under-utilizing the
1322 * allocated buffers (thus disabling this batch allocation).
1323 */
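/*
 * The allocation ladder below tries regular clusters (MCLBYTES, 2 KB), big
 * clusters (MBIGCLBYTES, 4 KB), jumbo clusters (M16KCLBYTES, 16 KB), and
 * finally two-cluster 32 KB chains before giving up on batching.
 */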
1324 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1325 if (__probable(largest != 0)) {
1326 if (largest <= MCLBYTES) {
1327 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1328 &num_segs, M_NOWAIT, 1, 0);
1329 mhead_bufsize = MCLBYTES;
1330 } else if (largest <= MBIGCLBYTES) {
1331 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1332 &num_segs, M_NOWAIT, 1, 0);
1333 mhead_bufsize = MBIGCLBYTES;
1334 } else if (largest <= M16KCLBYTES) {
1335 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1336 &num_segs, M_NOWAIT, 1, 0);
1337 mhead_bufsize = M16KCLBYTES;
1338 } else if (largest <= M16KCLBYTES * 2) {
1339 num_segs = 2;
1340 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1341 &num_segs, M_NOWAIT, 1, 0);
1342 mhead_bufsize = M16KCLBYTES * 2;
1343 } else {
1344 mhead = NULL;
1345 mhead_bufsize = mhead_cnt = 0;
1346 }
1347 } else {
1348 mhead = NULL;
1349 mhead_bufsize = mhead_cnt = 0;
1350 }
1351 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1352 uint32_t, mhead_cnt, uint32_t, tot_cnt);
1353
1354 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1355 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1356
1357 KPKTQ_FOREACH(pkt, pktq) {
1358 uint32_t tot_len, len;
1359 uint16_t pad, llhlen, iphlen;
1360 boolean_t do_cksum_rx;
1361 struct mbuf *__single m;
1362 int error;
1363
1364 llhlen = pkt->pkt_l2_len;
1365 len = pkt->pkt_length;
1366 if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
1367 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1368 struct __kern_packet *, pkt);
1369 FSW_STATS_INC(FSW_STATS_DROP);
1370 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1371 continue;
1372 }
1373 /* begin payload on 32-bit boundary; figure out the padding */
1374 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1375 tot_len = pad + len;
1376
1377 /* remember largest packet size */
1378 if (__improbable(largest < tot_len)) {
1379 largest = MAX(tot_len, MCLBYTES);
1380 }
1381
1382 /*
1383 * If the above batch allocation returned partial
1384 * success, we try a blocking allocation here again.
1385 */
1386 m = mhead;
1387 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1388 ASSERT(mhead != NULL || mhead_cnt == 0);
1389 num_segs = 1;
1390 if (tot_len > M16KCLBYTES) {
1391 num_segs = 0;
1392 }
1393 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1394 &num_segs, &m)) != 0) {
1395 DTRACE_SKYWALK2(bad__len,
1396 struct nx_flowswitch *, fsw,
1397 struct __kern_packet *, pkt);
1398 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1399 FSW_STATS_INC(FSW_STATS_DROP);
1400 continue;
1401 }
1402 } else {
1403 mhead = m->m_nextpkt;
1404 m->m_nextpkt = NULL;
1405 ASSERT(mhead_cnt != 0);
1406 --mhead_cnt;
1407
1408 /* check if we're underutilizing large buffers */
1409 if (__improbable(mhead_bufsize > MCLBYTES &&
1410 tot_len < (mhead_bufsize >> 1))) {
1411 ++mhead_waste;
1412 }
1413 /*
1414 * Clean up unused mbuf.
1415  * Only need to do this when we pre-alloc 2x16K mbufs
1416 */
1417 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1418 ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1419 struct mbuf *m_extra = m->m_next;
1420 ASSERT(m_extra != NULL);
1421 ASSERT(m_extra->m_len == 0);
1422 ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1423 m->m_next = NULL;
1424 m_freem(m_extra);
1425 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1426 }
1427 }
1428 m->m_data += pad;
1429 /*
1430 * XXX -fbounds-safety: external dependency
1431 * mtod does not work because m_len is 0
1432 */
1433 m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1434
1435 /* don't include IP header from partial sum */
1436 if (__probable((pkt->pkt_qum_qflags &
1437 QUM_F_FLOW_CLASSIFIED) != 0)) {
1438 iphlen = pkt->pkt_flow_ip_hlen;
1439 do_cksum_rx = sk_cksum_rx;
1440 } else {
1441 iphlen = 0;
1442 do_cksum_rx = FALSE;
1443 }
1444
1445 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1446 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1447 llhlen + iphlen);
1448
1449 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1450 if (do_cksum_rx) {
1451 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1452 }
1453 #if DEBUG || DEVELOPMENT
1454 if (__improbable(pkt_trailers > 0)) {
1455 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1456 }
1457 #endif /* DEBUG || DEVELOPMENT */
1458 m_adj(m, llhlen);
1459
1460 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1461 if (__improbable((pkt->pkt_link_flags &
1462 PKT_LINKF_ETHFCS) != 0)) {
1463 m->m_flags |= M_HASFCS;
1464 }
1465 if (__improbable((pkt->pkt_link_flags &
1466 PKT_LINKF_BCAST) != 0)) {
1467 m->m_flags |= M_BCAST;
1468 }
1469 if (__improbable((pkt->pkt_link_flags &
1470 PKT_LINKF_MCAST) != 0)) {
1471 m->m_flags |= M_MCAST;
1472 }
1473 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1474 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1475 }
1476 ASSERT(m->m_nextpkt == NULL);
1477 tail = m;
1478 *tailp = m;
1479 tailp = &m->m_nextpkt;
1480 mcnt++;
1481 mbytes += m_pktlen(m);
1482 }
1483 /* free any leftovers */
1484 if (__improbable(mhead != NULL)) {
1485 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1486 ASSERT(mhead_cnt != 0);
1487 (void) m_freem_list(mhead);
1488 mhead = NULL;
1489 mhead_cnt = 0;
1490 }
1491
1492 /* reset if most packets (>50%) are smaller than our batch buffers */
1493 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1494 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1495 struct flow_entry *, NULL, uint32_t, mhead_waste,
1496 uint32_t, tot_cnt);
1497 largest = 0;
1498 }
1499
1500 if (largest != fsw->fsw_rx_largest_size) {
1501 os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1502 }
1503
1504 pp_free_pktq(pktq);
1505 *m_headp = head;
1506 *m_tailp = tail;
1507 *cnt = mcnt;
1508 *bytes = mbytes;
1509 }
1510
1511 /*
1512 * This function only extracts the mbuf from the packet. The caller frees
1513 * the packet.
1514 */
1515 static inline struct mbuf *
1516 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1517 {
1518 struct mbuf *m;
1519 struct pkthdr *mhdr;
1520 uint16_t llhlen;
1521
1522 m = pkt->pkt_mbuf;
1523 ASSERT(m != NULL);
1524
1525 llhlen = pkt->pkt_l2_len;
1526 if (llhlen > pkt->pkt_length) {
1527 m_freem(m);
1528 KPKT_CLEAR_MBUF_DATA(pkt);
1529 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1530 struct __kern_packet *, pkt);
1531 FSW_STATS_INC(FSW_STATS_DROP);
1532 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1533 return NULL;
1534 }
1535 mhdr = &m->m_pkthdr;
1536 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1537 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1538 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1539 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1540 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1541 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1542 }
1543 #if DEBUG || DEVELOPMENT
1544 uint32_t extra = 0;
1545 if (__improbable(pkt_trailers > 0)) {
1546 extra = pkt_add_trailers_mbuf(m, llhlen);
1547 }
1548 #endif /* DEBUG || DEVELOPMENT */
1549 m_adj(m, llhlen);
1550 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1551 KPKT_CLEAR_MBUF_DATA(pkt);
1552 return m;
1553 }
1554
1555 SK_NO_INLINE_ATTRIBUTE
1556 static void
1557 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1558 struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1559 {
1560 struct __kern_packet *pkt;
1561 struct mbuf *__single m, *__single head = NULL;
1562 struct mbuf *__single tail = NULL, **__single tailp = &head;
1563 uint32_t c = 0, b = 0;
1564
1565 KPKTQ_FOREACH(pkt, pktq) {
1566 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1567 if (__improbable(m == NULL)) {
1568 continue;
1569 }
1570 tail = m;
1571 *tailp = m;
1572 tailp = &m->m_nextpkt;
1573 c++;
1574 b += m_pktlen(m);
1575 }
1576 pp_free_pktq(pktq);
1577 *m_head = head;
1578 *m_tail = tail;
1579 *cnt = c;
1580 *bytes = b;
1581 }
1582
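/*
 * Hand a batch of mbufs destined to the host stack to DLIL, passing the
 * per-batch packet and byte counts so the interface input stats are updated
 * in one shot.
 */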
1583 void
1584 fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq)
1585 {
1586 struct ifnet_stat_increment_param s;
1587
1588 if (mbufq_empty(host_mq)) {
1589 return;
1590 }
1591
1592 bzero(&s, sizeof(s));
1593 s.packets_in = host_mq->count;
1594 s.bytes_in = host_mq->bytes;
1595 dlil_input_handler(ifp, mbufq_first(host_mq), mbufq_last(host_mq), &s, FALSE, NULL);
1596 }
1597
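/*
 * Invoke the interface's registered flowswitch Rx callback, if any.  The
 * callback may consume packets from pktq; anything it leaves behind is
 * traced and stays in the queue for the caller.
 */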
1598 void
1599 fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq)
1600 {
1601 ifnet_fsw_rx_cb_t __single cb;
1602 void *__single cb_arg;
1603
1604 ASSERT(!KPKTQ_EMPTY(pktq));
1605 if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1606 ASSERT(cb != NULL);
1607 ASSERT(cb_arg != NULL);
1608 (*cb)(cb_arg, pktq);
1609 ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1610 if (KPKTQ_EMPTY(pktq)) {
1611 return;
1612 } else {
1613 DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1614 struct pktq *, pktq);
1615 }
1616 }
1617 }
1618
1619 void
1620 fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq,
1621 struct mbufq *host_mq)
1622 {
1623 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1624 uint32_t cnt = 0, bytes = 0;
1625 boolean_t compat;
1626
1627 if (KPKTQ_EMPTY(pktq)) {
1628 return;
1629 }
1630
1631 /* All packets in the pktq must have the same type */
1632 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1633 if (compat) {
1634 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1635 &bytes);
1636 } else {
1637 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1638 &bytes);
1639 }
1640 if (__improbable(m_head == NULL)) {
1641 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1642 return;
1643 }
1644
1645 mbufq_enqueue(host_mq, m_head, m_tail, cnt, bytes);
1646 }
1647
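/*
 * Enqueue onto the destination Rx ring and tail-drop whatever does not fit,
 * counting those packets in FSW_STATS_RX_DST_RING_FULL.  Also records the
 * enqueue timestamp used for Rx-stall detection.
 */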
1648 void
1649 fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
1650 struct __kern_channel_ring *r, struct pktq *pktq)
1651 {
1652 fsw_ring_enqueue_pktq(fsw, r, pktq);
1653 /*
1654 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
1655 * This is to ensure we use the timestamp of the earliest enqueue without
1656 * a dequeue.
1657 */
1658 if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
1659 r->ckr_rx_enqueue_ts = net_uptime();
1660 }
1661 FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1662 dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
1663 DROPTAP_FLAG_L2_MISSING);
1664 }
1665
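/*
 * Resolve the nexus adapter backing a flow's nexus port, returning NULL
 * if the port is the dev or host port, is no longer valid, has no active
 * adapter, or has been defuncted.
 */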
1666 static struct nexus_adapter *
1667 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1668 {
1669 struct kern_nexus *nx = fsw->fsw_nx;
1670 struct nexus_adapter *na = NULL;
1671 nexus_port_t port = fe->fe_nx_port;
1672
1673 if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1674 SK_ERR("dev or host ports have no NA");
1675 return NULL;
1676 }
1677
1678 if (__improbable(!nx_port_is_valid(nx, port))) {
1679 SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1680 if_name(fsw->fsw_ifp), port);
1681 return NULL;
1682 }
1683
1684 na = nx_port_get_na(nx, port);
1685 if (__improbable(na == NULL)) {
1686 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1687 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1688 if_name(fsw->fsw_ifp), port);
1689 return NULL;
1690 }
1691
1692 if (__improbable(!NA_IS_ACTIVE(na))) {
1693 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1694 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1695 if_name(fsw->fsw_ifp), port);
1696 return NULL;
1697 }
1698
1699 if (__improbable(nx_port_is_defunct(nx, port))) {
1700 FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1701 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1702 if_name(fsw->fsw_ifp), port);
1703 return NULL;
1704 }
1705
1706 return na;
1707 }
1708
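/*
 * Return ring 0 of the flow's user channel (Tx or Rx), or NULL if the
 * adapter cannot be resolved or the ring is in drop mode.
 */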
1709 static inline struct __kern_channel_ring *
1710 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1711 {
1712 struct nexus_vp_adapter *na = NULL;
1713 struct __kern_channel_ring *__single r = NULL;
1714
1715 na = VPNA(flow_get_na(fsw, fe));
1716 if (__improbable(na == NULL)) {
1717 return NULL;
1718 }
1719
1720 switch (txrx) {
1721 case NR_RX:
1722 r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1723 break;
1724 case NR_TX:
1725 r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1726 break;
1727 default:
1728 __builtin_unreachable();
1729 VERIFY(0);
1730 }
1731
1732 if (__improbable(KR_DROP(r))) {
1733 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1734 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %p %s drop mode",
1735 SK_KVA(r), r->ckr_name);
1736 return NULL;
1737 }
1738
1739 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1740
1741 #if (DEVELOPMENT || DEBUG)
1742 if (r != NULL) {
1743 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1744 }
1745 #endif /* DEVELOPMENT || DEBUG */
1746
1747 return r;
1748 }
1749
1750 struct __kern_channel_ring *
1751 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1752 {
1753 return flow_get_ring(fsw, fe, NR_RX);
1754 }
1755
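/*
 * Validate the flow's local address against the interface address
 * generation count. If the source address has gone away, request
 * nonviable finalization (and route reconfiguration for connected
 * flows); the return value tells the caller whether it may continue
 * processing packets for this flow.
 */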
1756 static bool
1757 dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1758 {
1759 struct flow_route *fr = fe->fe_route;
1760 struct ifnet *ifp = fsw->fsw_ifp;
1761
1762 if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1763 !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1764 fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1765 !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1766 /*
1767 * The source address is no longer around; we want this
1768 * flow to be nonviable, but that requires holding the lock
1769 * as writer (which isn't the case now). Indicate that
1770 * we need to finalize the nonviable state below.
1771 *
1772 * We also request that the flow route be re-configured,
1773 * if this is a connected-mode flow.
1774 *
1775 */
1776 if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1777 /*
1778 * fsw_pending_nonviable is a hint for the reaper thread;
1779 * because setting fe_want_nonviable and incrementing the
1780 * fsw_pending_nonviable counter is not atomic, let the
1781 * increment happen first, and have the thread that loses
1782 * the CAS do the decrement.
1783 */
1784 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1785 if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1786 fsw_reap_sched(fsw);
1787 } else {
1788 os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1789 }
1790 }
1791 if (fr != NULL) {
1792 os_atomic_inc(&fr->fr_want_configure, relaxed);
1793 }
1794 }
1795
1796 /* if flow was (or is going to be) marked as nonviable, drop it */
1797 if (__improbable(fe->fe_want_nonviable ||
1798 (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1799 SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow %p non-viable",
1800 SK_KVA(fe));
1801 return false;
1802 }
1803 return true;
1804 }
1805
1806 bool
1807 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1808 {
1809 bool okay;
1810 okay = dp_flow_route_process(fsw, fe);
1811 #if (DEVELOPMENT || DEBUG)
1812 if (okay) {
1813 _FSW_INJECT_ERROR(5, okay, false, null_func);
1814 }
1815 #endif /* DEVELOPMENT || DEBUG */
1816
1817 return okay;
1818 }
1819
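/*
 * Per-flow Rx processing: validate the route, divert host-port flows to
 * the mbuf path, locate the destination ring, batch-allocate destination
 * packets and buflets, run flow tracking, copy packets from the device
 * pool into the destination pool when the pools differ, and finally
 * enqueue the transferred packets onto the destination ring (tail-drop
 * on overflow).
 */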
1820 void
1821 dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1822 struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq,
1823 uint32_t flags)
1824 {
1825 #pragma unused(flags)
1826 struct pktq dpkts; /* dst pool alloc'ed packets */
1827 struct pktq disposed_pkts; /* done src packets */
1828 struct pktq dropped_pkts; /* dropped src packets */
1829 struct pktq transferred_pkts; /* dst packet ready for ring */
1830 struct __kern_packet *pkt, *tpkt;
1831 struct kern_pbufpool *dpp;
1832 uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
1833 uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1834 uint16_t buf_array_iter = 0;
1835 uint32_t cnt, buf_cnt = 0;
1836 int err;
1837 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
1838 uint16_t line = 0;
1839
1840 KPKTQ_INIT(&dpkts);
1841 KPKTQ_INIT(&dropped_pkts);
1842 KPKTQ_INIT(&disposed_pkts);
1843 KPKTQ_INIT(&transferred_pkts);
1844
1845 if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1846 SK_ERR("Rx route bad");
1847 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1848 FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1849 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
1850 line = __LINE__;
1851 goto done;
1852 }
1853
1854 if (fe->fe_nx_port == FSW_VP_HOST) {
1855 /*
1856 * The host ring does not exist anymore, so we can't take
1857 * the enqueue path below. This path should only be hit
1858 * for the rare TCP fragmentation case.
1859 */
1860
1861 fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq);
1862 return;
1863 }
1864
1865 /* find the ring */
1866 struct __kern_channel_ring *r;
1867 r = fsw_flow_get_rx_ring(fsw, fe);
1868 if (__improbable(r == NULL)) {
1869 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1870 reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
1871 line = __LINE__;
1872 goto done;
1873 }
1874
1875 /* snoop before L2 is stripped */
1876 if (__improbable(pktap_total_tap_count != 0)) {
1877 fsw_snoop(fsw, fe, rx_pkts, true);
1878 }
1879
1880 dpp = r->ckr_pp;
1881 /* batch allocate enough packets */
1882 err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1883 SKMEM_NOSLEEP);
1884 if (__improbable(err == ENOMEM)) {
1885 ASSERT(KPKTQ_EMPTY(&dpkts));
1886 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1887 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1888 SK_ERR("failed to alloc %u pkts for kr %s, %p", n_pkts,
1889 r->ckr_name, SK_KVA(r));
1890 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1891 line = __LINE__;
1892 goto done;
1893 }
1894
1895 /*
1896 * estimate total number of buflets for the packet chain.
1897 */
1898 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1899 if (cnt > n_pkts) {
1900 ASSERT(dpp->pp_max_frags > 1);
1901 cnt -= n_pkts;
1902 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1903 err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1904 SKMEM_NOSLEEP, false);
1905 if (__improbable(buf_cnt == 0)) {
1906 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1907 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1908 SK_ERR("failed to alloc %d buflets (err %d) for kr %s %p",
1909 cnt, err, r->ckr_name, SK_KVA(r));
1910 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1911 line = __LINE__;
1912 goto done;
1913 }
1914 err = 0;
1915 }
1916
1917 /* extra processing for user flow */
1918 KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1919 err = 0;
1920 KPKTQ_REMOVE(rx_pkts, pkt);
1921 if (rx_bytes > pkt->pkt_flow_ulen) {
1922 rx_bytes -= pkt->pkt_flow_ulen;
1923 } else {
1924 rx_bytes = 0;
1925 }
1926 err = flow_pkt_track(fe, pkt, true);
1927 _FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1928 if (__improbable(err != 0)) {
1929 SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1930 FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1931 /* check whether we need to trigger an RST */
1932 if (err == ENETRESET) {
1933 flow_track_abort_tcp(fe, pkt, NULL);
1934 }
1935 dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
1936 DROPTAP_FLAG_L2_MISSING);
1937 continue;
1938 }
1939
1940 /* transfer to dpkt */
1941 if (pkt->pkt_qum.qum_pp != dpp) {
1942 struct __kern_buflet *bprev, *bnew;
1943 struct __kern_packet *dpkt = NULL;
1944 uint32_t n_bufs, i;
1945
1946 KPKTQ_DEQUEUE(&dpkts, dpkt);
1947 /* XXX Why would dpkt be NULL at this point? */
1948 if (__improbable(dpkt == NULL)) {
1949 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1950 dp_drop_pkt_single(fsw, pkt, 0,
1951 DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
1952 continue;
1953 }
1954 n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
1955 n_bufs--;
1956 for (i = 0; i < n_bufs; i++) {
1957 if (__improbable(buf_cnt == 0)) {
1958 ASSERT(dpp->pp_max_frags > 1);
1959 buf_array_iter = 0;
1960 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1961 n_pkts = KPKTQ_LEN(rx_pkts);
1962 if (cnt >= n_pkts) {
1963 cnt -= n_pkts;
1964 } else {
1965 cnt = 0;
1966 }
1967 cnt += (n_bufs - i);
1968 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1969 cnt);
1970 cnt = buf_cnt;
1971 err = pp_alloc_buflet_batch(dpp,
1972 buf_array, &buf_cnt,
1973 SKMEM_NOSLEEP, false);
1974 if (__improbable(buf_cnt == 0)) {
1975 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1976 dp_drop_pkt_single(fsw, pkt, 0,
1977 DROP_REASON_FSW_PP_ALLOC_FAILED,
1978 DROPTAP_FLAG_L2_MISSING);
1979 pkt = NULL;
1980 pp_free_packet_single(dpkt);
1981 dpkt = NULL;
1982 SK_ERR("failed to alloc %d "
1983 "buflets (err %d) for "
1984 "kr %s, %p", cnt, err,
1985 r->ckr_name, SK_KVA(r));
1986 break;
1987 }
1988 err = 0;
1989 }
1990 ASSERT(buf_cnt != 0);
1991 if (i == 0) {
1992 PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1993 }
1994 /*
1995 * XXX -fbounds-safety: can't avoid using forge
1996 * unless we change the signature of
1997 * pp_alloc_buflet_batch().
1998 */
1999 bnew = __unsafe_forge_single(kern_buflet_t,
2000 buf_array[buf_array_iter]);
2001 buf_array[buf_array_iter] = 0;
2002 buf_array_iter++;
2003 buf_cnt--;
2004 VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
2005 bprev, bnew) == 0);
2006 bprev = bnew;
2007 }
2008 if (__improbable(err != 0)) {
2009 continue;
2010 }
2011 err = copy_packet_from_dev(fsw, pkt, dpkt);
2012 _FSW_INJECT_ERROR(43, err, EINVAL, null_func);
2013 if (__improbable(err != 0)) {
2014 SK_ERR("copy packet failed (err %d)", err);
2015 dp_drop_pkt_single(fsw, pkt, 0,
2016 DROP_REASON_FSW_PKT_COPY_FAILED,
2017 DROPTAP_FLAG_L2_MISSING);
2018 pp_free_packet_single(dpkt);
2019 dpkt = NULL;
2020 continue;
2021 }
2022 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2023 pkt = dpkt;
2024 }
2025 _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
2026 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
2027 pkt->pkt_policy_id = fe->fe_policy_id;
2028 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
2029 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
2030 if (pkt->pkt_bufs_cnt > 1) {
2031 pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
2032 }
2033 KPKTQ_ENQUEUE(&transferred_pkts, pkt);
2034 }
2035 KPKTQ_FINI(rx_pkts);
2036
2037 if (KPKTQ_LEN(&transferred_pkts) > 0) {
2038 fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
2039 }
2040 KPKTQ_FINI(&transferred_pkts);
2041
2042 done:
2043 /* Free unused buflets */
2044 while (buf_cnt > 0) {
2045 /*
2046 * XXX -fbounds-safety: can't avoid using forge unless we change
2047 * the signature of pp_alloc_buflet_batch().
2048 */
2049 pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
2050 (kern_buflet_t)(buf_array[buf_array_iter])));
2051 buf_array[buf_array_iter] = 0;
2052 buf_array_iter++;
2053 buf_cnt--;
2054 }
2055 dp_free_pktq(fsw, &dpkts);
2056 dp_free_pktq(fsw, &disposed_pkts);
2057 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
2058 }
2059
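/*
 * Drain a flow's Rx queue under its lock and feed the packets, batch by
 * batch, to the flow's fe_rx_process handler until the queue is empty,
 * at which point the flow is removed from the per-call flow list.
 */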
2060 static inline void
2061 rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
2062 struct flow_entry_list *fes, struct mbufq *host_mq)
2063 {
2064 struct pktq rx_pkts;
2065 uint32_t rx_bytes;
2066 uint32_t rx_proc_flags;
2067
2068 ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
2069 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
2070
2071 KPKTQ_INIT(&rx_pkts);
2072 for (;;) {
2073 lck_mtx_lock(&fe->fe_rx_pktq_lock);
2074 if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
2075 fe->fe_rx_worker_tid = 0;
2076 TAILQ_REMOVE(fes, fe, fe_rx_link);
2077 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2078 break;
2079 }
2080 KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
2081 KPKTQ_DISPOSE(&fe->fe_rx_pktq);
2082 rx_bytes = fe->fe_rx_pktq_bytes;
2083 rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
2084 fe->fe_rx_pktq_bytes = 0;
2085 fe->fe_rx_frag_count = 0;
2086 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2087 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
2088 KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
2089 /* flow related processing (default, agg, fpd, etc.) */
2090 fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, host_mq, rx_proc_flags);
2091 }
2092 ASSERT(KPKTQ_EMPTY(&rx_pkts));
2093
2094 if (__improbable(fe->fe_want_withdraw)) {
2095 fsw_reap_sched(fsw);
2096 }
2097 }
2098
2099 static void
2100 dp_rx_process_low_power_wake(struct nx_flowswitch *fsw, struct flow_entry *fe)
2101 {
2102 if (fe->fe_port_reservation == NULL || (fe->fe_flags & FLOWENTF_EXTRL_PORT) != 0) {
2103 return;
2104 }
2105 if (fe->fe_key.fk_proto == IPPROTO_TCP && (fe->fe_flags & FLOWENTF_CONNECTION_IDLE)) {
2106 os_log(wake_packet_log_handle, "dp_rx_process_low_power_wake LPW TCP connection idle");
2107
2108 if (flow_track_tcp_want_abort(fe)) {
2109 os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY | FLOWENTF_WAIT_CLOSE, relaxed);
2110 fe->fe_want_withdraw = 1;
2111 flow_track_abort_tcp(fe, NULL, NULL);
2112 }
2113 } else {
2114 if_exit_lpw(fsw->fsw_ifp, "dp_rx_process_low_power_wake LPW connection not idle");
2115 }
2116 }
2117
2118 static inline void
2119 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
2120 {
2121 /*
2122 * We only care about wake packets of flows that belong to the flowswitch,
2123 * as wake packets for the host stack are handled by the host input
2124 * function.
2125 */
2126
2127 #if (DEBUG || DEVELOPMENT)
2128 /* For testing only */
2129 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
2130 if (check_wake_pkt(fsw->fsw_ifp, pkt) == true) {
2131 /*
2132 * This is a one shot command
2133 */
2134 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
2135
2136 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
2137 }
2138 }
2139 #endif /* (DEBUG || DEVELOPMENT) */
2140
2141 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2142 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
2143
2144 /*
2145 * When a packet is received in LPW mode for an idle TCP connection, the
2146 * connection is aborted immediately with an RST so that the peer drops the connection at once.
2147 */
2148 if (if_is_lpw_enabled(fsw->fsw_ifp)) {
2149 pkt->pkt_pflags |= __PKT_F_LPW;
2150 dp_rx_process_low_power_wake(fsw, fe);
2151 }
2152 }
2153 }
2154
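/*
 * Main Rx path: demux each packet, classify its flow, handle IP
 * fragments, look up the flow entry (using the previous entry as a
 * lookup hint), and batch packets per flow. Packets that cannot be
 * demuxed, classified, or matched to a flow are steered to the host
 * stack; each batched flow is then processed and host-bound mbufs are
 * handed up to DLIL at the end.
 */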
2155 static void
2156 _fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2157 {
2158 struct __kern_packet *__single pkt, *__single tpkt;
2159 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2160 struct flow_entry *__single fe, *__single prev_fe;
2161 sa_family_t af;
2162 struct pktq host_pkts, dropped_pkts;
2163 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
2164 uint16_t line = 0;
2165 int err;
2166 uint64_t thread_id;
2167 struct mbufq host_mq;
2168 struct ifnet *ifp;
2169
2170 mbufq_init(&host_mq);
2171 KPKTQ_INIT(&host_pkts);
2172 KPKTQ_INIT(&dropped_pkts);
2173
2174 FSW_RLOCK(fsw);
2175
2176 if (__improbable(FSW_QUIESCED(fsw))) {
2177 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
2178 KPKTQ_CONCAT(&dropped_pkts, pktq);
2179 reason = DROP_REASON_FSW_QUIESCED;
2180 line = __LINE__;
2181 goto done;
2182 }
2183 if (__improbable(fsw->fsw_demux == NULL)) {
2184 KPKTQ_CONCAT(&dropped_pkts, pktq);
2185 reason = DROP_REASON_FSW_DEMUX_FAILED;
2186 line = __LINE__;
2187 goto done;
2188 }
2189
2190 ifp = fsw->fsw_ifp;
2191 thread_id = thread_tid(current_thread());
2192 prev_fe = NULL;
2193 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2194 if (__probable(tpkt)) {
2195 void *baddr;
2196 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2197 SK_PREFETCH(baddr, 0);
2198 /* prefetch L3 and L4 flow structs */
2199 SK_PREFETCHW(tpkt->pkt_flow, 0);
2200 SK_PREFETCHW(tpkt->pkt_flow, 128);
2201 }
2202
2203 KPKTQ_REMOVE(pktq, pkt);
2204
2205 pkt = rx_prepare_packet(fsw, pkt);
2206
2207 af = fsw->fsw_demux(fsw, pkt);
2208 if (__improbable(af == AF_UNSPEC)) {
2209 KPKTQ_ENQUEUE(&host_pkts, pkt);
2210 continue;
2211 }
2212
2213 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
2214 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
2215 if (__improbable(err != 0)) {
2216 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
2217 KPKTQ_ENQUEUE(&host_pkts, pkt);
2218 continue;
2219 }
2220
2221 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
2222 pkt = rx_process_ip_frag(fsw, pkt);
2223 if (pkt == NULL) {
2224 continue;
2225 }
2226 }
2227
2228 prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
2229 if (__improbable(fe == NULL)) {
2230 KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
2231 continue;
2232 }
2233
2234 dp_rx_process_wake_packet(fsw, fe, pkt);
2235
2236 rx_flow_batch_packets(&fes, fe, pkt, thread_id);
2237 prev_fe = fe;
2238 }
2239
2240 struct flow_entry *tfe = NULL;
2241 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2242 rx_flow_process(fsw, fe, &fes, &host_mq);
2243 flow_entry_release(&fe);
2244 }
2245
2246 if (!KPKTQ_EMPTY(&host_pkts)) {
2247 fsw_host_rx_cb(fsw, &host_pkts);
2248 fsw_host_rx_enqueue_mbq(fsw, &host_pkts, &host_mq);
2249 }
2250
2251 done:
2252 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
2253 FSW_RUNLOCK(fsw);
2254
2255 fsw_host_sendup(ifp, &host_mq);
2256 }
2257
2258 #if (DEVELOPMENT || DEBUG)
2259 static void
2260 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2261 struct __kern_packet *pkt)
2262 {
2263 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2264
2265 lck_mtx_lock_spin(&frt->frt_lock);
2266 KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2267 lck_mtx_unlock(&frt->frt_lock);
2268 }
2269
2270 static void
2271 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2272 {
2273 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2274
2275 ASSERT(frt->frt_thread != THREAD_NULL);
2276 lck_mtx_lock_spin(&frt->frt_lock);
2277 ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2278
2279 frt->frt_requests++;
2280 if (!(frt->frt_flags & FRT_RUNNING)) {
2281 thread_wakeup((caddr_t)frt);
2282 }
2283 lck_mtx_unlock(&frt->frt_lock);
2284 }
2285
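/*
 * RPS worker continuation: drain the per-thread pktq and run it through
 * _fsw_receive() under sync protection, looping while new requests keep
 * arriving, then block again (or terminate when asked to).
 */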
2286 __attribute__((noreturn))
2287 static void
2288 fsw_rps_thread_cont(void *v, wait_result_t w)
2289 {
2290 struct fsw_rps_thread *__single frt = v;
2291 struct nx_flowswitch *fsw = frt->frt_fsw;
2292
2293 lck_mtx_lock(&frt->frt_lock);
2294 if (__improbable(w == THREAD_INTERRUPTIBLE ||
2295 (frt->frt_flags & FRT_TERMINATING) != 0)) {
2296 goto terminate;
2297 }
2298 if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2299 goto done;
2300 }
2301 frt->frt_flags |= FRT_RUNNING;
2302
2303 for (;;) {
2304 uint32_t requests = frt->frt_requests;
2305 struct pktq pkts;
2306
2307 KPKTQ_INIT(&pkts);
2308 KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2309 lck_mtx_unlock(&frt->frt_lock);
2310
2311 sk_protect_t protect;
2312 protect = sk_sync_protect();
2313 _fsw_receive(fsw, &pkts);
2314 sk_sync_unprotect(protect);
2315
2316 lck_mtx_lock(&frt->frt_lock);
2317 if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2318 requests == frt->frt_requests) {
2319 frt->frt_requests = 0;
2320 break;
2321 }
2322 }
2323
2324 done:
2325 lck_mtx_unlock(&frt->frt_lock);
2326 if (!(frt->frt_flags & FRT_TERMINATING)) {
2327 frt->frt_flags &= ~FRT_RUNNING;
2328 assert_wait(frt, THREAD_UNINT);
2329 thread_block_parameter(fsw_rps_thread_cont, frt);
2330 __builtin_unreachable();
2331 } else {
2332 terminate:
2333 LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2334 frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2335 frt->frt_flags |= FRT_TERMINATED;
2336
2337 if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2338 thread_wakeup((caddr_t)&frt);
2339 }
2340 lck_mtx_unlock(&frt->frt_lock);
2341
2342 SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2343 frt->frt_idx);
2344
2345 /* for the extra refcnt from kernel_thread_start() */
2346 thread_deallocate(current_thread());
2347 /* this is the end */
2348 thread_terminate(current_thread());
2349 /* NOTREACHED */
2350 __builtin_unreachable();
2351 }
2352
2353 /* must never get here */
2354 VERIFY(0);
2355 /* NOTREACHED */
2356 __builtin_unreachable();
2357 }
2358
2359 __attribute__((noreturn))
2360 static void
2361 fsw_rps_thread_func(void *v, wait_result_t w)
2362 {
2363 #pragma unused(w)
2364 struct fsw_rps_thread *__single frt = v;
2365 struct nx_flowswitch *fsw = frt->frt_fsw;
2366 const char *__null_terminated tname = NULL;
2367
2368 char thread_name[MAXTHREADNAMESIZE];
2369 bzero(thread_name, sizeof(thread_name));
2370 tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2371 if_name(fsw->fsw_ifp), frt->frt_idx);
2372
2373 thread_set_thread_name(frt->frt_thread, tname);
2374 SK_D("%s spawned", tname);
2375
2376 net_thread_marks_push(NET_THREAD_SYNC_RX);
2377 assert_wait(frt, THREAD_UNINT);
2378 (void) thread_block_parameter(fsw_rps_thread_cont, frt);
2379
2380 __builtin_unreachable();
2381 }
2382
2383 static void
2384 fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2385 {
2386 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2387 uint64_t f = (1 * NSEC_PER_MSEC);
2388 uint64_t s = (1000 * NSEC_PER_SEC);
2389 uint32_t c = 0;
2390
2391 lck_mtx_lock(&frt->frt_lock);
2392 frt->frt_flags |= FRT_TERMINATING;
2393
2394 while (!(frt->frt_flags & FRT_TERMINATED)) {
2395 uint64_t t = 0;
2396 nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2397 clock_absolutetime_interval_to_deadline(t, &t);
2398 ASSERT(t != 0);
2399
2400 frt->frt_flags |= FRT_TERMINATEBLOCK;
2401 if (!(frt->frt_flags & FRT_RUNNING)) {
2402 thread_wakeup_one((caddr_t)frt);
2403 }
2404 (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2405 lck_mtx_unlock(&frt->frt_lock);
2406 thread_block(THREAD_CONTINUE_NULL);
2407 lck_mtx_lock(&frt->frt_lock);
2408 frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2409 }
2410 ASSERT(frt->frt_flags & FRT_TERMINATED);
2411 lck_mtx_unlock(&frt->frt_lock);
2412 frt->frt_thread = THREAD_NULL;
2413 }
2414
2415 static void
2416 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2417 {
2418 kern_return_t error;
2419 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2420
2421 lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2422 frt->frt_idx = i;
2423 frt->frt_fsw = fsw;
2424 error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2425 ASSERT(!error);
2426 KPKTQ_INIT(&frt->frt_pktq);
2427 }
2428
2429 int
2430 fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
2431 {
2432 if (n > FSW_RPS_MAX_NTHREADS) {
2433 SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2434 return EINVAL;
2435 }
2436
2437 FSW_WLOCK(fsw);
2438 if (n < fsw->fsw_rps_nthreads) {
2439 for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2440 fsw_rps_thread_join(fsw, i);
2441 }
2442 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2443 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2444 fsw->fsw_rps_nthreads = n;
2445 } else if (n > fsw->fsw_rps_nthreads) {
2446 uint32_t nthreads_old = fsw->fsw_rps_nthreads;
2447
2448 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2449 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2450 fsw->fsw_rps_nthreads = n;
2451 for (uint32_t i = nthreads_old; i < n; i++) {
2452 fsw_rps_thread_spawn(fsw, i);
2453 }
2454 }
2455 FSW_WUNLOCK(fsw);
2456 return 0;
2457 }
2458
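/*
 * Select an RPS worker for a packet by hashing its 5-tuple flow key;
 * packets that cannot be demuxed or classified all map to thread 0.
 */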
2459 static uint32_t
2460 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2461 {
2462 sa_family_t af = fsw->fsw_demux(fsw, pkt);
2463 if (__improbable(af == AF_UNSPEC)) {
2464 return 0;
2465 }
2466
2467 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2468
2469 if (__improbable((pkt->pkt_qum_qflags &
2470 QUM_F_FLOW_CLASSIFIED) == 0)) {
2471 return 0;
2472 }
2473
2474 struct flow_key key;
2475 flow_pkt2key(pkt, true, &key);
2476 key.fk_mask = FKMASK_5TUPLE;
2477
2478 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2479
2480 return id;
2481 }
2482
2483 #endif /* !DEVELOPMENT && !DEBUG */
2484
2485 void
2486 fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2487 {
2488 #if (DEVELOPMENT || DEBUG)
2489 FSW_RLOCK(fsw);
2490 if (fsw->fsw_rps_nthreads != 0) {
2491 struct __kern_packet *pkt, *tpkt;
2492 bitmap_t map = 0;
2493
2494 static_assert(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2495 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2496 uint32_t id = get_rps_id(fsw, pkt);
2497 KPKTQ_REMOVE(pktq, pkt);
2498 fsw_rps_rx(fsw, id, pkt);
2499 bitmap_set(&map, id);
2500 }
2501 for (int i = bitmap_first(&map, 64); i >= 0;
2502 i = bitmap_next(&map, i)) {
2503 fsw_rps_thread_schedule(fsw, i);
2504 }
2505 FSW_RUNLOCK(fsw);
2506 } else
2507 #endif /* !DEVELOPMENT && !DEBUG */
2508 {
2509 #if (DEVELOPMENT || DEBUG)
2510 FSW_RUNLOCK(fsw);
2511 #endif /* !DEVELOPMENT && !DEBUG */
2512 _fsw_receive(fsw, pktq);
2513 }
2514 }
2515
2516 int
2517 fsw_dev_input_netem_dequeue(void *handle,
2518 pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2519 {
2520 #pragma unused(handle)
2521 struct nx_flowswitch *__single fsw = handle;
2522 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2523 struct pktq pktq;
2524 sk_protect_t protect;
2525 uint32_t i;
2526
2527 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2528
2529 for (i = 0; i < n_pkts; i++) {
2530 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2531 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2532 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2533 }
2534
2535 protect = sk_sync_protect();
2536 KPKTQ_INIT(&pktq);
2537 pkts_to_pktq(kpkts, n_pkts, &pktq);
2538
2539 fsw_receive(fsw, &pktq);
2540 KPKTQ_FINI(&pktq);
2541 sk_sync_unprotect(protect);
2542
2543 return 0;
2544 }
2545
2546 static void
2547 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2548 {
2549 classq_pkt_t p;
2550 struct netem *__single ne;
2551 struct __kern_packet *pkt, *tpkt;
2552
2553 ASSERT(fsw->fsw_ifp != NULL);
2554 ne = fsw->fsw_ifp->if_input_netem;
2555 ASSERT(ne != NULL);
2556 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2557 bool pdrop;
2558 KPKTQ_REMOVE(q, pkt);
2559 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2560 netem_enqueue(ne, &p, &pdrop);
2561 }
2562 }
2563
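/*
 * Entry point from the device nexus adapter: walk the packet chain,
 * drop non-finalized packets, and batch the rest (up to fsw_rx_batch at
 * a time) into either the input netem stage or the flowswitch Rx path,
 * accumulating packet/byte statistics for the caller.
 */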
2564 void
2565 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2566 struct nexus_pkt_stats *out_stats)
2567 {
2568 struct __kern_packet *pkt = pkt_head, *next;
2569 struct nx_flowswitch *fsw;
2570 uint32_t n_bytes = 0, n_pkts = 0;
2571 uint64_t total_pkts = 0, total_bytes = 0;
2572 struct pktq q;
2573
2574 KPKTQ_INIT(&q);
2575 if (__improbable(devna->na_ifp == NULL ||
2576 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2577 SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2578 dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2579 return;
2580 }
2581 while (pkt != NULL) {
2582 if (__improbable(pkt->pkt_trace_id != 0)) {
2583 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2584 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2585 }
2586 next = pkt->pkt_nextpkt;
2587 pkt->pkt_nextpkt = NULL;
2588
2589 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2590 KPKTQ_ENQUEUE(&q, pkt);
2591 n_bytes += pkt->pkt_length;
2592 } else {
2593 DTRACE_SKYWALK1(non__finalized__drop,
2594 struct __kern_packet *, pkt);
2595 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2596 dp_drop_pkt_single(fsw, pkt, 0,
2597 DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2598 DROPTAP_FLAG_L2_MISSING);
2599 pkt = NULL;
2600 }
2601 n_pkts = KPKTQ_LEN(&q);
2602 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2603 if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2604 fsw_dev_input_netem_enqueue(fsw, &q);
2605 } else {
2606 fsw_receive(fsw, &q);
2607 }
2608 total_pkts += n_pkts;
2609 total_bytes += n_bytes;
2610 n_pkts = 0;
2611 n_bytes = 0;
2612 KPKTQ_FINI(&q);
2613 }
2614 pkt = next;
2615 }
2616 ASSERT(KPKTQ_LEN(&q) == 0);
2617 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2618 if (out_stats != NULL) {
2619 out_stats->nps_pkts += total_pkts;
2620 out_stats->nps_bytes += total_bytes;
2621 }
2622 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2623 }
2624
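/*
 * Compat Tx path: allocate an mbuf sized for the frame headroom plus the
 * packet and copy the payload into it; also copy the leading bytes into
 * the device packet's buffer so classification can still see the headers,
 * marking the packet truncated if that header copy is partial.
 */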
2625 static int
2626 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2627 struct __kern_packet *dpkt)
2628 {
2629 struct mbuf *__single m = NULL;
2630 uint32_t bdlen, bdlim, bdoff;
2631 uint8_t *bdaddr;
2632 unsigned int one = 1;
2633 int err = 0;
2634
2635 err = mbuf_allocpacket(MBUF_DONTWAIT,
2636 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2637 #if (DEVELOPMENT || DEBUG)
2638 if (m != NULL) {
2639 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2640 }
2641 #endif /* DEVELOPMENT || DEBUG */
2642 if (__improbable(m == NULL)) {
2643 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2644 err = ENOBUFS;
2645 goto done;
2646 }
2647
2648 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2649 if (fsw->fsw_frame_headroom > bdlim) {
2650 SK_ERR("not enough space in buffer for headroom");
2651 err = EINVAL;
2652 goto done;
2653 }
2654
2655 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2656 dpkt->pkt_mbuf = m;
2657 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2658
2659 /* packet copy into mbuf */
2660 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2661 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2662 fsw->fsw_frame_headroom, spkt->pkt_length,
2663 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2664 spkt->pkt_csum_tx_start_off);
2665 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2666
2667 /* header copy into dpkt buffer for classification */
2668 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2669 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2670 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2671 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2672 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2673 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2674 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2675 if (copy_len < spkt->pkt_length) {
2676 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2677 }
2678
2679 /*
2680 * fsw->fsw_frame_headroom is applied after m_data, so we treat m_data the
2681 * same as the buflet's baddr; m_data always points to the beginning of the
2682 * packet and should be the same as baddr + headroom.
2683 */
2684 ASSERT((uintptr_t)m->m_data ==
2685 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2686
2687 done:
2688 return err;
2689 }
2690
2691 static int
2692 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2693 struct __kern_packet *dpkt)
2694 {
2695 struct ifnet *ifp = fsw->fsw_ifp;
2696 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2697
2698 if (headroom > UINT8_MAX) {
2699 SK_ERR("headroom too large %d", headroom);
2700 return ERANGE;
2701 }
2702 dpkt->pkt_headroom = (uint8_t)headroom;
2703 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2704 dpkt->pkt_l2_len = 0;
2705 dpkt->pkt_link_flags = spkt->pkt_link_flags;
2706
2707 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2708 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2709 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2710 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2711 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2712 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2713 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2714 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2715 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2716 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2717
2718 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2719
2720 return 0;
2721 }
2722
2723 #if SK_LOG
2724 /* Hoisted out of line to reduce kernel stack footprint */
2725 SK_LOG_ATTRIBUTE
2726 static void
2727 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2728 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2729 {
2730 struct proc *p = current_proc();
2731 struct ifnet *ifp = fsw->fsw_ifp;
2732 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2733
2734 if (error == ERANGE) {
2735 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2736 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2737 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2738 (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2739 } else if (error == ENOBUFS) {
2740 SK_DF(logflags, "%s(%d) packet allocation failure",
2741 sk_proc_name(p), sk_proc_pid(p));
2742 } else if (error == 0) {
2743 ASSERT(dpkt != NULL);
2744 char *daddr;
2745 uint32_t pkt_len;
2746
2747 MD_BUFLET_ADDR_ABS(dpkt, daddr);
2748 pkt_len = __packet_get_real_data_length(dpkt);
2749 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2750 sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length,
2751 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2752 (uint32_t)fsw->fsw_frame_headroom,
2753 (uint32_t)ifp->if_tx_headroom);
2754 SK_DF(logflags | SK_VERB_DUMP, "%s",
2755 sk_dump("buf", daddr, pkt_len, 128));
2756 } else {
2757 SK_DF(logflags, "%s(%d) error %d", sk_proc_name(p),
2758 sk_proc_pid(p), error);
2759 }
2760 }
2761 #else
2762 #define dp_copy_to_dev_log(...)
2763 #endif /* SK_LOG */
2764
2765 static void
2766 fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2767 {
2768 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2769 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2770
2771 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2772 /* Copy packet metadata */
2773 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2774 _PKT_COPY(spkt, dpkt);
2775 _PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2776 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2777 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2778 ASSERT(dpkt->pkt_mbuf == NULL);
2779
2780 /* Copy AQM metadata */
2781 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2782 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2783 static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0);
2784 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2785 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2786 dpkt->pkt_policy_id = spkt->pkt_policy_id;
2787 dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2788 }
2789
2790 static int
2791 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2792 struct __kern_packet *dpkt)
2793 {
2794 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2795 struct ifnet *ifp = fsw->fsw_ifp;
2796 uint32_t dev_pkt_len;
2797 int err = 0;
2798
2799 fsw_pkt_copy_metadata(spkt, dpkt);
2800 switch (fsw->fsw_classq_enq_ptype) {
2801 case QP_MBUF:
2802 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2803 break;
2804
2805 case QP_PACKET:
2806 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2807 spkt->pkt_length;
2808 if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2809 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2810 err = ERANGE;
2811 goto done;
2812 }
2813 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2814 break;
2815
2816 default:
2817 VERIFY(0);
2818 __builtin_unreachable();
2819 }
2820 done:
2821 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2822 return err;
2823 }
2824
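/*
 * Copy only a header estimate (at most 128 bytes) into the device packet
 * for classification, while keeping pkt_length at the full original
 * length and marking the packet truncated.
 */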
2825 static int
2826 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2827 struct __kern_packet *dpkt)
2828 {
2829 uint8_t *sbaddr, *dbaddr;
2830 uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2831 uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2832
2833 fsw_pkt_copy_metadata(spkt, dpkt);
2834
2835 MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2836 ASSERT(sbaddr != NULL);
2837 sbaddr += spkt->pkt_headroom;
2838
2839 MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2840 ASSERT(dbaddr != NULL);
2841 dpkt->pkt_headroom = (uint8_t)headroom;
2842 dbaddr += headroom;
2843
2844 pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2845 METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2846
2847 /* packet length is set to the full length */
2848 dpkt->pkt_length = spkt->pkt_length;
2849 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2850 return 0;
2851 }
2852
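/*
 * Detach the mbuf carried by a compat packet, transfer flow and AQM
 * metadata from the packet into the mbuf header, then release the
 * now-empty packet.
 */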
2853 static struct mbuf *
2854 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2855 {
2856 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2857 ASSERT(pkt->pkt_mbuf != NULL);
2858 struct mbuf *m = pkt->pkt_mbuf;
2859
2860 /* pass additional metadata generated from flow parse/lookup */
2861 static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token));
2862 static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token));
2863 static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx));
2864 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2865 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2866 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2867 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2868 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2869 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2870 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2871
2872 if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2873 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2874 }
2875
2876 /* The packet should have a timestamp by the time we get here. */
2877 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2878 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2879
2880 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2881 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2882 /* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2883 m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
2884
2885 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2886 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2887 }
2888 KPKT_CLEAR_MBUF_DATA(pkt);
2889
2890 /* mbuf has been consumed, release packet as well */
2891 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2892 pp_free_packet_single(pkt);
2893 return m;
2894 }
2895
2896 static void
2897 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2898 struct mbuf **head, struct mbuf **tail,
2899 uint32_t *cnt, uint32_t *bytes)
2900 {
2901 struct __kern_packet *pkt = pkt_list, *next;
2902 struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2903 struct mbuf *__single m = NULL;
2904 uint32_t c = 0, b = 0;
2905
2906 while (pkt != NULL) {
2907 next = pkt->pkt_nextpkt;
2908 pkt->pkt_nextpkt = NULL;
2909 m = convert_pkt_to_mbuf(pkt);
2910 ASSERT(m != NULL);
2911
2912 *m_tailp = m;
2913 m_tailp = &m->m_nextpkt;
2914 c++;
2915 b += m_pktlen(m);
2916 pkt = next;
2917 }
2918 if (head != NULL) {
2919 *head = m_head;
2920 }
2921 if (tail != NULL) {
2922 *tail = m;
2923 }
2924 if (cnt != NULL) {
2925 *cnt = c;
2926 }
2927 if (bytes != NULL) {
2928 *bytes = b;
2929 }
2930 }
2931
2932 SK_NO_INLINE_ATTRIBUTE
2933 static int
2934 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2935 struct __kern_packet *pkt)
2936 {
2937 struct ifnet *ifp = fsw->fsw_ifp;
2938 boolean_t pkt_drop = FALSE;
2939 int err;
2940
2941 FSW_LOCK_ASSERT_HELD(fsw);
2942 ASSERT(fsw->fsw_classq_enabled);
2943 ASSERT(pkt->pkt_flow_token != 0);
2944 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2945 1, pkt->pkt_length);
2946
2947 if (__improbable(pkt->pkt_trace_id != 0)) {
2948 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2949 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2950 }
2951
2952 switch (fsw->fsw_classq_enq_ptype) {
2953 case QP_MBUF: { /* compat interface */
2954 struct mbuf *m;
2955
2956 m = convert_pkt_to_mbuf(pkt);
2957 ASSERT(m != NULL);
2958 pkt = NULL;
2959
2960 /* ifnet_enqueue consumes mbuf */
2961 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2962 m = NULL;
2963 #if (DEVELOPMENT || DEBUG)
2964 if (__improbable(!pkt_drop)) {
2965 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2966 }
2967 #endif /* DEVELOPMENT || DEBUG */
2968 if (pkt_drop) {
2969 FSW_STATS_INC(FSW_STATS_DROP);
2970 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2971 }
2972 break;
2973 }
2974 case QP_PACKET: { /* native interface */
2975 /* ifnet_enqueue consumes packet */
2976 err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &pkt_drop);
2977 pkt = NULL;
2978 #if (DEVELOPMENT || DEBUG)
2979 if (__improbable(!pkt_drop)) {
2980 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2981 }
2982 #endif /* DEVELOPMENT || DEBUG */
2983 if (pkt_drop) {
2984 FSW_STATS_INC(FSW_STATS_DROP);
2985 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2986 }
2987 break;
2988 }
2989 default:
2990 err = EINVAL;
2991 VERIFY(0);
2992 /* NOTREACHED */
2993 __builtin_unreachable();
2994 }
2995
2996 return err;
2997 }
2998
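/*
 * Enqueue an entire chain of packets belonging to one flow into the
 * interface output queue in a single call, converting it to an mbuf
 * chain first on compat interfaces; all packets in the chain are
 * expected to share the same service class.
 */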
2999 static int
3000 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
3001 struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
3002 uint32_t cnt, uint32_t bytes)
3003 {
3004 struct ifnet *ifp = fsw->fsw_ifp;
3005 boolean_t pkt_drop = FALSE;
3006 uint32_t svc;
3007 int err;
3008
3009 FSW_LOCK_ASSERT_HELD(fsw);
3010 ASSERT(fsw->fsw_classq_enabled);
3011 ASSERT(pkt_head->pkt_flow_token != 0);
3012
3013 /*
3014 * All packets in the flow should have the same svc.
3015 */
3016 svc = pkt_head->pkt_svc_class;
3017 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
3018
3019 switch (fsw->fsw_classq_enq_ptype) {
3020 case QP_MBUF: { /* compat interface */
3021 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
3022 uint32_t c = 0, b = 0;
3023
3024 convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
3025 ASSERT(m_head != NULL && m_tail != NULL);
3026 ASSERT(c == cnt);
3027 ASSERT(b == bytes);
3028 pkt_head = NULL;
3029
3030 /* ifnet_enqueue consumes mbuf */
3031 err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
3032 bytes, FALSE, &pkt_drop);
3033 m_head = NULL;
3034 m_tail = NULL;
3035 #if (DEVELOPMENT || DEBUG)
3036 if (__improbable(!pkt_drop)) {
3037 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
3038 }
3039 #endif /* DEVELOPMENT || DEBUG */
3040 if (pkt_drop) {
3041 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
3042 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
3043 cnt);
3044 }
3045 break;
3046 }
3047 case QP_PACKET: { /* native interface */
3048 /* ifnet_enqueue consumes packet */
3049 err = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_head, pkt_tail, cnt,
3050 bytes, FALSE, &pkt_drop);
3051 pkt_head = NULL;
3052 #if (DEVELOPMENT || DEBUG)
3053 if (__improbable(!pkt_drop)) {
3054 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
3055 }
3056 #endif /* DEVELOPMENT || DEBUG */
3057 if (pkt_drop) {
3058 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
3059 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
3060 cnt);
3061 }
3062 break;
3063 }
3064 default:
3065 err = EINVAL;
3066 VERIFY(0);
3067 /* NOTREACHED */
3068 __builtin_unreachable();
3069 }
3070
3071 return err;
3072 }
3073
3074 /*
3075 * This code path needs to be kept for interfaces without logical link support.
3076 */
3077 static void
3078 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3079 bool chain, uint32_t cnt, uint32_t bytes)
3080 {
3081 struct __kern_packet *pkt, *tail, *tpkt;
3082 flowadv_idx_t flow_adv_idx;
3083 bool flowadv_cap;
3084 flowadv_token_t flow_adv_token;
3085 int err;
3086
3087 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3088 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3089
3090 if (chain) {
3091 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3092 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3093 KPKTQ_INIT(&fe->fe_tx_pktq);
3094 if (pkt == NULL) {
3095 return;
3096 }
3097 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3098 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3099 flow_adv_token = pkt->pkt_flow_token;
3100
3101 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3102 DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, int, err);
3103 } else {
3104 uint32_t c = 0, b = 0;
3105
3106 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3107 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3108
3109 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3110 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3111 flow_adv_token = pkt->pkt_flow_token;
3112
3113 c++;
3114 b += pkt->pkt_length;
3115 err = classq_enqueue_flow_single(fsw, pkt);
3116 }
3117 ASSERT(c == cnt);
3118 ASSERT(b == bytes);
3119 DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3120 int, err);
3121 }
3122 }
3123
3124 /*
3125 * Logical link code path
3126 */
3127 static void
3128 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3129 bool chain, uint32_t cnt, uint32_t bytes)
3130 {
3131 struct __kern_packet *pkt, *tail;
3132 flowadv_idx_t flow_adv_idx;
3133 bool flowadv_cap;
3134 flowadv_token_t flow_adv_token;
3135 uint32_t flowctl = 0, dropped = 0;
3136 int err;
3137
3138 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3139 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3140
3141 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3142 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3143 KPKTQ_INIT(&fe->fe_tx_pktq);
3144 if (pkt == NULL) {
3145 return;
3146 }
3147 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3148 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3149 flow_adv_token = pkt->pkt_flow_token;
3150
3151 err = netif_qset_enqueue(fe->fe_qset, chain, pkt, tail, cnt, bytes,
3152 &flowctl, &dropped);
3153
3154 if (__improbable(err != 0) && dropped > 0) {
3155 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
3156 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, dropped);
3157 }
3158 }
3159
3160 static void
3161 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3162 {
3163 #pragma unused(fsw)
3164 /* finalize here; no more changes to buflets after classq */
3165 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3166 kern_packet_t ph = SK_PTR_ENCODE(pkt,
3167 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3168 int err = __packet_finalize(ph);
3169 VERIFY(err == 0);
3170 }
3171 }
3172
3173 static bool
3174 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3175 {
3176 struct flow_route *fr = fe->fe_route;
3177 int err;
3178
3179 ASSERT(fr != NULL);
3180
3181 if (__improbable(!dp_flow_route_process(fsw, fe))) {
3182 return false;
3183 }
3184 if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
3185 flow_qset_select_dynamic(fsw, fe, TRUE);
3186 }
3187
3188 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
3189 _fsw_error35_handler, 1, fr, NULL, NULL);
3190 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3191 _fsw_error36_handler, 1, fr, NULL);
3192
3193 /*
3194 * See if we need to resolve the flow route; note the test against
3195 * fr_flags here is done without any lock for performance. Thus
3196 * it's possible that we race against the thread performing route
3197 * event updates for a packet (which is OK). In any case we should
3198 * not have any assertion on fr_flags value(s) due to the lack of
3199 * serialization.
3200 */
3201 if (fr->fr_flags & FLOWRTF_RESOLVED) {
3202 goto frame;
3203 }
3204
3205 struct __kern_packet *pkt, *tpkt;
3206 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3207 err = fsw->fsw_resolve(fsw, fr, pkt);
3208 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3209 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3210 /*
3211 * If resolver returns EJUSTRETURN then we drop the pkt as the
3212 * resolver should have converted the pkt into mbuf (or
3213 * detached the attached mbuf from pkt) and added it to the
3214 * llinfo queue. If we do have a cached llinfo, then proceed
3215 * to using it even though it may be stale (very unlikely)
3216 * while the resolution is in progress.
3217 * Otherwise, any other error results in dropping pkt.
3218 */
3219 if (err == EJUSTRETURN) {
3220 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3221 pp_free_packet_single(pkt);
3222 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3223 continue;
3224 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3225 /* use existing llinfo */
3226 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3227 } else if (err != 0) {
3228 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3229 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
3230 DROPTAP_FLAG_L2_MISSING);
3231 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3232 continue;
3233 }
3234 }
3235
3236 frame:
3237 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3238 if (fsw->fsw_frame != NULL) {
3239 fsw->fsw_frame(fsw, fr, pkt);
3240 }
3241 }
3242
3243 return true;
3244 }
3245
3246 static void
3247 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3248 {
3249 #pragma unused(fsw)
3250 struct __kern_packet *pkt, *tpkt;
3251 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3252 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3253 /* listener is only allowed TCP RST */
3254 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3255 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3256 flow_track_abort_tcp(fe, NULL, pkt);
3257 } else {
3258 char *addr;
3259
3260 MD_BUFLET_ADDR_ABS(pkt, addr);
3261 SK_ERR("listener flow sends non-RST packet %s",
3262 sk_dump(sk_proc_name(current_proc()),
3263 addr, __packet_get_real_data_length(pkt), 128));
3264 }
3265 pp_free_packet_single(pkt);
3266 }
3267 }
3268
3269 static void
3270 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3271 volatile uint64_t *rt_ts, ifnet_t ifp, uint64_t now)
3272 {
3273 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3274 pkt->pkt_timestamp = now;
3275 }
3276 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3277
3278 /*
3279 * If the packet service class is not background,
3280 * update the timestamps on the interface, as well as
3281 * the ones in the nexus-wide advisory, to indicate recent
3282 * activity on a foreground flow.
3283 */
3284 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3285 ifp->if_fg_sendts = (uint32_t)net_uptime();
3286 if (fg_ts != NULL) {
3287 *fg_ts = net_uptime();
3288 }
3289 }
3290 if (pkt->pkt_pflags & PKT_F_REALTIME) {
3291 ifp->if_rt_sendts = (uint32_t)net_uptime();
3292 if (rt_ts != NULL) {
3293 *rt_ts = net_uptime();
3294 }
3295 }
3296 }
3297
3298 static bool
3299 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw)
3300 {
3301 return fsw_chain_enqueue != 0 &&
3302 fsw->fsw_ifp->if_output_netem == NULL &&
3303 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0;
3304 }
3305
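/*
 * Per-flow Tx processing: validate the route (resolving and framing as
 * needed), run flow tracking, stamp flow/AQM metadata and timestamps on
 * each packet, finalize it, then enqueue the flow either onto its
 * logical-link qset or onto the interface output queue, as a chain when
 * chain enqueue is enabled and the packets share a service class (or GSO
 * is in use).
 */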
3306 void
3307 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3308 uint32_t flags)
3309 {
3310 struct pktq dropped_pkts;
3311 bool chain, same_svc = true;
3312 bool gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3313 uint32_t cnt = 0, bytes = 0;
3314 volatile struct sk_nexusadv *nxadv = NULL;
3315 volatile uint64_t *fg_ts = NULL;
3316 volatile uint64_t *rt_ts = NULL;
3317 uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3318 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3319 uint16_t line = 0;
3320 uint32_t svc = 0;
3321 struct timespec now;
3322 uint64_t now_nsec = 0;
3323
3324 KPKTQ_INIT(&dropped_pkts);
3325 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3326 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3327 dp_listener_flow_tx_process(fsw, fe);
3328 return;
3329 }
3330 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3331 SK_RDERR(5, "Tx route bad");
3332 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3333 KPKTQ_LEN(&fe->fe_tx_pktq));
3334 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3335 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
3336 line = __LINE__;
3337 goto done;
3338 }
3339 chain = fsw_chain_enqueue_enabled(fsw) && KPKTQ_LEN(&fe->fe_tx_pktq) > 1;
3340 if (chain) {
3341 nanouptime(&now);
3342 net_timernsec(&now, &now_nsec);
3343 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3344 if (nxadv != NULL) {
3345 fg_ts = &nxadv->nxadv_fg_sendts;
3346 rt_ts = &nxadv->nxadv_rt_sendts;
3347 }
3348 }
3349
3350 struct __kern_packet *pkt, *tpkt;
3351 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3352 int err = 0;
3353 if (svc == 0) {
3354 svc = pkt->pkt_svc_class;
3355 }
3356
3357 err = flow_pkt_track(fe, pkt, false);
3358 if (__improbable(err != 0)) {
3359 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3360 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3361 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3362 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
3363 DROPTAP_FLAG_L2_MISSING);
3364 continue;
3365 }
3366 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3367 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3368
3369 /* set AQM related values for outgoing packet */
3370 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3371 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3372 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3373 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3374 } else {
3375 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3376 }
3377 _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
3378 pkt->pkt_flow_token = fe->fe_flowid;
3379 pkt->pkt_pflags |= PKT_F_FLOW_ID;
3380 pkt->pkt_qset_idx = qset_idx;
3381 pkt->pkt_policy_id = fe->fe_policy_id;
3382 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3383
3384 /*
3385 * The same code is exercised per packet for the non-chain case
3386 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3387 * re-walking the chain later.
3388 */
3389 if (chain && (gso || same_svc)) {
3390 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp, now_nsec);
3391 }
3392 /* mark packet tos/svc_class */
3393 fsw_qos_mark(fsw, fe, pkt);
3394
3395 tx_finalize_packet(fsw, pkt);
3396 bytes += pkt->pkt_length;
3397 cnt++;
3398
3399 same_svc = (same_svc && (svc == pkt->pkt_svc_class));
3400 /*
3401 * we are using the first 4 bytes of flow_id as the AQM flow
3402 * identifier.
3403 */
3404 ASSERT(!uuid_is_null(pkt->pkt_flow_id));
3405
3406 if (__improbable(pkt->pkt_trace_id != 0)) {
3407 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
3408 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
3409 }
3410 }
3411
3412 /* snoop after it's finalized */
3413 if (__improbable(pktap_total_tap_count != 0)) {
3414 fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
3415 }
3416
3417 chain = chain && (gso || same_svc);
3418 if (fe->fe_qset != NULL) {
3419 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3420 } else {
3421 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3422 }
3423 done:
3424 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
3425 }
3426
3427 static struct flow_entry *
3428 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3429 struct flow_entry *prev_fe, struct __kern_packet *pkt)
3430 {
3431 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3432
3433 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3434 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3435 SK_PERR(current_proc(), "invalid zero fragment id");
3436 return NULL;
3437 }
3438
3439 SK_PDF(SK_VERB_FSW_DP | SK_VERB_TX, current_proc(),
3440 "continuation frag, id %u", pkt->pkt_flow_ip_frag_id);
3441 if (__improbable(prev_fe == NULL ||
3442 !prev_fe->fe_tx_is_cont_frag)) {
3443 SK_PERR(current_proc(), "unexpected continuation frag %u",
3444 pkt->pkt_flow_ip_frag_id);
3445 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3446 return NULL;
3447 }
3448 if (__improbable(pkt->pkt_flow_ip_frag_id !=
3449 prev_fe->fe_tx_frag_id)) {
3450 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3451 SK_PERR(current_proc(), "wrong continuation frag id %u expecting %u",
3452 pkt->pkt_flow_ip_frag_id, prev_fe->fe_tx_frag_id);
3453 return NULL;
3454 }
3455
3456 return prev_fe;
3457 }
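/*
 * Fragment handling note: only the first fragment of an IP datagram carries
 * the transport header needed for a full flow lookup.  When that first
 * fragment is processed, the Tx path records the datagram's fragment id in
 * the flow entry (fe_tx_frag_id) and marks it as expecting continuations
 * (fe_tx_is_cont_frag).  Each non-first fragment is then matched here purely
 * by fragment id against that cached prev_fe; a missing or mismatched
 * context is counted and dropped by the caller.
 */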
3458
3459 static struct flow_entry *
3460 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3461 struct flow_entry *prev_fe)
3462 {
3463 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
3464 struct flow_entry *__single fe;
3465
3466 fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3467 if (__improbable(fe == NULL)) {
3468 goto done;
3469 }
3470
3471 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3472 SK_RDERR(5, "Tx flow torn down %s",
3473 fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3474 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3475 flow_entry_release(&fe);
3476 goto done;
3477 }
3478
3479 if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
3480 SK_RDERR(5, "Tx not allowed for this flow");
3481 SK_RDERR(5, "Tx not allowed for this flow %s",
3482 fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3483 FSW_STATS_INC(FSW_STATS_TX_DISABLED);
3484 flow_entry_release(&fe);
3485 goto done;
3486 }
3487
3488 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3489 null_func);
3490
3491 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3492 uuid_string_t flow_id_str, pkt_id_str;
3493 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3494 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3495 SK_ERR("pkt flow id %s != flow id %s, %s", pkt_id_str,
3496 flow_id_str, fe2str(fe, dbgbuf, sizeof(dbgbuf)));
3497 flow_entry_release(&fe);
3498 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3499 }
3500
3501 done:
3502 return fe;
3503 }
3504
3505 static inline void
3506 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3507 uint32_t flags)
3508 {
3509 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3510 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3511
3512 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3513 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3514
3515 /* flow related processing (default, agg, etc.) */
3516 fe->fe_tx_process(fsw, fe, flags);
3517
3518 KPKTQ_FINI(&fe->fe_tx_pktq);
3519 }
3520
3521 #if SK_LOG
3522 static void
3523 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3524 {
3525 char *pkt_buf;
3526 uint32_t pkt_len;
3527
3528 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3529 pkt_len = __packet_get_real_data_length(pkt);
3530 SK_DF(verb, "%s(%d) %s %s", sk_proc_name(current_proc()),
3531 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
3532 128));
3533 }
3534 #else /* !SK_LOG */
3535 #define dp_tx_log_pkt(...)
3536 #endif /* !SK_LOG */
3537
3538 static inline struct ifnet *
3539 fsw_datamov_begin(struct nx_flowswitch *fsw)
3540 {
3541 struct ifnet *ifp;
3542
3543 ifp = fsw->fsw_ifp;
3544 if (!ifnet_datamov_begin(ifp)) {
3545 DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3546 return NULL;
3547 }
3548 return ifp;
3549 }
3550
3551 static inline void
3552 fsw_datamov_end(struct nx_flowswitch *fsw)
3553 {
3554 ifnet_datamov_end(fsw->fsw_ifp);
3555 }
3556
3557 static void
3558 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3559 {
3560 struct __kern_packet *spkt, *pkt;
3561 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3562 struct flow_entry *__single fe, *__single prev_fe;
3563 struct pktq dropped_pkts, dpktq;
3564 struct nexus_adapter *dev_na;
3565 struct kern_pbufpool *dev_pp;
3566 struct ifnet *ifp = NULL;
3567 sa_family_t af;
3568 uint32_t n_pkts, n_flows = 0;
3569 boolean_t do_pacing = FALSE;
3570 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3571 uint16_t line = 0;
3572
3573 int err;
3574 KPKTQ_INIT(&dpktq);
3575 KPKTQ_INIT(&dropped_pkts);
3576 n_pkts = KPKTQ_LEN(spktq);
3577
3578 FSW_RLOCK(fsw);
3579 if (__improbable(FSW_QUIESCED(fsw))) {
3580 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3581 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3582 KPKTQ_CONCAT(&dropped_pkts, spktq);
3583 reason = DROP_REASON_FSW_QUIESCED;
3584 line = __LINE__;
3585 goto done;
3586 }
3587 dev_na = fsw->fsw_dev_ch->ch_na;
3588 if (__improbable(dev_na == NULL)) {
3589 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3590 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3591 KPKTQ_CONCAT(&dropped_pkts, spktq);
3592 reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
3593 line = __LINE__;
3594 goto done;
3595 }
3596 ifp = fsw_datamov_begin(fsw);
3597 if (ifp == NULL) {
3598 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3599 KPKTQ_CONCAT(&dropped_pkts, spktq);
3600 reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
3601 line = __LINE__;
3602 goto done;
3603 }
3604
3605 /* batch allocate enough packets */
3606 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3607
3608 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3609 NULL, SKMEM_NOSLEEP);
3610 #if DEVELOPMENT || DEBUG
3611 if (__probable(err != ENOMEM)) {
3612 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3613 }
3614 #endif /* DEVELOPMENT || DEBUG */
3615 if (__improbable(err == ENOMEM)) {
3616 ASSERT(KPKTQ_EMPTY(&dpktq));
3617 KPKTQ_CONCAT(&dropped_pkts, spktq);
3618 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3619 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3620 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
3621 line = __LINE__;
3622 goto done;
3623 } else if (__improbable(err == EAGAIN)) {
3624 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3625 (n_pkts - KPKTQ_LEN(&dpktq)));
3626 FSW_STATS_ADD(FSW_STATS_DROP,
3627 (n_pkts - KPKTQ_LEN(&dpktq)));
3628 }
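/*
 * EAGAIN from pp_alloc_pktq() means a partial allocation: dpktq holds fewer
 * than n_pkts device packets.  The shortfall has been accounted as drops in
 * the stats above, and the loop below re-reads n_pkts from dpktq so that
 * only as many source packets as there are device packets get copied; the
 * remaining source packets are left on spktq for the caller to free.
 */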
3629
3630 n_pkts = KPKTQ_LEN(&dpktq);
3631 prev_fe = NULL;
3632 KPKTQ_FOREACH(spkt, spktq) {
3633 if (n_pkts == 0) {
3634 break;
3635 }
3636 --n_pkts;
3637
3638 KPKTQ_DEQUEUE(&dpktq, pkt);
3639 ASSERT(pkt != NULL);
3640 err = dp_copy_to_dev(fsw, spkt, pkt);
3641 if (__improbable(err != 0)) {
3642 /*
3643 * Copy to dev pool failed, so droptap should capture
3644 * the source pkt because dev pkt might not have metadata
3645 * or buffer filled out yet. The source pkt is freed by
3646 * fsw_user_ring_flush(), so defer freeing it to that path.
3647 */
3648 dp_drop_pkt_single_nofree(fsw, spkt, 1,
3649 DROP_REASON_FSW_PKT_COPY_FAILED, DROPTAP_FLAG_L2_MISSING);
3650 /* Free the dev pool packet */
3651 pp_free_packet_single(pkt);
3652 continue;
3653 }
3654
3655 do_pacing |= __packet_get_tx_timestamp(SK_PKT2PH(pkt)) != 0;
3656 af = fsw_ip_demux(fsw, pkt);
3657 if (__improbable(af == AF_UNSPEC)) {
3658 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3659 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3660 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
3661 DROPTAP_FLAG_L2_MISSING);
3662 continue;
3663 }
3664
3665 err = flow_pkt_classify(pkt, ifp, af, false);
3666 if (__improbable(err != 0)) {
3667 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3668 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3669 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
3670 DROPTAP_FLAG_L2_MISSING);
3671 continue;
3672 }
3673
3674 if (__improbable(pkt->pkt_flow_ip_is_frag &&
3675 !pkt->pkt_flow_ip_is_first_frag)) {
3676 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3677 if (__probable(fe != NULL)) {
3678 flow_entry_retain(fe);
3679 goto flow_batch;
3680 } else {
3681 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3682 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
3683 DROPTAP_FLAG_L2_MISSING);
3684 continue;
3685 }
3686 }
3687
3688 fe = tx_lookup_flow(fsw, pkt, prev_fe);
3689 if (__improbable(fe == NULL)) {
3690 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3691 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
3692 DROPTAP_FLAG_L2_MISSING);
3693 prev_fe = NULL;
3694 continue;
3695 }
3696 flow_batch:
3697 tx_flow_batch_packet(&fes, fe, pkt);
3698 prev_fe = fe;
3699 }
3700
3701 struct flow_entry *tfe = NULL;
3702 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3703 tx_flow_process(fsw, fe, 0);
3704 TAILQ_REMOVE(&fes, fe, fe_tx_link);
3705 fe->fe_tx_is_cont_frag = false;
3706 fe->fe_tx_frag_id = 0;
3707 flow_entry_release(&fe);
3708 n_flows++;
3709 }
3710
3711 done:
3712 FSW_RUNLOCK(fsw);
3713 if (n_flows > 0) {
3714 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3715 }
3716 if (ifp != NULL) {
3717 fsw_datamov_end(fsw);
3718 }
3719 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
3720 KPKTQ_FINI(&dropped_pkts);
3721 KPKTQ_FINI(&dpktq);
3722 }
3723
3724 static sa_family_t
3725 get_tso_af(struct __kern_packet *pkt)
3726 {
3727 packet_tso_flags_t tso_flags;
3728
3729 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3730 if (tso_flags == PACKET_TSO_IPV4) {
3731 return AF_INET;
3732 } else if (tso_flags == PACKET_TSO_IPV6) {
3733 return AF_INET6;
3734 } else {
3735 panic("invalid tso flags: 0x%x\n", tso_flags);
3736 /* NOTREACHED */
3737 __builtin_unreachable();
3738 }
3739 }
3740
3741 static inline void
3742 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3743 {
3744 struct tcphdr *__single tcp = tcphdr;
3745
3746 DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3747 void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3748 pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3749 pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3750 pkt->pkt_flow_tcp_flags = tcp->th_flags;
3751 pkt->pkt_flow_tcp_seq = tcp->th_seq;
3752 pkt->pkt_flow_ulen = payload_sz;
3753 }
3754
3755 static int
3756 do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3757 struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3758 struct pktq *gso_pktq)
3759 {
3760 ifnet_t ifp = fsw->fsw_ifp;
3761 struct __kern_packet *pkt = first_pkt;
3762 uint8_t proto = pkt->pkt_flow_ip_proto;
3763 uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3764 uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3765 uint16_t total_hlen = ip_hlen + tcp_hlen;
3766 uint16_t mtu = (uint16_t)ifp->if_mtu;
3767 uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3768 uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3769 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3770 kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3771 uint8_t *orig_pkt_baddr;
3772 struct tcphdr *tcp;
3773 struct ip *ip;
3774 struct ip6_hdr *ip6;
3775 uint32_t tcp_seq;
3776 uint16_t ipid;
3777 uint32_t pseudo_hdr_csum, bufsz;
3778 uint64_t pkt_tx_timestamp = 0;
3779
3780 ASSERT(headroom <= UINT8_MAX);
3781 if (proto != IPPROTO_TCP) {
3782 SK_ERR("invalid proto: %d", proto);
3783 DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3784 fsw, ifnet_t, ifp, uint8_t, proto);
3785 return EINVAL;
3786 }
3787 if (mss == 0 || mss > (mtu - total_hlen)) {
3788 SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3789 mss, mtu, total_hlen);
3790 DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3791 fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3792 uint32_t, total_hlen);
3793 return EINVAL;
3794 }
3795 bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3796 if ((headroom + total_hlen + mss) > bufsz) {
3797 SK_ERR("invalid args: headroom %d, total_hlen %d, "
3798 "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3799 DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3800 fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3801 total_hlen, uint16_t, mss, uint32_t, bufsz);
3802 return EINVAL;
3803 }
3804 n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
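/*
 * Segment count is a ceiling division of the TCP payload by the MSS.
 * For example (illustrative numbers): total_len = 4400, total_hlen = 40 and
 * mss = 1448 give a 4360-byte payload, and SK_ROUNDUP(4360, 1448) / 1448 = 4
 * segments (three full ones plus a 16-byte tail).
 */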
3805
3806 ASSERT(pkt->pkt_headroom == headroom);
3807 ASSERT(pkt->pkt_length == total_len);
3808 ASSERT(pkt->pkt_l2_len == 0);
3809 ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3810 ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3811 pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3812 pkt->pkt_proto_seg_sz = 0;
3813 pkt->pkt_csum_flags = 0;
3814 MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3815 orig_pkt_baddr += orig_pkt->pkt_headroom;
3816
3817 if (af == AF_INET) {
3818 /*
3819 * XXX -fbounds-safety: can't avoid using forge unless we change
3820 * the flow metadata definition.
3821 */
3822 ip = __unsafe_forge_bidi_indexable(struct ip *,
3823 pkt->pkt_flow_ip_hdr, pkt->pkt_length);
3824 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3825 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3826 ipid = ip->ip_id;
3827 pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3828 pkt->pkt_flow_ipv4_dst.s_addr, 0);
3829 } else {
3830 ASSERT(af == AF_INET6);
3831 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3832 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3833 pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3834 &pkt->pkt_flow_ipv6_dst, 0);
3835 }
3836 tcp_seq = ntohl(tcp->th_seq);
3837
3838 pkt_tx_timestamp = __packet_get_tx_timestamp(orig_ph);
3839
3840 for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3841 off += payload_sz) {
3842 uint8_t *baddr, *baddr0;
3843 uint32_t partial;
3844
3845 if (pkt == NULL) {
3846 n++;
3847 KPKTQ_DEQUEUE(dev_pktq, pkt);
3848 ASSERT(pkt != NULL);
3849 }
3850 MD_BUFLET_ADDR_ABS(pkt, baddr0);
3851 baddr = baddr0;
3852 baddr += headroom;
3853
3854 /* Copy headers from the original packet */
3855 if (n != 1) {
3856 ASSERT(pkt != first_pkt);
3857 pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3858 fsw_pkt_copy_metadata(first_pkt, pkt);
3859
3860 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3861 /* flow info still needs to be updated below */
3862 bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3863 sizeof(*pkt->pkt_flow));
3864 pkt->pkt_trace_id = 0;
3865 ASSERT(pkt->pkt_headroom == headroom);
3866 } else {
3867 METADATA_SET_LEN(pkt, 0, 0);
3868 }
3869 baddr += total_hlen;
3870
3871 /* copy tx timestamp from the original packet */
3872 __packet_set_tx_timestamp(SK_PKT2PH(pkt), pkt_tx_timestamp);
3873
3874 /* Copy/checksum the payload from the original packet */
3875 if (off + payload_sz > total_len) {
3876 payload_sz = (uint16_t)(total_len - off);
3877 }
3878 pkt_copypkt_sum(orig_ph,
3879 (uint16_t)(orig_pkt->pkt_headroom + off),
3880 SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3881 &partial, TRUE);
3882
3883 DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3884 ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3885 uint16_t, mss, uint32_t, partial);
3886 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3887
3888 /*
3889 * Adjust header information and fill in the missing fields.
3890 */
3891 if (af == AF_INET) {
3892 ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3893 tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3894
3895 if (n != n_pkts) {
3896 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3897 }
3898 if (n != 1) {
3899 tcp->th_flags &= ~TH_CWR;
3900 tcp->th_seq = htonl(tcp_seq);
3901 }
3902 update_flow_info(pkt, ip, tcp, payload_sz);
3903
3904 ip->ip_id = htons((ipid)++);
3905 ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3906 ip->ip_sum = 0;
3907 ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
3908 tcp->th_sum = 0;
3909
3910 partial = __packet_cksum(tcp, tcp_hlen, partial);
3911 partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3912 partial += pseudo_hdr_csum;
3913 ADDCARRY(partial);
3914 tcp->th_sum = ~(uint16_t)partial;
3915 } else {
3916 ASSERT(af == AF_INET6);
3917 ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
3918 tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3919
3920 if (n != n_pkts) {
3921 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3922 }
3923 if (n != 1) {
3924 tcp->th_flags &= ~TH_CWR;
3925 tcp->th_seq = htonl(tcp_seq);
3926 }
3927 update_flow_info(pkt, ip6, tcp, payload_sz);
3928
3929 ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3930 tcp->th_sum = 0;
3931 partial = __packet_cksum(tcp, tcp_hlen, partial);
3932 partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
3933 partial += pseudo_hdr_csum;
3934 ADDCARRY(partial);
3935 tcp->th_sum = ~(uint16_t)partial;
3936 }
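/*
 * Checksum assembly: pkt_copypkt_sum() produced a partial one's-complement
 * sum over the copied payload, __packet_cksum() folded in the TCP header,
 * and the lines above add the remaining pseudo-header pieces (length plus
 * protocol, plus the precomputed source/destination address sum in
 * pseudo_hdr_csum).  ADDCARRY() folds the 32-bit accumulator back to 16 bits
 * and the one's complement of that value becomes th_sum.
 */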
3937 tcp_seq += payload_sz;
3938 METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3939 #if (DEVELOPMENT || DEBUG)
3940 struct __kern_buflet *bft;
3941 uint32_t blen;
3942 PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3943 blen = __buflet_get_data_length(bft);
3944 if (blen != total_hlen + payload_sz) {
3945 panic("blen (%d) != total_len + payload_sz (%d)\n",
3946 blen, total_hlen + payload_sz);
3947 }
3948 #endif /* DEVELOPMENT || DEBUG */
3949
3950 pkt->pkt_length = total_hlen + payload_sz;
3951 KPKTQ_ENQUEUE(gso_pktq, pkt);
3952 pkt = NULL;
3953
3954 /*
3955 * Note that at this point the packet is not yet finalized.
3956 * The finalization happens in dp_flow_tx_process() after
3957 * the framing is done.
3958 */
3959 }
3960 ASSERT(n == n_pkts);
3961 ASSERT(off == total_len);
3962 DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3963 uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3964 uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3965 return 0;
3966 }
3967
3968 static void
3969 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3970 struct pktq *gso_pktq)
3971 {
3972 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3973 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3974 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3975 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3976 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3977 KPKTQ_INIT(gso_pktq);
3978 } else {
3979 ASSERT(!TAILQ_EMPTY(fes));
3980 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3981 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3982 KPKTQ_INIT(gso_pktq);
3983 flow_entry_release(&fe);
3984 }
3985 }
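/*
 * Reference note: tx_lookup_flow() in dp_gso_pktq() below returns fe with a
 * reference held.  The first GSO chain for a flow keeps that reference and
 * parks fe on the fes list (it is released after tx_flow_process() in
 * dp_gso_pktq()); subsequent chains for the same flow only append their
 * packets above and drop the extra reference here.
 */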
3986
3987 static void
3988 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3989 uint32_t gso_pkts_estimate)
3990 {
3991 struct __kern_packet *spkt, *pkt;
3992 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3993 struct flow_entry *__single fe, *__single prev_fe;
3994 struct pktq dpktq;
3995 struct nexus_adapter *dev_na;
3996 struct kern_pbufpool *dev_pp;
3997 struct ifnet *ifp = NULL;
3998 sa_family_t af;
3999 uint32_t n_pkts, n_flows = 0;
4000 int err;
4001
4002 KPKTQ_INIT(&dpktq);
4003 n_pkts = KPKTQ_LEN(spktq);
4004
4005 FSW_RLOCK(fsw);
4006 if (__improbable(FSW_QUIESCED(fsw))) {
4007 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
4008 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
4009 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
4010 DROPTAP_FLAG_L2_MISSING);
4011 goto done;
4012 }
4013 dev_na = fsw->fsw_dev_ch->ch_na;
4014 if (__improbable(dev_na == NULL)) {
4015 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
4016 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
4017 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
4018 __LINE__, DROPTAP_FLAG_L2_MISSING);
4019 goto done;
4020 }
4021 ifp = fsw_datamov_begin(fsw);
4022 if (ifp == NULL) {
4023 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
4024 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
4025 __LINE__, DROPTAP_FLAG_L2_MISSING);
4026 goto done;
4027 }
4028
4029 dev_pp = na_kr_get_pp(dev_na, NR_TX);
4030
4031 /*
4032 * Batch allocate enough packets to perform GSO on all
4033 * packets in spktq.
4034 */
4035 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
4036 gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
4037 #if DEVELOPMENT || DEBUG
4038 if (__probable(err != ENOMEM)) {
4039 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
4040 }
4041 #endif /* DEVELOPMENT || DEBUG */
4042 /*
4043 * We either get all packets or none. No partial allocations.
4044 */
4045 if (__improbable(err != 0)) {
4046 if (err == ENOMEM) {
4047 ASSERT(KPKTQ_EMPTY(&dpktq));
4048 } else {
4049 dp_free_pktq(fsw, &dpktq);
4050 }
4051 DTRACE_SKYWALK1(gso__no__mem, int, err);
4052 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
4053 __LINE__, DROPTAP_FLAG_L2_MISSING);
4054 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
4055 SK_ERR("failed to alloc %u pkts from device pool",
4056 gso_pkts_estimate);
4057 goto done;
4058 }
4059 prev_fe = NULL;
4060 KPKTQ_FOREACH(spkt, spktq) {
4061 KPKTQ_DEQUEUE(&dpktq, pkt);
4062 ASSERT(pkt != NULL);
4063 /*
4064 * Copy only headers to the first packet of the GSO chain.
4065 * The headers will be used for classification below.
4066 */
4067 err = dp_copy_headers_to_dev(fsw, spkt, pkt);
4068 if (__improbable(err != 0)) {
4069 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
4070 DROPTAP_FLAG_L2_MISSING);
4071 DTRACE_SKYWALK2(copy__headers__failed,
4072 struct nx_flowswitch *, fsw,
4073 struct __kern_packet *, spkt);
4074 continue;
4075 }
4076 af = get_tso_af(pkt);
4077 ASSERT(af == AF_INET || af == AF_INET6);
4078
4079 err = flow_pkt_classify(pkt, ifp, af, false);
4080 if (__improbable(err != 0)) {
4081 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
4082 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
4083 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
4084 DROPTAP_FLAG_L2_MISSING);
4085 DTRACE_SKYWALK4(classify__failed,
4086 struct nx_flowswitch *, fsw,
4087 struct __kern_packet *, spkt,
4088 struct __kern_packet *, pkt,
4089 int, err);
4090 continue;
4091 }
4092 /*
4093 * GSO cannot be done on a fragment and it's a bug in user
4094 * space to mark a fragment as needing GSO.
4095 */
4096 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
4097 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
4098 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4099 DROPTAP_FLAG_L2_MISSING);
4100 DTRACE_SKYWALK3(is__frag,
4101 struct nx_flowswitch *, fsw,
4102 struct __kern_packet *, spkt,
4103 struct __kern_packet *, pkt);
4104 continue;
4105 }
4106 fe = tx_lookup_flow(fsw, pkt, prev_fe);
4107 if (__improbable(fe == NULL)) {
4108 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
4109 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
4110 DROPTAP_FLAG_L2_MISSING);
4111 DTRACE_SKYWALK3(lookup__failed,
4112 struct nx_flowswitch *, fsw,
4113 struct __kern_packet *, spkt,
4114 struct __kern_packet *, pkt);
4115 prev_fe = NULL;
4116 continue;
4117 }
4118 /*
4119 * Perform GSO on spkt using the flow information
4120 * obtained above.
4121 */
4122 struct pktq gso_pktq;
4123 KPKTQ_INIT(&gso_pktq);
4124 err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
4125 if (__probable(err == 0)) {
4126 tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
4127 prev_fe = fe;
4128 } else {
4129 DTRACE_SKYWALK1(gso__error, int, err);
4130 /* TODO: increment error stat */
4131 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
4132 DROPTAP_FLAG_L2_MISSING);
4133 flow_entry_release(&fe);
4134 prev_fe = NULL;
4135 }
4136 KPKTQ_FINI(&gso_pktq);
4137 }
4138 struct flow_entry *tfe = NULL;
4139 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
4140 /* Chain-enqueue can be used for GSO chains */
4141 tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
4142 TAILQ_REMOVE(&fes, fe, fe_tx_link);
4143 flow_entry_release(&fe);
4144 n_flows++;
4145 }
4146 done:
4147 FSW_RUNLOCK(fsw);
4148 if (n_flows > 0) {
4149 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
4150 }
4151 if (ifp != NULL) {
4152 fsw_datamov_end(fsw);
4153 }
4154
4155 /*
4156 * It's possible for packets to be left in dpktq because
4157 * gso_pkts_estimate is only an estimate. The actual number
4158 * of packets needed could be less.
4159 */
4160 uint32_t dpktq_len;
4161 if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
4162 DTRACE_SKYWALK2(leftover__dev__pkts,
4163 struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
4164 dp_free_pktq(fsw, &dpktq);
4165 }
4166 KPKTQ_FINI(&dpktq);
4167 }
4168
4169 static inline void
4170 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4171 struct proc *p)
4172 {
4173 #pragma unused(p)
4174 uint32_t total_pkts = 0, total_bytes = 0;
4175
4176 for (;;) {
4177 struct pktq pktq;
4178 KPKTQ_INIT(&pktq);
4179 uint32_t n_bytes;
4180 fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4181 if (n_bytes == 0) {
4182 break;
4183 }
4184 total_pkts += KPKTQ_LEN(&pktq);
4185 total_bytes += n_bytes;
4186
4187 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4188 fsw_receive(fsw, &pktq);
4189 } else {
4190 fsw_dev_input_netem_enqueue(fsw, &pktq);
4191 }
4192 KPKTQ_FINI(&pktq);
4193 }
4194
4195 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4196 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4197 uint32_t, total_bytes);
4198
4199 /* compute mitigation rate for delivered traffic */
4200 if (__probable(r->ckr_netif_mit_stats != NULL)) {
4201 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4202 }
4203 }
4204
4205 static inline void
4206 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4207 struct proc *p)
4208 {
4209 #pragma unused(p)
4210 static packet_trace_id_t trace_id = 0;
4211 uint32_t total_pkts = 0, total_bytes = 0;
4212
4213 for (;;) {
4214 struct pktq pktq;
4215 KPKTQ_INIT(&pktq);
4216 uint32_t n_bytes;
4217 uint32_t gso_pkts_estimate = 0;
4218
4219 fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4220 &gso_pkts_estimate);
4221 if (n_bytes == 0) {
4222 break;
4223 }
4224 total_pkts += KPKTQ_LEN(&pktq);
4225 total_bytes += n_bytes;
4226
4227 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
4228 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4229 KPKTQ_FIRST(&pktq)->pkt_trace_id);
4230
4231 if (gso_pkts_estimate > 0) {
4232 dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4233 } else {
4234 dp_tx_pktq(fsw, &pktq);
4235 }
4236 dp_free_pktq(fsw, &pktq);
4237 KPKTQ_FINI(&pktq);
4238 }
4239 kr_update_stats(r, total_pkts, total_bytes);
4240
4241 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4242 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4243 uint32_t, total_bytes);
4244 }
4245
4246 void
4247 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4248 struct proc *p)
4249 {
4250 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4251
4252 ASSERT(sk_is_sync_protected());
4253 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4254 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4255
4256 if (vpna->vpna_nx_port == FSW_VP_DEV) {
4257 fsw_dev_ring_flush(fsw, r, p);
4258 } else {
4259 fsw_user_ring_flush(fsw, r, p);
4260 }
4261 }
4262
4263 int
4264 fsw_dp_ctor(struct nx_flowswitch *fsw)
4265 {
4266 uint32_t fe_cnt = fsw_fe_table_size;
4267 uint32_t fob_cnt = fsw_flow_owner_buckets;
4268 uint32_t frb_cnt = fsw_flow_route_buckets;
4269 uint32_t frib_cnt = fsw_flow_route_id_buckets;
4270 struct kern_nexus *nx = fsw->fsw_nx;
4271 char name[64];
4272 const char *__null_terminated fsw_name = NULL;
4273 int error = 0;
4274
4275 /* just in case */
4276 if (fe_cnt == 0) {
4277 fe_cnt = NX_FSW_FE_TABLESZ;
4278 ASSERT(fe_cnt != 0);
4279 }
4280 if (fob_cnt == 0) {
4281 fob_cnt = NX_FSW_FOB_HASHSZ;
4282 ASSERT(fob_cnt != 0);
4283 }
4284 if (frb_cnt == 0) {
4285 frb_cnt = NX_FSW_FRB_HASHSZ;
4286 ASSERT(frb_cnt != 0);
4287 }
4288 if (frib_cnt == 0) {
4289 frib_cnt = NX_FSW_FRIB_HASHSZ;
4290 ASSERT(frib_cnt != 0);
4291 }
4292
4293 /* make sure fe_cnt is a power of two, else round up */
4294 if ((fe_cnt & (fe_cnt - 1)) != 0) {
4295 fe_cnt--;
4296 fe_cnt |= (fe_cnt >> 1);
4297 fe_cnt |= (fe_cnt >> 2);
4298 fe_cnt |= (fe_cnt >> 4);
4299 fe_cnt |= (fe_cnt >> 8);
4300 fe_cnt |= (fe_cnt >> 16);
4301 fe_cnt++;
4302 }
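/*
 * The bit-smearing above rounds a non-power-of-two count up to the next
 * power of two: decrement, OR in all lower bits, then increment.  For
 * example, fe_cnt = 5000 (0x1388) becomes 0x1387 after the decrement,
 * 0x1fff after the shifts, and 0x2000 (8192) after the final increment.
 * The same idiom is applied to frb_cnt below.
 */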
4303
4304 /* make sure frb_cnt is a power of two, else round up */
4305 if ((frb_cnt & (frb_cnt - 1)) != 0) {
4306 frb_cnt--;
4307 frb_cnt |= (frb_cnt >> 1);
4308 frb_cnt |= (frb_cnt >> 2);
4309 frb_cnt |= (frb_cnt >> 4);
4310 frb_cnt |= (frb_cnt >> 8);
4311 frb_cnt |= (frb_cnt >> 16);
4312 frb_cnt++;
4313 }
4314
4315 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4316 &nexus_lock_attr);
4317 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4318 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4319 TAILQ_INIT(&fsw->fsw_linger_head);
4320 lck_mtx_init(&fsw->fsw_rxstrc_lock, &nexus_lock_group, &nexus_lock_attr);
4321 TAILQ_INIT(&fsw->fsw_rxstrc_head);
4322
4323 fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4324 error = nx_advisory_alloc(nx, fsw_name,
4325 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4326 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4327 if (error != 0) {
4328 fsw_dp_dtor(fsw);
4329 return error;
4330 }
4331
4332 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4333 if (fsw->fsw_flow_mgr == NULL) {
4334 fsw_dp_dtor(fsw);
4335 return error;
4336 }
4337
4338 /* generic name; will be customized upon ifattach */
4339 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4340 FSW_REAP_THREADNAME, name, "");
4341
4342 if (kernel_thread_start(fsw_reap_thread_func, fsw,
4343 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4344 panic_plain("%s: can't create thread", __func__);
4345 /* NOTREACHED */
4346 __builtin_unreachable();
4347 }
4348 /* this must not fail */
4349 VERIFY(fsw->fsw_reap_thread != NULL);
4350
4351 SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw));
4352
4353
4354 return error;
4355 }
4356
4357 void
4358 fsw_dp_dtor(struct nx_flowswitch *fsw)
4359 {
4360 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
4361 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
4362 uint32_t i = 0;
4363
4364 #if (DEVELOPMENT || DEBUG)
4365 if (fsw->fsw_rps_threads != NULL) {
4366 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4367 fsw_rps_thread_join(fsw, i);
4368 }
4369 kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4370 fsw->fsw_rps_threads);
4371 }
4372 #endif /* !DEVELOPMENT && !DEBUG */
4373
4374 nx_advisory_free(fsw->fsw_nx);
4375
4376 if (fsw->fsw_reap_thread != THREAD_NULL) {
4377 /* signal thread to begin self-termination */
4378 lck_mtx_lock(&fsw->fsw_reap_lock);
4379 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4380
4381 /*
4382 * And wait for thread to terminate; use another
4383 * wait channel here other than fsw_reap_flags to
4384 * make it more explicit. In the event the reaper
4385 * thread misses a wakeup, we'll try again once
4386 * every second (except for the first time).
4387 */
4388 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4389 uint64_t t = 0;
4390
4391 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4392 clock_absolutetime_interval_to_deadline(t, &t);
4393 ASSERT(t != 0);
4394
4395 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4396 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4397 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4398 }
4399 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
4400 THREAD_UNINT, t);
4401 lck_mtx_unlock(&fsw->fsw_reap_lock);
4402 thread_block(THREAD_CONTINUE_NULL);
4403 lck_mtx_lock(&fsw->fsw_reap_lock);
4404 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4405 }
4406 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4407 lck_mtx_unlock(&fsw->fsw_reap_lock);
4408 fsw->fsw_reap_thread = THREAD_NULL;
4409 }
4410
4411 /* free any remaining flow entries in the linger list */
4412 fsw_linger_purge(fsw);
4413 fsw_rxstrc_purge(fsw);
4414
4415 if (fsw->fsw_flow_mgr != NULL) {
4416 flow_mgr_destroy(fsw->fsw_flow_mgr);
4417 fsw->fsw_flow_mgr = NULL;
4418 }
4419
4420
4421 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4422 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4423 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4424 }
4425
4426 void
4427 fsw_linger_insert(struct flow_entry *fe)
4428 {
4429 struct nx_flowswitch *fsw = fe->fe_fsw;
4430 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4431 SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4432
4433 net_update_uptime();
4434
4435 ASSERT(flow_entry_refcnt(fe) >= 1);
4436 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4437 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4438 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4439 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4440 ASSERT(fe->fe_linger_wait != 0);
4441 fe->fe_linger_expire = (net_uptime() + fe->fe_linger_wait);
4442 os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4443
4444 lck_mtx_lock_spin(&fsw->fsw_linger_lock);
4445 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4446 fsw->fsw_linger_cnt++;
4447 VERIFY(fsw->fsw_linger_cnt != 0);
4448 lck_mtx_unlock(&fsw->fsw_linger_lock);
4449
4450 fsw_reap_sched(fsw);
4451 }
4452
4453 static void
4454 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4455 struct flow_entry *fe)
4456 {
4457 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4458 SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4459
4460 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4461 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4462 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4463 os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4464
4465 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4466 flow_entry_release(&fe);
4467 }
4468
4469 static void
4470 fsw_linger_remove(struct flow_entry *fe)
4471 {
4472 struct nx_flowswitch *fsw = fe->fe_fsw;
4473
4474 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4475
4476 fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4477 VERIFY(fsw->fsw_linger_cnt != 0);
4478 fsw->fsw_linger_cnt--;
4479 }
4480
4481 void
4482 fsw_linger_purge(struct nx_flowswitch *fsw)
4483 {
4484 struct flow_entry *fe, *tfe;
4485
4486 lck_mtx_lock(&fsw->fsw_linger_lock);
4487 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4488 fsw_linger_remove(fe);
4489 }
4490 ASSERT(fsw->fsw_linger_cnt == 0);
4491 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4492 lck_mtx_unlock(&fsw->fsw_linger_lock);
4493 }
4494
4495 void
4496 fsw_rxstrc_insert(struct flow_entry *fe)
4497 {
4498 struct nx_flowswitch *fsw = fe->fe_fsw;
4499 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4500 SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4501
4502 ASSERT(flow_entry_refcnt(fe) >= 1);
4503 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4504 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4505 ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
4506 ASSERT(!(fe->fe_flags & FLOWENTF_RXSTRC_PENDING));
4507 os_atomic_or(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);
4508
4509 flow_entry_retain(fe);
4510
4511 lck_mtx_lock_spin(&fsw->fsw_rxstrc_lock);
4512 TAILQ_INSERT_TAIL(&fsw->fsw_rxstrc_head, fe, fe_rxstrc_link);
4513 fsw->fsw_rxstrc_cnt++;
4514 VERIFY(fsw->fsw_rxstrc_cnt != 0);
4515 lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
4516
4517 fsw_reap_sched(fsw);
4518 }
4519
4520 static void
4521 fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head *rxstrc_head,
4522 struct flow_entry *fe)
4523 {
4524 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4525 SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
4526
4527 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4528 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4529 ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
4530 ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
4531 os_atomic_andnot(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed);
4532
4533 TAILQ_REMOVE(rxstrc_head, fe, fe_rxstrc_link);
4534 flow_entry_release(&fe);
4535 }
4536
4537 static void
4538 fsw_rxstrc_remove(struct flow_entry *fe)
4539 {
4540 struct nx_flowswitch *fsw = fe->fe_fsw;
4541
4542 LCK_MTX_ASSERT(&fsw->fsw_rxstrc_lock, LCK_MTX_ASSERT_OWNED);
4543
4544 fsw_rxstrc_remove_internal(&fsw->fsw_rxstrc_head, fe);
4545 VERIFY(fsw->fsw_rxstrc_cnt != 0);
4546 fsw->fsw_rxstrc_cnt--;
4547 }
4548
4549 void
4550 fsw_rxstrc_purge(struct nx_flowswitch *fsw)
4551 {
4552 struct flow_entry *fe, *tfe;
4553
4554 lck_mtx_lock(&fsw->fsw_rxstrc_lock);
4555 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_rxstrc_head, fe_rxstrc_link, tfe) {
4556 fsw_rxstrc_remove(fe);
4557 }
4558 ASSERT(fsw->fsw_rxstrc_cnt == 0);
4559 ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
4560 lck_mtx_unlock(&fsw->fsw_rxstrc_lock);
4561 }
4562
4563 static void
4564 fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
4565 {
4566 struct kern_nexus *nx;
4567 uint64_t now = net_uptime();
4568
4569 nx = fsw->fsw_nx;
4570
4571 /* Walk through all channels and check for Rx stall condition */
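/*
 * The stall heuristic below compares the ring's last Rx enqueue and dequeue
 * timestamps: dequeue_ts < enqueue_ts means the user channel has not
 * dequeued anything since the last enqueue, and once that enqueue is more
 * than fsw_rx_stall_thresh seconds old the channel is reported as stalled
 * and, if fsw_rx_stall_defunct is set, defuncted.
 */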
4572 /* uncrustify doesn't handle C blocks properly */
4573 /* BEGIN IGNORE CODESTYLE */
4574 nx_port_foreach(nx, ^(nexus_port_t nxport) {
4575 struct nexus_adapter *na = nx_port_get_na(nx, nxport);
4576 uint64_t elapsed, enqueue_ts, dequeue_ts;
4577 struct __kern_channel_ring *ring;
4578 struct kern_channel *ch;
4579 struct proc *p;
4580
4581 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4582 return;
4583 }
4584 ch = (struct kern_channel *)na->na_private;
4585 if (ch == NULL) {
4586 return;
4587 }
4588 ring = KR_SINGLE(na->na_rx_rings);
4589 enqueue_ts = ring->ckr_rx_enqueue_ts;
4590 dequeue_ts = ring->ckr_rx_dequeue_ts;
4591 /* Elapsed time since last Rx enqueue */
4592 elapsed = now - enqueue_ts;
4593 if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
4594 p = proc_find(ch->ch_pid);
4595 if (p == NULL) {
4596 return;
4597 }
4598 if (fsw_rx_stall_defunct) {
4599 kern_channel_defunct(p, ch);
4600 }
4601 proc_rele(p);
4602 DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
4603 struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
4604 FSW_STATS_INC(FSW_STATS_RX_STALL);
4605 SK_ERR("Rx stall detected in proc %s(%d) (%s): "
4606 "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
4607 "defunct: %s",
4608 ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
4609 elapsed, now, enqueue_ts, dequeue_ts,
4610 fsw_rx_stall_defunct ? "yes" : "no");
4611 }
4612 });
4613 /* END IGNORE CODESTYLE */
4614 }
4615
4616 void
4617 fsw_reap_sched(struct nx_flowswitch *fsw)
4618 {
4619 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4620 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4621 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4622 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4623 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4624 }
4625 lck_mtx_unlock(&fsw->fsw_reap_lock);
4626 }
4627
4628 __attribute__((noreturn))
4629 static void
4630 fsw_reap_thread_func(void *v, wait_result_t w)
4631 {
4632 #pragma unused(w)
4633 struct nx_flowswitch *__single fsw = v;
4634
4635 ASSERT(fsw->fsw_reap_thread == current_thread());
4636 /*
4637 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
4638 * checks to ensure source contains the null terminator, by doing a
4639 * linear scan of the string.
4640 */
4641 thread_set_thread_name(current_thread(),
4642 __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));
4643
4644 net_update_uptime();
4645
4646 lck_mtx_lock(&fsw->fsw_reap_lock);
4647 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4648 (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
4649 lck_mtx_unlock(&fsw->fsw_reap_lock);
4650 thread_block_parameter(fsw_reap_thread_cont, fsw);
4651 /* NOTREACHED */
4652 __builtin_unreachable();
4653 }
4654
4655 __attribute__((noreturn))
4656 static void
4657 fsw_reap_thread_cont(void *v, wait_result_t wres)
4658 {
4659 struct nx_flowswitch *__single fsw = v;
4660 boolean_t low;
4661 uint64_t t = 0;
4662
4663 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4664
4665 lck_mtx_lock(&fsw->fsw_reap_lock);
4666 if (__improbable(wres == THREAD_INTERRUPTED ||
4667 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4668 goto terminate;
4669 }
4670
4671 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4672 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4673 lck_mtx_unlock(&fsw->fsw_reap_lock);
4674
4675 net_update_uptime();
4676
4677 /* prevent detach from happening while we're here */
4678 if (!fsw_detach_barrier_add(fsw)) {
4679 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4680 t = 0;
4681 } else {
4682 uint32_t fe_nonviable, fe_freed, fe_aborted;
4683 uint32_t fr_freed, fr_resid = 0;
4684 struct ifnet *ifp = fsw->fsw_ifp;
4685 uint64_t i = FSW_REAP_IVAL;
4686 uint64_t now = net_uptime();
4687 uint64_t last;
4688
4689 ASSERT(fsw->fsw_ifp != NULL);
4690
4691 /*
4692 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4693 */
4694 fe_nonviable = fsw_process_deferred(fsw);
4695
4696 /*
4697 * Pass 2: remove any expired lingering flows.
4698 */
4699 fe_freed = fsw_process_linger(fsw, &fe_aborted);
4700
4701 /*
4702 * Pass 3: process any pending Rx steering rule cleanup flows
4703 */
4704 fsw_process_rxstrc(fsw);
4705
4706 /*
4707 * Pass 4: prune idle flow routes.
4708 */
4709 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4710 ifp, &fr_resid);
4711
4712 /*
4713 * Pass 5: prune flow table
4714 *
4715 */
4716 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4717
4718 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4719 "fe_aborted %u fr_freed %u/%u",
4720 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4721 (fe_nonviable + fsw->fsw_pending_nonviable),
4722 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
4723 (fr_freed + fr_resid));
4724
4725 /* see if VM memory level is critical */
4726 low = skmem_lowmem_check();
4727
4728 /*
4729 * If things appear to be idle, we can prune away cached
4730 * object that have fallen out of the working sets (this
4731 * is different than purging). Every once in a while, we
4732 * also purge the caches. Note that this is done across
4733 * all flowswitch instances, and so we limit this to no
4734 * more than once every FSW_REAP_SK_THRES seconds.
4735 */
4736 last = os_atomic_load(&fsw_reap_last, relaxed);
4737 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4738 os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4739 fsw_purge_cache(fsw, low);
4740
4741 /* increase sleep interval if idle */
4742 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4743 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4744 i <<= 3;
4745 }
4746 } else if (last == 0) {
4747 os_atomic_store(&fsw_reap_last, now, release);
4748 }
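/*
 * fsw_reap_last is global, so the cmpxchg above elects exactly one reaper
 * thread across all flowswitch instances to perform the skmem/netns cache
 * reap for each FSW_REAP_SK_THRES window; the losers simply skip the reap
 * on this pass.
 */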
4749
4750 /*
4751 * Additionally, run thru the list of channels and prune
4752 * or purge away cached objects on "idle" channels. This
4753 * check is rate limited to no more than once every
4754 * FSW_DRAIN_CH_THRES seconds.
4755 */
4756 last = fsw->fsw_drain_channel_chk_last;
4757 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4758 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4759 fsw->fsw_flow_mgr->fm_name);
4760
4761 fsw->fsw_drain_channel_chk_last = now;
4762 fsw_drain_channels(fsw, now, low);
4763 } else if (__improbable(last == 0)) {
4764 fsw->fsw_drain_channel_chk_last = now;
4765 }
4766
4767 /*
4768 * Finally, invoke the interface's reap callback to
4769 * tell it to prune or purge away cached objects if
4770 * it is idle. This check is rate limited to no more
4771 * than once every FSW_REAP_IF_THRES seconds.
4772 */
4773 last = fsw->fsw_drain_netif_chk_last;
4774 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4775 ASSERT(fsw->fsw_nifna != NULL);
4776
4777 if (ifp->if_na_ops != NULL &&
4778 ifp->if_na_ops->ni_reap != NULL) {
4779 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4780 fsw->fsw_flow_mgr->fm_name);
4781 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4782 FSW_REAP_IF_THRES, low);
4783 }
4784
4785 fsw->fsw_drain_netif_chk_last = now;
4786 } else if (__improbable(last == 0)) {
4787 fsw->fsw_drain_netif_chk_last = now;
4788 }
4789
4790 /* emit periodic interface stats ktrace */
4791 last = fsw->fsw_reap_last;
4792 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4793 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4794 ifp->if_data.ifi_ibytes * 8,
4795 ifp->if_data.ifi_opackets,
4796 ifp->if_data.ifi_obytes * 8);
4797
4798 fsw->fsw_reap_last = now;
4799 } else if (__improbable(last == 0)) {
4800 fsw->fsw_reap_last = now;
4801 }
4802
4803 /* Check for Rx stall condition every fsw_rx_stall_thresh seconds */
4804 last = fsw->fsw_rx_stall_chk_last;
4805 if (fsw_rx_stall_thresh != 0) {
4806 if (last != 0 && (now - last) >= fsw_rx_stall_thresh) {
4807 fsw_defunct_rx_stall_channel(fsw);
4808 fsw->fsw_rx_stall_chk_last = now;
4809 } else if (__improbable(last == 0)) {
4810 fsw->fsw_rx_stall_chk_last = now;
4811 }
4812 }
4813
4814 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4815 clock_absolutetime_interval_to_deadline(t, &t);
4816 ASSERT(t != 0);
4817
4818 /* allow any pending detach to proceed */
4819 fsw_detach_barrier_remove(fsw);
4820 }
4821
4822 lck_mtx_lock(&fsw->fsw_reap_lock);
4823 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4824 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4825 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
4826 THREAD_UNINT, t);
4827 lck_mtx_unlock(&fsw->fsw_reap_lock);
4828 thread_block_parameter(fsw_reap_thread_cont, fsw);
4829 /* NOTREACHED */
4830 __builtin_unreachable();
4831 } else {
4832 terminate:
4833 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4834 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4835 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4836 /*
4837 * And signal any thread waiting for us to terminate;
4838 * use a wait channel other than fsw_reap_flags here to
4839 * make it more explicit.
4840 */
4841 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4842 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4843 }
4844 lck_mtx_unlock(&fsw->fsw_reap_lock);
4845
4846 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4847
4848 /* for the extra refcnt from kernel_thread_start() */
4849 thread_deallocate(current_thread());
4850 /* this is the end */
4851 thread_terminate(current_thread());
4852 /* NOTREACHED */
4853 __builtin_unreachable();
4854 }
4855
4856 /* must never get here */
4857 VERIFY(0);
4858 /* NOTREACHED */
4859 __builtin_unreachable();
4860 }
4861
4862 static void
4863 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4864 {
4865 struct kern_nexus *nx = fsw->fsw_nx;
4866
4867 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4868 FSW_RLOCK(fsw);
4869
4870 /* uncrustify doesn't handle C blocks properly */
4871 /* BEGIN IGNORE CODESTYLE */
4872 nx_port_foreach(nx, ^(nexus_port_t p) {
4873 boolean_t purge;
4874 struct nexus_adapter *na = nx_port_get_na(nx, p);
4875
4876 if (na == NULL) {
4877 DTRACE_SKYWALK1(ch__drain__na__null, struct nexus_adapter *, na);
4878 return;
4879 }
4880
4881 /*
4882 * If NA is deactivated, no need to proceed further with channel drain.
4883 * Note: fsw_vp_na_activate takes FSW_WLOCK before clearing the
4884 * NAF_ACTIVE flag.
4885 */
4886 if ((na->na_flags & NAF_ACTIVE) == 0) {
4887 DTRACE_SKYWALK1(ch__drain__na__inactive, struct nexus_adapter *, na);
4888 return;
4889 }
4890
4891 if (na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4892 DTRACE_SKYWALK1(ch__drain__na__invalid, struct nexus_adapter *, na);
4893 return;
4894 }
4895
4896 /*
4897 * If some activity happened in the last FSW_DRAIN_CH_THRES
4898 * seconds on this channel, we reclaim memory if the channel
4899 * throughput is less than the reap threshold value.
4900 */
4901 if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4902 struct __kern_channel_ring *__single ring;
4903 channel_ring_stats *stats;
4904 uint64_t bps;
4905
4906 ring = KR_SINGLE(na->na_rx_rings);
4907 stats = &ring->ckr_stats;
4908 bps = stats->crs_bytes_per_second;
4909
4910 if (bps < fsw_channel_reap_thresh) {
4911 purge = FALSE;
4912 na_drain(na, purge);
4913 }
4914 return;
4915 }
4916
4917 /*
4918 * If NA has been inactive for some time (twice the drain
4919 * threshold), we clear the work timestamp to temporarily skip
4920 * this channel until it's active again. Purging cached objects
4921 * can be expensive since we'd need to allocate and construct
4922 * them again, so we do it only when necessary.
4923 */
4924 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4925 na->na_work_ts = 0;
4926 purge = TRUE;
4927 } else {
4928 purge = FALSE;
4929 }
4930
4931 na_drain(na, purge); /* purge/prune caches */
4932 });
4933 /* END IGNORE CODESTYLE */
4934
4935 FSW_RUNLOCK(fsw);
4936 }
4937
4938 static void
4939 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4940 {
4941 #pragma unused(fsw)
4942 uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4943 uint32_t p = fsw_flow_purge_thresh;
4944 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4945
4946 SK_DF(SK_VERB_FLOW, "%s: %s caches",
4947 fsw->fsw_flow_mgr->fm_name,
4948 (purge ? "purge" : "prune"));
4949
4950 skmem_cache_reap_now(sk_fo_cache, purge);
4951 skmem_cache_reap_now(sk_fe_cache, purge);
4952 skmem_cache_reap_now(sk_fab_cache, purge);
4953 skmem_cache_reap_now(flow_route_cache, purge);
4954 skmem_cache_reap_now(flow_stats_cache, purge);
4955 netns_reap_caches(purge);
4956 skmem_reap_caches(purge);
4957
4958 #if CONFIG_MBUF_MCACHE
4959 if (if_is_fsw_transport_netagent_enabled() && purge) {
4960 mbuf_drain(FALSE);
4961 }
4962 #endif /* CONFIG_MBUF_MCACHE */
4963 }
4964
4965 static void
4966 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4967 {
4968 /* When the interface is in low power mode, the flow is nonviable */
4969 if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4970 os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4971 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4972 }
4973 }
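/*
 * Marking fe_want_nonviable (and bumping fsw_pending_nonviable) above only
 * requests the teardown; the actual work is committed by
 * fsw_process_deferred() below, which tears the entry down and has the
 * netagent unassign the nexus for nonviable flows.
 */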
4974
4975 static uint32_t
4976 fsw_process_deferred(struct nx_flowswitch *fsw)
4977 {
4978 struct flow_entry_dead sfed __sk_aligned(8);
4979 struct flow_mgr *fm = fsw->fsw_flow_mgr;
4980 struct flow_entry_dead *fed, *tfed;
4981 LIST_HEAD(, flow_entry_dead) fed_head =
4982 LIST_HEAD_INITIALIZER(fed_head);
4983 uint32_t i, nonviable = 0;
4984 boolean_t lowpowermode = FALSE;
4985
4986 bzero(&sfed, sizeof(sfed));
4987
4988 /*
4989 * The flows become nonviable when the interface
4990 * is in low power mode (edge trigger)
4991 */
4992 if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4993 fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4994 lowpowermode = TRUE;
4995 fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4996 }
4997
4998 /*
4999 * Scan thru the flow entry tree, and commit any pending withdraw or
5000 * nonviable requests. We may need to push stats and/or unassign the
5001 * nexus from NECP, but we cannot do that while holding the locks;
5002 * build a temporary list for those entries.
5003 */
5004 for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
5005 struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
5006 struct flow_owner *fo;
5007
5008 /*
5009 * Grab the lock at all costs when handling low power mode
5010 */
5011 if (__probable(!lowpowermode)) {
5012 if (!FOB_TRY_LOCK(fob)) {
5013 continue;
5014 }
5015 } else {
5016 FOB_LOCK(fob);
5017 }
5018
5019 FOB_LOCK_ASSERT_HELD(fob);
5020 RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
5021 struct flow_entry *fe;
5022
5023 RB_FOREACH(fe, flow_entry_id_tree,
5024 &fo->fo_flow_entry_id_head) {
5025 /* try first as reader; skip if we can't */
5026 if (__improbable(lowpowermode)) {
5027 fsw_flow_handle_low_power(fsw, fe);
5028 }
5029 if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
5030 os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
5031 flow_namespace_half_close(&fe->fe_port_reservation);
5032 }
5033
5034 /* if not withdrawn/nonviable, skip */
5035 if (!fe->fe_want_withdraw &&
5036 !fe->fe_want_nonviable) {
5037 continue;
5038 }
5039 /*
5040 * Here we're holding the lock as writer;
5041 * don't spend too much time as we're
5042 * blocking the data path now.
5043 */
5044 ASSERT(!uuid_is_null(fe->fe_uuid));
5045 /* only need flow UUID and booleans */
5046 uuid_copy(sfed.fed_uuid, fe->fe_uuid);
5047 sfed.fed_want_clonotify =
5048 (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
5049 sfed.fed_want_nonviable = fe->fe_want_nonviable;
5050 flow_entry_teardown(fo, fe);
5051
5052 /* do this outside the flow bucket lock */
5053 fed = flow_entry_dead_alloc(Z_WAITOK);
5054 ASSERT(fed != NULL);
5055 *fed = sfed;
5056 LIST_INSERT_HEAD(&fed_head, fed, fed_link);
5057 }
5058 }
5059 FOB_UNLOCK(fob);
5060 }
5061
5062 /*
5063 * These nonviable flows are no longer useful since we've lost
5064 * the source IP address; in the event the client monitors the
5065 * viability of the flow, explicitly mark it as nonviable so
5066 * that a new flow can be created.
5067 */
5068 LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
5069 LIST_REMOVE(fed, fed_link);
5070 ASSERT(fsw->fsw_agent_session != NULL);
5071
5072 /* if flow is closed early */
5073 if (fed->fed_want_clonotify) {
5074 necp_client_early_close(fed->fed_uuid);
5075 }
5076
5077 /* if nonviable, unassign nexus attributes */
5078 if (fed->fed_want_nonviable) {
5079 (void) netagent_assign_nexus(fsw->fsw_agent_session,
5080 fed->fed_uuid, NULL, 0);
5081 }
5082
5083 flow_entry_dead_free(fed);
5084 ++nonviable;
5085 }
5086 ASSERT(LIST_EMPTY(&fed_head));
5087
5088 return nonviable;
5089 }
5090
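/*
 * Walk the lingering (already torn down) flow entries: generate a TCP
 * RST toward the remote peer where the flow tracker asks for one, free
 * entries whose linger timer has expired, and put the remainder back at
 * the head of the linger list.  Returns the number of entries freed;
 * *abort is set to the number of RSTs generated.
 */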
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = net_uptime();
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock.  Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "fe \"%s\" [RST]",
			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;
		}
		++i;
	}
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}

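/*
 * Drain the pending Rx steering rule cleanup list: detach it into a
 * local list under the lock, then clean up each torn-down flow entry's
 * Rx steering rule and remove it from the local list outside the lock.
 */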
static void
fsw_process_rxstrc(struct nx_flowswitch *fsw)
{
	struct flow_entry_rxstrc_head rxstrc_head =
	    TAILQ_HEAD_INITIALIZER(rxstrc_head);
	struct flow_entry *fe, *tfe;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the rxstrc list into a local list.
	 * This allows us to cleanup Rx steering rules or free the flow entry
	 * outside the lock.
	 */
	lck_mtx_lock(&fsw->fsw_rxstrc_lock);
	TAILQ_CONCAT(&rxstrc_head, &fsw->fsw_rxstrc_head, fe_rxstrc_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head));
	fsw->fsw_rxstrc_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_rxstrc_lock);

	TAILQ_FOREACH_SAFE(fe, &rxstrc_head, fe_rxstrc_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING);
		ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);

		flow_entry_rx_steering_rule_cleanup(fsw, fe);
		fsw_rxstrc_remove_internal(&rxstrc_head, fe);
		fe = NULL;
	}
}

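/*
 * Account an inbound packet against the interface's per-traffic-class
 * input counters (packets and bytes), keyed off the packet's traffic
 * class.
 */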
__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
{
	switch (__packet_get_traffic_class(ph)) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_ibepackets++;
		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_ibkpackets++;
		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ivipackets++;
		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ivopackets++;
		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
		break;
	default:
		break;
	}
}

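/*
 * Account outbound traffic against the interface's per-traffic-class
 * output counters; unlike the input variant, the caller supplies the
 * service class along with aggregate packet and byte counts.
 */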
__attribute__((always_inline))
static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
    uint32_t cnt, uint32_t len)
{
	switch (svc) {
	case PKT_TC_BE:
		ifp->if_tc.ifi_obepackets += cnt;
		ifp->if_tc.ifi_obebytes += len;
		break;
	case PKT_TC_BK:
		ifp->if_tc.ifi_obkpackets += cnt;
		ifp->if_tc.ifi_obkbytes += len;
		break;
	case PKT_TC_VI:
		ifp->if_tc.ifi_ovipackets += cnt;
		ifp->if_tc.ifi_ovibytes += len;
		break;
	case PKT_TC_VO:
		ifp->if_tc.ifi_ovopackets += cnt;
		ifp->if_tc.ifi_ovobytes += len;
		break;
	default:
		break;
	}
}
