1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54 /*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <sys/kdebug.h>
94 #include <sys/sdt.h>
95 #include <net/bpf.h>
96 #include <net/if_ports_used.h>
97 #include <net/pktap.h>
98 #include <net/droptap.h>
99 #include <net/pktsched/pktsched_netem.h>
100 #include <netinet/tcp.h>
101 #include <netinet/udp.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/in_var.h>
105
106 extern kern_return_t thread_terminate(thread_t);
107
108 #define FSW_ZONE_MAX 256
109 #define FSW_ZONE_NAME "skywalk.nx.fsw"
110
111 static uint64_t fsw_reap_last __sk_aligned(8);
112 static uint64_t fsw_want_purge __sk_aligned(8);
113
114 #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
115 static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
116
117 #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
118 static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
119
120 #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
121 static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
122
#define NX_FSW_FRIB_HASHSZ 13 /* some small prime */
124 static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
125
126 #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
127 static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
128
129 #define NX_FSW_RX_STALL_THRES 10 /* seconds */
130 static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;
131
132 #define NX_FSW_RX_STALL_DEFUNCT 1 /* defunct Rx-stalled channel (0 = disable) */
133 static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;
134
135 #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
136 static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
137
138 #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
139 #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
140 #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
141 #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
142 #define FSW_IFSTATS_THRES 1
143
#define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
145 uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
146
147 #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
148
149 uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
150 uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
151 uint32_t fsw_gso_batch = 8;
152 #if (DEVELOPMENT || DEBUG)
153 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
154 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
155 "flowswitch Rx batch size");
156 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
157 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
158 "flowswitch Tx batch size");
159 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
160 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
161 "flowswitch GSO batch size");
162 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
163 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
164 "flowswitch channel reap threshold throughput (bytes/sec)");
165 #endif /* !DEVELOPMENT && !DEBUG */
166
167 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
168 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
169 "flowswitch RX aggregation for tcp flows (enable/disable)");
170 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
171 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
172 "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
173 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
174 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
175 "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
176
177 /*
178 * IP reassembly
179 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
180 * enable/disable the reassembly routine regardless of whether the
181 * transport netagent is enabled or not.
182 *
183 * 'fsw_ip_reass' is a tri-state:
184 * 0 means force IP reassembly off
185 * 1 means force IP reassembly on
186 * 2 means don't force the value, use what's appropriate for this flowswitch
187 */
188 #define FSW_IP_REASS_FORCE_OFF 0
189 #define FSW_IP_REASS_FORCE_ON 1
190 #define FSW_IP_REASS_NO_FORCE 2
191
192 uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
193
194 static int
195 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
196 {
197 #pragma unused(oidp, arg1, arg2)
198 unsigned int new_value;
199 int changed;
200 int error;
201
202 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
203 &new_value, &changed);
204 if (error == 0 && changed != 0) {
205 if (new_value > FSW_IP_REASS_NO_FORCE) {
206 return EINVAL;
207 }
208 fsw_ip_reass = new_value;
209 }
210 return error;
211 }
212
213 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
214 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
215 0, 0, fsw_ip_reass_sysctl, "IU",
216 "adjust flowswitch IP reassembly");
217
218 #if (DEVELOPMENT || DEBUG)
219 static uint64_t _fsw_inject_error = 0;
220 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
221 _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
222 &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
223
224 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
225 if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
226 SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
227 if ((_f) != NULL) \
228 (_f)(__VA_ARGS__); \
229 } \
230 } while (0)
231
232 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
233 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
234 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
235 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
236 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
238 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
239 flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
240 &fsw_flow_route_id_buckets, 0, "");
241 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
242 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
243 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
244 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
245 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
246 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
247 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
249 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
250 CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
251 #else
252 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
253 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
254 #endif /* !DEVELOPMENT && !DEBUG */
255
256 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
257 struct flow_entry *);
258 static void fsw_reap_thread_func(void *, wait_result_t);
259 static void fsw_reap_thread_cont(void *, wait_result_t);
260 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
261 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
262 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
263 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
264
265 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
266 struct __kern_packet *);
267
268 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
269 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
270 uint32_t, uint32_t);
271
272 static int __fsw_dp_inited = 0;
273
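/*
 * One-time setup of the flowswitch datapath: initialize the flow manager
 * and flow module. fsw_dp_uninit() below undoes this.
 */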
274 int
fsw_dp_init(void)
276 {
277 _CASSERT(FSW_VP_DEV == 0);
278 _CASSERT(FSW_VP_HOST == 1);
279 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
280 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
281
282 ASSERT(!__fsw_dp_inited);
283
284 flow_mgr_init();
285 flow_init();
286
287 __fsw_dp_inited = 1;
288
289 return 0;
290 }
291
292 void
fsw_dp_uninit(void)
294 {
295 if (__fsw_dp_inited) {
296 flow_fini();
297 flow_mgr_fini();
298
299 __fsw_dp_inited = 0;
300 }
301 }
302
303 static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
305 {
306 pp_free_pktq(pktq);
307 }
308
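/*
 * Drop an entire packet queue: bump the flowswitch drop statistics and,
 * when a drop tap is active, report each packet to droptap before freeing
 * the queue back to its packet pool.
 */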
309 #define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do { \
310 uint32_t _len = KPKTQ_LEN(pktq); \
311 if (KPKTQ_EMPTY(pktq)) { \
312 ASSERT(_len == 0); \
313 return; \
314 } \
315 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
316 FSW_STATS_ADD(FSW_STATS_DROP, _len); \
317 DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
318 if (__probable(droptap_total_tap_count == 0)) { \
319 dp_free_pktq(fsw, pktq); \
320 break; \
321 } \
322 drop_func_t dropfunc; \
323 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
324 struct __kern_packet *kpkt = KPKTQ_FIRST(pktq); \
325 struct __kern_packet *next_pkt; \
326 for (; kpkt != NULL; kpkt = next_pkt) { \
327 next_pkt = kpkt->pkt_nextpkt; \
328 dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags, \
329 fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, \
330 0, 0); \
331 } \
332 dp_free_pktq(fsw, pktq); \
333 } while (0)
334
335 #define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do { \
336 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
337 FSW_STATS_ADD(FSW_STATS_DROP, 1); \
338 if (__probable(droptap_total_tap_count == 0)) { \
339 pp_free_packet_single(pkt); \
340 break; \
341 } \
342 drop_func_t dropfunc; \
343 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
344 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
345 fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
346 pp_free_packet_single(pkt); \
347 } while (0)
348
349 #define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do { \
350 if (__probable(droptap_total_tap_count == 0)) { \
351 pp_free_packet_chain(pkt, NULL); \
352 break; \
353 } \
354 drop_func_t dropfunc; \
355 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
356 struct __kern_packet *next_pkt; \
357 for (; pkt != NULL; pkt = next_pkt) { \
358 next_pkt = pkt->pkt_nextpkt; \
359 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
360 NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL, \
361 0, 0); \
362 } \
363 pp_free_packet_chain(pkt, NULL); \
364 } while (0)
365
366
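/*
 * Mirror a flow's packets to pktap. Host-port flows are normally tapped
 * in the DLIL input path, so they are only tapped here early for inbound
 * TCP, before RX aggregation coalesces them.
 */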
367 SK_NO_INLINE_ATTRIBUTE
368 void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
    bool input)
371 {
372 pid_t pid;
373 char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
374 const char *__null_terminated proc_name = NULL;
375 pid_t epid;
376 char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
377 const char *__null_terminated eproc_name = NULL;
378 sa_family_t af;
379 bool tap_early = false;
380 struct __kern_packet *pkt;
381
382 ASSERT(fe != NULL);
383 ASSERT(fsw->fsw_ifp != NULL);
384
385 if (fe->fe_nx_port == FSW_VP_HOST) {
386 /* allow packets to be tapped before aggregation happens */
387 tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
388 if (!tap_early) {
389 /* all other traffic will be tapped in the dlil input path */
390 return;
391 }
392 }
393 if (fe->fe_key.fk_ipver == IPVERSION) {
394 af = AF_INET;
395 } else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
396 af = AF_INET6;
397 } else {
398 return;
399 }
400
401 pid = fe->fe_pid;
402 if (fe->fe_proc_name[0] != '\0') {
403 proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
404 fe->fe_proc_name, sizeof(fe->fe_proc_name));
405 }
406 epid = fe->fe_epid;
407 if (fe->fe_eproc_name[0] != '\0') {
408 eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
409 fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
410 }
411 if (input) {
412 KPKTQ_FOREACH(pkt, pktq) {
413 pktap_input_packet(fsw->fsw_ifp, af,
414 fsw->fsw_ifp_dlt, pid, proc_name, epid,
415 eproc_name, SK_PKT2PH(pkt), NULL, 0,
416 IPPROTO_TCP, fe->fe_flowid,
417 tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
418 }
419 } else {
420 KPKTQ_FOREACH(pkt, pktq) {
421 pktap_output_packet(fsw->fsw_ifp, af,
422 fsw->fsw_ifp_dlt, pid, proc_name, epid,
423 eproc_name, SK_PKT2PH(pkt), NULL, 0,
424 0, 0, PTH_FLAG_NEXUS_CHAN);
425 }
426 }
427 }
428
429 #if (DEVELOPMENT || DEBUG)
430 static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
433 {
434 static boolean_t _err35_flag_modified = FALSE;
435
436 switch (step) {
437 case 1:
438 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
439 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
440 fr->fr_flags &= ~FLOWRTF_RESOLVED;
441 _err35_flag_modified = TRUE;
442 }
443 break;
444
445 case 2:
446 if (!_err35_flag_modified) {
447 return;
448 }
449 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
450 m_freem(pkt->pkt_mbuf);
451 pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
452 pkt->pkt_mbuf = NULL;
453 }
454 *ret = EJUSTRETURN;
455 fr->fr_flags |= FLOWRTF_RESOLVED;
456 _err35_flag_modified = FALSE;
457 break;
458
459 default:
460 VERIFY(0);
461 /* not reached */
462 }
463 }
464
465 static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
467 {
468 static boolean_t _err36_flag_modified = FALSE;
469
470 switch (step) {
471 case 1:
472 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
473 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
474 fr->fr_flags &= ~FLOWRTF_RESOLVED;
475 _err36_flag_modified = TRUE;
476 }
477 break;
478
479 case 2:
480 if (!_err36_flag_modified) {
481 return;
482 }
483 *ret = ENETUNREACH;
484 fr->fr_flags |= FLOWRTF_RESOLVED;
485 _err36_flag_modified = FALSE;
486 break;
487
488 default:
489 VERIFY(0);
490 /* not reached */
491 }
492 }
493 #else /* !DEVELOPMENT && !DEBUG */
494 #define _fsw_error35_handler(...)
495 #define _fsw_error36_handler(...)
496 #endif /* DEVELOPMENT || DEBUG */
497
/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet from
 * a compat netif whose attached mbuf is larger than the MTU due to LRO.
 */
505 static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
509 {
510 uint32_t tlen = 0;
511 uint32_t splen = spkt->pkt_length - skip_l2hlen;
512
513 if (l2hlen != 0) {
514 VERIFY(skip_l2hlen == 0);
515 tlen += l2hlen;
516 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
517 splen -= ETHER_CRC_LEN;
518 }
519
520 tlen += splen;
521 *copy_len = splen;
522
523 return tlen <= ((__packet_get_buflet_count(dph) *
524 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
525 headroom);
526 }
527
528 #if SK_LOG
529 /* Hoisted out of line to reduce kernel stack footprint */
530 SK_LOG_ATTRIBUTE
531 static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
534 {
535 uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
536 ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
537 SK_VERB_COPY_MBUF : SK_VERB_COPY));
538 char *daddr;
539 uint32_t pkt_len;
540
541 MD_BUFLET_ADDR_ABS(dpkt, daddr);
542 pkt_len = __packet_get_real_data_length(dpkt);
543 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
544 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
545 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
546 (uint32_t)dpkt->pkt_l2_len);
547 SK_DF(logflags | SK_VERB_DUMP, "%s",
548 sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
549 }
550 #else
551 #define copy_packet_from_dev_log(...)
552 #endif /* SK_LOG */
553
554
555 static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
558 {
/*
 * The source and destination nexus don't share the packet pool.
 * The sync operation here is to:
 * - alloc a packet for the rx (dst) ring
 * - copy data/metadata from the src packet to the dst packet
 * - attach the alloc'd packet to the rx (dst) ring
 */
566 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
567 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
568 kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
569 METADATA_SUBTYPE(spkt));
570 boolean_t do_cksum_rx;
571 uint16_t skip_l2h_len = spkt->pkt_l2_len;
572 uint16_t iphlen;
573 uint32_t dlen;
574 int err;
575
576 if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
577 &dlen))) {
578 SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
579 PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
580 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
581 return EINVAL;
582 }
583
584 /* Copy packet metadata */
585 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
586 _PKT_COPY(spkt, dpkt);
587 ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
588 PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
589 ASSERT(dpkt->pkt_mbuf == NULL);
590
591 dpkt->pkt_headroom = 0;
592 dpkt->pkt_l2_len = 0;
593
594 /* don't include IP header from partial sum */
595 if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
596 iphlen = spkt->pkt_flow_ip_hlen;
597 do_cksum_rx = sk_cksum_rx;
598 } else {
599 iphlen = 0;
600 do_cksum_rx = FALSE;
601 }
602
603 /* Copy packet payload */
604 if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
605 (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
606 FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
/*
 * Source packet has truncated contents (just enough for
 * the classifier) of an mbuf from the compat driver; copy
 * the entire mbuf contents to the destination packet.
 */
612 m_adj(spkt->pkt_mbuf, skip_l2h_len);
613 ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
614 fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
615 spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
616 } else {
617 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
618 /*
619 * Source packet has full contents, either from an mbuf
620 * that came up from the compat driver, or because it
621 * originated on the native driver; copy to destination.
622 */
623 fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
624 (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
625 iphlen, 0, FALSE);
626 }
627
628 #if DEBUG || DEVELOPMENT
629 if (__improbable(pkt_trailers > 0)) {
630 dlen += pkt_add_trailers(dph, dlen, iphlen);
631 }
632 #endif /* DEBUG || DEVELOPMENT */
633
634 /* Finalize and attach packet to Rx ring */
635 METADATA_ADJUST_LEN(dpkt, 0, 0);
636 err = __packet_finalize(dph);
637 VERIFY(err == 0);
638
639 copy_packet_from_dev_log(spkt, dpkt, kernproc);
640
641 if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
642 ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
643 mbuf_freem(spkt->pkt_mbuf);
644 KPKT_CLEAR_MBUF_DATA(spkt);
645 } else {
646 fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
647 }
648
649 if (__probable(do_cksum_rx != 0)) {
650 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
651 }
652
653 return 0;
654 }
655
656 SK_NO_INLINE_ATTRIBUTE
657 static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
659 {
660 char *pkt_buf;
661 void *l3_hdr;
662 uint16_t nfrags, tlen;
663 int err = 0;
664
665 switch (fsw_ip_reass) {
666 case FSW_IP_REASS_FORCE_OFF:
667 return pkt;
668 case FSW_IP_REASS_FORCE_ON:
669 break;
670 default:
671 if (!FSW_NETAGENT_ENABLED(fsw) ||
672 flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
673 return pkt;
674 }
675 break;
676 }
677
678 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
679 l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
680
681 ASSERT(fsw->fsw_ipfm != NULL);
682 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
683
684 if (pkt->pkt_flow_ip_ver == IPVERSION) {
685 struct ip *ip = l3_hdr;
686 err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
687 } else {
688 struct ip6_hdr *ip6_hdr = l3_hdr;
689 struct ip6_frag *__single ip6_frag =
690 (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));
691
692 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
693 /* we only handle frag header immediately after v6 header */
694 err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
695 &nfrags, &tlen);
696 }
697 if (__improbable(err != 0)) {
698 /* if we get a bad fragment, free it */
699 pp_free_packet_single(pkt);
700 pkt = NULL;
701 } else {
702 ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
703 }
704
705 return pkt;
706 }
707
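/*
 * Run IP fragment reassembly on an RX packet when forced on via sysctl or
 * when the flowswitch has active flows. Returns the (possibly reassembled)
 * packet, or NULL if the fragment was queued or found to be bad.
 */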
708 SK_NO_INLINE_ATTRIBUTE
709 static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
711 {
712 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
713 uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
714 kern_packet_t ph = SK_PTR_ENCODE(pkt,
715 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
716 /*
717 * This is the case when the packet is coming in from
718 * compat-netif. This packet only has valid metadata
719 * and an attached mbuf. We need to copy enough data
720 * from the mbuf to the packet buffer for the
721 * classifier. Compat netif packet pool is configured
722 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
723 * which is just enough to hold the protocol headers
724 * for the flowswitch classifier.
725 */
726
727 pkt->pkt_headroom = 0;
728 METADATA_ADJUST_LEN(pkt, 0, 0);
/*
 * Copy the initial 128 bytes of the packet for
 * classification:
 * Ethernet(14) + IPv6 header(40) +
 * IPv6 fragment header(8) +
 * TCP header with options(60).
 */
736 fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
737 pkt->pkt_headroom, pkt->pkt_mbuf, 0,
738 MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
739 FALSE, 0);
740
741 int err = __packet_finalize_with_mbuf(pkt);
742 VERIFY(err == 0);
743 }
744
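/*
 * Prepare an RX packet for classification: clear any stale classification
 * flag and, for compat-netif packets, copy the leading protocol headers
 * from the attached mbuf into the packet buffer.
 */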
745 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
747 {
748 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
749
750 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
751 rx_prepare_packet_mbuf(fsw, pkt);
752 }
753
754 return pkt;
755 }
756
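/*
 * Find the flow entry for a classified packet. The previous entry is tried
 * first as a 5-tuple shortcut; otherwise the flow manager is searched, and
 * parent/child flows are resolved to the matching child entry. Returns a
 * retained flow entry, or NULL if no flow matches.
 */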
757 static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
760 {
761 struct flow_key key __sk_aligned(16);
762 struct flow_entry *__single fe = NULL;
763
764 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
765 flow_pkt2key(pkt, input, &key);
766
767 if (__probable(prev_fe != NULL &&
768 prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
769 uint16_t saved_mask = key.fk_mask;
770 key.fk_mask = FKMASK_5TUPLE;
771 if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
772 flow_entry_retain(prev_fe);
773 fe = prev_fe;
774 } else {
775 key.fk_mask = saved_mask;
776 }
777 }
778
779 top:
780 if (__improbable(fe == NULL)) {
781 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
782 }
783
784 if (__improbable(fe != NULL &&
785 (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
786 /* Rx */
787 if (input) {
788 if (fe->fe_flags & FLOWENTF_PARENT) {
789 struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
790 if (child_fe != NULL) {
791 flow_entry_release(&fe);
792 fe = child_fe;
793 }
794 } else {
795 if (!rx_flow_demux_match(fsw, fe, pkt)) {
796 flow_entry_release(&fe);
797 fe = NULL;
798 goto top;
799 }
800 }
801 } else {
802 /* Tx */
803 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
804 if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
805 struct flow_entry *__single parent_fe = fe;
806 fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
807 flow_entry_release(&parent_fe);
808 } else {
809 flow_entry_release(&fe);
810 fe = NULL;
811 goto top;
812 }
813 }
814 }
815 }
816
817 SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
818 SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
819 "%s %s %s \"%s\" fe 0x%llx",
820 input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
821 sk_proc_name_address(current_proc()),
822 fk_as_string(&key, fkbuf, sizeof(fkbuf)),
823 SK_KVA(fe));
824
825 return fe;
826 }
827
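/*
 * Check whether a packet matched by a listener (2-tuple) flow is really
 * destined to this host: accept loopback, link-local, multicast and
 * broadcast destinations as well as addresses assigned to local
 * interfaces; anything else is likely being forwarded elsewhere.
 */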
828 SK_NO_INLINE_ATTRIBUTE
829 static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
831 {
832 struct nx_flowswitch *fsw = fe->fe_fsw;
833 struct ifnet *ifp = fsw->fsw_ifp;
834 struct in_ifaddr *ia = NULL;
835 struct in_ifaddr *best_ia = NULL;
836 struct in6_ifaddr *ia6 = NULL;
837 struct in6_ifaddr *best_ia6 = NULL;
838 struct ifnet *match_ifp = NULL;
839 struct __flow *flow = pkt->pkt_flow;
840 bool result = false;
841
842 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
843
844 if (flow->flow_ip_ver == IPVERSION) {
845 if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
846 IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
847 IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
848 IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
849 IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
850 IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
851 INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
852 result = true;
853 goto done;
854 }
855
856 /*
857 * Check for a match in the hash bucket.
858 */
859 lck_rw_lock_shared(&in_ifaddr_rwlock);
860 TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
861 if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
862 best_ia = ia;
863 match_ifp = ia->ia_ifp;
864
865 if (match_ifp == ifp) {
866 break;
867 }
/*
 * Continue the loop in case there's an exact match with
 * another interface.
 */
872 }
873 }
874
875 if (best_ia != NULL) {
876 if (match_ifp != ifp && ipforwarding == 0 &&
877 (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
878 match_ifp->if_family == IFNET_FAMILY_UTUN)) {
879 /*
880 * Drop when interface address check is strict and forwarding
881 * is disabled
882 */
883 } else {
884 lck_rw_done(&in_ifaddr_rwlock);
885 result = true;
886 goto done;
887 }
888 }
889 lck_rw_done(&in_ifaddr_rwlock);
890
891 if (ifp->if_flags & IFF_BROADCAST) {
892 /*
893 * Check for broadcast addresses.
894 *
895 * Only accept broadcast packets that arrive via the matching
896 * interface. Reception of forwarded directed broadcasts would be
897 * handled via ip_forward() and ether_frameout() with the loopback
898 * into the stack for SIMPLEX interfaces handled by ether_frameout().
899 */
900 struct ifaddr *ifa;
901
902 ifnet_lock_shared(ifp);
903 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
904 if (ifa->ifa_addr->sa_family != AF_INET) {
905 continue;
906 }
907 ia = ifatoia(ifa);
908 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
909 ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
910 ifnet_lock_done(ifp);
911 result = true;
912 goto done;
913 }
914 }
915 ifnet_lock_done(ifp);
916 }
917 } else {
918 struct in6_ifaddrhashhead *ia6_hash_head;
919
920 if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
921 IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
922 IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
923 result = true;
924 goto done;
925 }
926
927 /*
928 * Check for exact addresses in the hash bucket.
929 */
930 lck_rw_lock_shared(&in6_ifaddr_rwlock);
931 /* XXX -fbounds-safety: external dependency on ip6_input.c */
932 ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
933 in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
934 ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];
935
936 TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
937 if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
938 ia6->ia_ifp->if_index, ifp->if_index)) {
939 if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
940 continue;
941 }
942 best_ia6 = ia6;
943 if (ia6->ia_ifp == ifp) {
944 break;
945 }
/*
 * Continue the loop in case there's an exact match with
 * another interface.
 */
950 }
951 }
952 if (best_ia6 != NULL) {
953 if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
954 (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
955 best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
956 /*
957 * Drop when interface address check is strict and forwarding
958 * is disabled
959 */
960 } else {
961 lck_rw_done(&in6_ifaddr_rwlock);
962 result = true;
963 goto done;
964 }
965 }
966 lck_rw_done(&in6_ifaddr_rwlock);
967 }
968
/*
 * In forwarding mode, if the destination address
 * of the packet does not match any interface
 * address, it may be destined to the client device.
 */
974 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
975 "Rx flow does not match interface address");
976 done:
977 return result;
978 }
979
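/*
 * RX-side flow lookup: find the flow entry for the packet and reject
 * matches that are listener flows not addressed to this host, or flows
 * that have already been torn down.
 */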
980 static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
983 {
984 struct flow_entry *__single fe;
985
986 fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
987 _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
988 if (fe == NULL) {
989 FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
990 return NULL;
991 }
992
993 if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
994 fe->fe_flags & FLOWENTF_LISTENER) &&
995 !pkt_is_for_listener(fe, pkt)) {
996 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
997 flow_entry_release(&fe);
998 return NULL;
999 }
1000
1001 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
1002 FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
1003 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1004 "Rx flow torn down");
1005 flow_entry_release(&fe);
1006 fe = NULL;
1007 }
1008
1009 return fe;
1010 }
1011
1012 static inline void
rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt, uint64_t tid)
1015 {
1016 /*
1017 * Among threads working on the same fe, the first thread that reaches here
1018 * will be responsible for processing all the packets until a point when
1019 * it does not see new packets in fe_rx_pktq. Other threads only
1020 * enqueue their packets but do not add the flow entry to their flow entry list.
1021 */
1022 lck_mtx_lock(&fe->fe_rx_pktq_lock);
1023
1024 if (fe->fe_rx_worker_tid == 0) {
1025 fe->fe_rx_worker_tid = tid;
1026 } else if (__improbable(fe->fe_rx_worker_tid != tid)) {
1027 STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
1028 }
1029
1030 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1031 fe->fe_rx_frag_count++;
1032 }
1033
1034 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
1035 /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
1036 if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
1037 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
1038 TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
1039 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1040 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1041 } else {
1042 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1043 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1044 flow_entry_release(&fe);
1045 }
1046 }
1047
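/*
 * Queue a TX packet onto its flow entry's pending queue; the entry is
 * added to the caller's batch list only when its queue transitions from
 * empty, so each flow entry appears in the list at most once per batch.
 */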
1048 static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
1051 {
1052 /* record frag continuation */
1053 if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
1054 ASSERT(pkt->pkt_flow_ip_is_frag);
1055 fe->fe_tx_is_cont_frag = true;
1056 fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
1057 } else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
1058 fe->fe_tx_is_cont_frag = false;
1059 fe->fe_tx_frag_id = 0;
1060 }
1061
1062 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
1063 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
1064 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
1065 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1066 } else {
1067 ASSERT(!TAILQ_EMPTY(fes));
1068 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1069 flow_entry_release(&fe);
1070 }
1071 }
1072
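/*
 * Detach up to n_pkts_max packets from the device RX ring into a local
 * packet queue, freeing any packet that is marked dropped or has zero
 * length, and advance the ring's kernel head/tail indices.
 */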
1073 static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
1076 {
1077 uint32_t n_pkts = 0;
1078 slot_idx_t idx, idx_end;
1079 idx = r->ckr_khead;
1080 idx_end = r->ckr_rhead;
1081
1082 ASSERT(KPKTQ_EMPTY(pktq));
1083 *n_bytes = 0;
1084 for (; n_pkts < n_pkts_max && idx != idx_end;
1085 idx = SLOT_NEXT(idx, r->ckr_lim)) {
1086 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1087 struct __kern_packet *pkt = ksd->sd_pkt;
1088
1089 ASSERT(pkt->pkt_nextpkt == NULL);
1090 KR_SLOT_DETACH_METADATA(r, ksd);
1091
1092 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1093 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1094 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1095 || (pkt->pkt_length == 0)) {
1096 FSW_STATS_INC(FSW_STATS_DROP);
1097 pp_free_packet_single(pkt);
1098 continue;
1099 }
1100 n_pkts++;
1101 *n_bytes += pkt->pkt_length;
1102
1103 KPKTQ_ENQUEUE(pktq, pkt);
1104 }
1105 r->ckr_khead = idx;
1106 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1107 }
1108
1109 /*
1110 * This is only for estimating how many packets each GSO packet will need.
1111 * The number does not need to be exact because any leftover packets allocated
1112 * will be freed.
1113 */
1114 static uint32_t
estimate_gso_pkts(struct __kern_packet *pkt)
1116 {
1117 packet_tso_flags_t tso_flags;
1118 uint16_t mss;
1119 uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1120
1121 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1122 mss = pkt->pkt_proto_seg_sz;
1123
1124 if (tso_flags == PACKET_TSO_IPV4) {
1125 total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1126 } else if (tso_flags == PACKET_TSO_IPV6) {
1127 total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1128 }
1129 if (total_hlen != 0 && mss != 0) {
1130 total_len = pkt->pkt_length;
1131 n_pkts = (uint32_t)
1132 (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1133 }
1134 DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1135 uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1136 uint32_t, n_pkts);
1137 return n_pkts;
1138 }
1139
1140 /*
1141 * This function retrieves a chain of packets of the same type only
1142 * (GSO or non-GSO).
1143 */
1144 static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, uint32_t n_pkts_max,
    struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1148 {
1149 uint32_t n_pkts = 0;
1150 slot_idx_t idx, idx_end;
1151 idx = r->ckr_khead;
1152 idx_end = r->ckr_rhead;
1153 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1154 boolean_t gso_enabled, gso_required;
1155 uint32_t gso_pkts;
1156
1157 gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1158 ASSERT(KPKTQ_EMPTY(pktq));
1159 *n_bytes = 0;
1160 for (; n_pkts < n_pkts_max &&
1161 (!gso_enabled || fsw_gso_batch == 0 ||
1162 *gso_pkts_estimate < fsw_gso_batch) &&
1163 idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1164 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1165 struct __kern_packet *pkt = ksd->sd_pkt;
1166
1167 ASSERT(pkt->pkt_nextpkt == NULL);
1168
1169 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1170 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1171 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1172 || (pkt->pkt_length == 0)) {
1173 KR_SLOT_DETACH_METADATA(r, ksd);
1174 FSW_STATS_INC(FSW_STATS_DROP);
1175 pp_free_packet_single(pkt);
1176 continue;
1177 }
1178 if (gso_enabled) {
1179 gso_pkts = estimate_gso_pkts(pkt);
1180
1181 /*
1182 * We use the first packet to determine what
1183 * type the subsequent ones need to be (GSO or
1184 * non-GSO).
1185 */
1186 if (n_pkts == 0) {
1187 gso_required = (gso_pkts != 0);
1188 } else {
1189 if (gso_required != (gso_pkts != 0)) {
1190 break;
1191 }
1192 }
1193 *gso_pkts_estimate += gso_pkts;
1194 }
1195 KR_SLOT_DETACH_METADATA(r, ksd);
1196 if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1197 __packet_set_tx_nx_port(SK_PKT2PH(pkt),
1198 vpna->vpna_nx_port, vpna->vpna_gencnt);
1199 }
1200 n_pkts++;
1201 *n_bytes += pkt->pkt_length;
1202 KPKTQ_ENQUEUE(pktq, pkt);
1203 }
1204 r->ckr_khead = idx;
1205 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1206 DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1207 ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1208 uint32_t, *gso_pkts_estimate);
1209 }
1210
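/*
 * Attach as many packets from the queue as there are free slots in the
 * destination channel RX ring, publish the new tail, and notify the
 * channel. Packets that do not fit remain in the queue for the caller.
 */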
1211 static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
1214 {
1215 #pragma unused(fsw)
1216 struct __kern_packet *pkt;
1217 struct __kern_quantum *kqum;
1218 uint32_t kr_space_avail = 0;
1219 uint32_t n, n_pkts = 0, n_bytes = 0;
1220 slot_idx_t idx = 0, idx_start = 0, idx_end = 0;
1221
1222 kr_enter(r, TRUE);
1223
1224 idx_start = r->ckr_ktail;
1225 kr_space_avail = kr_available_slots_rxring(r);
1226 _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
1227 n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
1228 _FSW_INJECT_ERROR(41, n, 0, null_func);
1229 idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1230
1231 idx = idx_start;
1232 while (idx != idx_end) {
1233 KPKTQ_DEQUEUE(pktq, pkt);
1234 kqum = SK_PTR_ADDR_KQUM(pkt);
1235 kqum->qum_qflags |= QUM_F_FINALIZED;
1236 n_pkts++;
1237 n_bytes += pkt->pkt_length;
1238 KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1239 if (__improbable(pkt->pkt_trace_id != 0)) {
1240 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
1241 KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
1242 }
1243 idx = SLOT_NEXT(idx, r->ckr_lim);
1244 }
1245
1246 kr_update_stats(r, n_pkts, n_bytes);
1247
1248 /*
1249 * ensure slot attachments are visible before updating the
1250 * tail pointer
1251 */
1252 os_atomic_thread_fence(seq_cst);
1253
1254 r->ckr_ktail = idx_end;
1255
1256 kr_exit(r);
1257
1258 r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);
1259
1260 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
1261 r->ckr_name, n_pkts);
1262 }
1263
1264 static void
pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1266 {
1267 ASSERT(KPKTQ_EMPTY(pktq));
1268
1269 for (uint32_t i = 0; i < n_pkts; i++) {
1270 struct __kern_packet *__single pkt = pkts[i];
1271 ASSERT(pkt->pkt_nextpkt == NULL);
1272 KPKTQ_ENQUEUE(pktq, pkt);
1273 }
1274 }
1275
1276 /*
1277 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1278 */
1279 SK_NO_INLINE_ATTRIBUTE
1280 static void
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1283 {
1284 uint32_t tot_cnt;
1285 unsigned int num_segs = 1;
1286 struct mbuf *__single mhead, *__single head = NULL;
1287 struct mbuf *__single tail = NULL, **__single tailp = &head;
1288 uint32_t mhead_cnt, mhead_bufsize;
1289 uint32_t mhead_waste = 0;
1290 uint32_t mcnt = 0, mbytes = 0;
1291 uint32_t largest, max_pkt_len;
1292 struct __kern_packet *__single pkt;
1293 struct kern_pbufpool *pp;
1294
1295 tot_cnt = KPKTQ_LEN(pktq);
1296 ASSERT(tot_cnt > 0);
1297 mhead_cnt = tot_cnt;
1298
/*
 * Opportunistically batch-allocate the mbufs based on the largest
 * packet size we've seen in the recent past. Note that we reset
 * fsw_rx_largest_size below if we notice that we're under-utilizing the
 * allocated buffers (thus disabling this batch allocation).
 */
1305 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1306 if (__probable(largest != 0)) {
1307 if (largest <= MCLBYTES) {
1308 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1309 &num_segs, M_NOWAIT, 1, 0);
1310 mhead_bufsize = MCLBYTES;
1311 } else if (largest <= MBIGCLBYTES) {
1312 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1313 &num_segs, M_NOWAIT, 1, 0);
1314 mhead_bufsize = MBIGCLBYTES;
1315 } else if (largest <= M16KCLBYTES) {
1316 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1317 &num_segs, M_NOWAIT, 1, 0);
1318 mhead_bufsize = M16KCLBYTES;
1319 } else if (largest <= M16KCLBYTES * 2) {
1320 num_segs = 2;
1321 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1322 &num_segs, M_NOWAIT, 1, 0);
1323 mhead_bufsize = M16KCLBYTES * 2;
1324 } else {
1325 mhead = NULL;
1326 mhead_bufsize = mhead_cnt = 0;
1327 }
1328 } else {
1329 mhead = NULL;
1330 mhead_bufsize = mhead_cnt = 0;
1331 }
1332 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1333 uint32_t, mhead_cnt, uint32_t, tot_cnt);
1334
1335 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1336 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1337
1338 KPKTQ_FOREACH(pkt, pktq) {
1339 uint32_t tot_len, len;
1340 uint16_t pad, llhlen, iphlen;
1341 boolean_t do_cksum_rx;
1342 struct mbuf *__single m;
1343 int error;
1344
1345 llhlen = pkt->pkt_l2_len;
1346 len = pkt->pkt_length;
1347 if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
1348 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1349 struct __kern_packet *, pkt);
1350 FSW_STATS_INC(FSW_STATS_DROP);
1351 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1352 continue;
1353 }
1354 /* begin payload on 32-bit boundary; figure out the padding */
1355 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1356 tot_len = pad + len;
1357
1358 /* remember largest packet size */
1359 if (__improbable(largest < tot_len)) {
1360 largest = MAX(tot_len, MCLBYTES);
1361 }
1362
/*
 * If the above batch allocation returned only partial
 * success (or was skipped), allocate an mbuf for this
 * packet individually here.
 */
1367 m = mhead;
1368 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1369 ASSERT(mhead != NULL || mhead_cnt == 0);
1370 num_segs = 1;
1371 if (tot_len > M16KCLBYTES) {
1372 num_segs = 0;
1373 }
1374 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1375 &num_segs, &m)) != 0) {
1376 DTRACE_SKYWALK2(bad__len,
1377 struct nx_flowswitch *, fsw,
1378 struct __kern_packet *, pkt);
1379 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1380 FSW_STATS_INC(FSW_STATS_DROP);
1381 continue;
1382 }
1383 } else {
1384 mhead = m->m_nextpkt;
1385 m->m_nextpkt = NULL;
1386 ASSERT(mhead_cnt != 0);
1387 --mhead_cnt;
1388
1389 /* check if we're underutilizing large buffers */
1390 if (__improbable(mhead_bufsize > MCLBYTES &&
1391 tot_len < (mhead_bufsize >> 1))) {
1392 ++mhead_waste;
1393 }
/*
 * Clean up the unused mbuf.
 * Only needed when we pre-allocate 2x16K mbufs.
 */
1398 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1399 ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1400 struct mbuf *m_extra = m->m_next;
1401 ASSERT(m_extra != NULL);
1402 ASSERT(m_extra->m_len == 0);
1403 ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1404 m->m_next = NULL;
1405 m_freem(m_extra);
1406 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1407 }
1408 }
1409 m->m_data += pad;
1410 /*
1411 * XXX -fbounds-safety: external dependency
1412 * mtod does not work because m_len is 0
1413 */
1414 m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1415
1416 /* don't include IP header from partial sum */
1417 if (__probable((pkt->pkt_qum_qflags &
1418 QUM_F_FLOW_CLASSIFIED) != 0)) {
1419 iphlen = pkt->pkt_flow_ip_hlen;
1420 do_cksum_rx = sk_cksum_rx;
1421 } else {
1422 iphlen = 0;
1423 do_cksum_rx = FALSE;
1424 }
1425
1426 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1427 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1428 llhlen + iphlen);
1429
1430 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1431 if (do_cksum_rx) {
1432 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1433 }
1434 #if DEBUG || DEVELOPMENT
1435 if (__improbable(pkt_trailers > 0)) {
1436 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1437 }
1438 #endif /* DEBUG || DEVELOPMENT */
1439 m_adj(m, llhlen);
1440
1441 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1442 if (__improbable((pkt->pkt_link_flags &
1443 PKT_LINKF_ETHFCS) != 0)) {
1444 m->m_flags |= M_HASFCS;
1445 }
1446 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1447 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1448 }
1449 ASSERT(m->m_nextpkt == NULL);
1450 tail = m;
1451 *tailp = m;
1452 tailp = &m->m_nextpkt;
1453 mcnt++;
1454 mbytes += m_pktlen(m);
1455 }
1456 /* free any leftovers */
1457 if (__improbable(mhead != NULL)) {
1458 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1459 ASSERT(mhead_cnt != 0);
1460 (void) m_freem_list(mhead);
1461 mhead = NULL;
1462 mhead_cnt = 0;
1463 }
1464
1465 /* reset if most packets (>50%) are smaller than our batch buffers */
1466 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1467 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1468 struct flow_entry *, NULL, uint32_t, mhead_waste,
1469 uint32_t, tot_cnt);
1470 largest = 0;
1471 }
1472
1473 if (largest != fsw->fsw_rx_largest_size) {
1474 os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1475 }
1476
1477 pp_free_pktq(pktq);
1478 *m_headp = head;
1479 *m_tailp = tail;
1480 *cnt = mcnt;
1481 *bytes = mbytes;
1482 }
1483
1484 /*
1485 * This function only extracts the mbuf from the packet. The caller frees
1486 * the packet.
1487 */
1488 static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1490 {
1491 struct mbuf *m;
1492 struct pkthdr *mhdr;
1493 uint16_t llhlen;
1494
1495 m = pkt->pkt_mbuf;
1496 ASSERT(m != NULL);
1497
1498 llhlen = pkt->pkt_l2_len;
1499 if (llhlen > pkt->pkt_length) {
1500 m_freem(m);
1501 KPKT_CLEAR_MBUF_DATA(pkt);
1502 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1503 struct __kern_packet *, pkt);
1504 FSW_STATS_INC(FSW_STATS_DROP);
1505 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1506 return NULL;
1507 }
1508 mhdr = &m->m_pkthdr;
1509 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1510 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1511 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1512 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1513 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1514 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1515 }
1516 #if DEBUG || DEVELOPMENT
1517 uint32_t extra = 0;
1518 if (__improbable(pkt_trailers > 0)) {
1519 extra = pkt_add_trailers_mbuf(m, llhlen);
1520 }
1521 #endif /* DEBUG || DEVELOPMENT */
1522 m_adj(m, llhlen);
1523 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1524 KPKT_CLEAR_MBUF_DATA(pkt);
1525 return m;
1526 }
1527
1528 SK_NO_INLINE_ATTRIBUTE
1529 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1532 {
1533 struct __kern_packet *pkt;
1534 struct mbuf *__single m, *__single head = NULL;
1535 struct mbuf *__single tail = NULL, **__single tailp = &head;
1536 uint32_t c = 0, b = 0;
1537
1538 KPKTQ_FOREACH(pkt, pktq) {
1539 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1540 if (__improbable(m == NULL)) {
1541 continue;
1542 }
1543 tail = m;
1544 *tailp = m;
1545 tailp = &m->m_nextpkt;
1546 c++;
1547 b += m_pktlen(m);
1548 }
1549 pp_free_pktq(pktq);
1550 *m_head = head;
1551 *m_tail = tail;
1552 *cnt = c;
1553 *bytes = b;
1554 }
1555
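/*
 * Hand a chain of mbufs to the DLIL input path of the interface along
 * with the packet and byte counts.
 */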
1556 void
fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
    uint32_t cnt, uint32_t bytes)
1559 {
1560 struct ifnet_stat_increment_param s;
1561
1562 bzero(&s, sizeof(s));
1563 s.packets_in = cnt;
1564 s.bytes_in = bytes;
1565 dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1566 }
1567
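/*
 * Deliver RX packets destined to the host stack: give a registered
 * interface RX callback a first chance to consume them, then convert any
 * remaining packets to mbufs (compat or native path) and send them up.
 */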
1568 void
fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
1570 {
1571 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1572 uint32_t cnt = 0, bytes = 0;
1573 ifnet_fsw_rx_cb_t __single cb;
1574 void *__single cb_arg;
1575 boolean_t compat;
1576
1577 ASSERT(!KPKTQ_EMPTY(pktq));
1578 if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1579 ASSERT(cb != NULL);
1580 ASSERT(cb_arg != NULL);
1581 (*cb)(cb_arg, pktq);
1582 ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1583 if (KPKTQ_EMPTY(pktq)) {
1584 return;
1585 } else {
1586 DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1587 struct pktq *, pktq);
1588 }
1589 }
1590
1591 /* All packets in the pktq must have the same type */
1592 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1593 if (compat) {
1594 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1595 &bytes);
1596 } else {
1597 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1598 &bytes);
1599 }
1600 if (__improbable(m_head == NULL)) {
1601 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1602 return;
1603 }
1604 fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1605 }
1606
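/*
 * Enqueue as many packets as the destination ring can take and tail-drop
 * the rest, recording the enqueue timestamp used for RX stall detection.
 */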
1607 void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
1610 {
1611 fsw_ring_enqueue_pktq(fsw, r, pktq);
1612 /*
1613 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
1614 * This is to ensure we use the timestamp of the earliest enqueue without
1615 * a dequeue.
1616 */
1617 if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
1618 r->ckr_rx_enqueue_ts = _net_uptime;
1619 }
1620 FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1621 dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
1622 DROPTAP_FLAG_L2_MISSING);
1623 }
1624
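/*
 * Resolve the nexus adapter behind a flow entry's nexus port; returns NULL
 * if the port is invalid, has no adapter, is inactive, or is defunct.
 */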
1625 static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1627 {
1628 struct kern_nexus *nx = fsw->fsw_nx;
1629 struct nexus_adapter *na = NULL;
1630 nexus_port_t port = fe->fe_nx_port;
1631
1632 if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1633 SK_ERR("dev or host ports have no NA");
1634 return NULL;
1635 }
1636
1637 if (__improbable(!nx_port_is_valid(nx, port))) {
1638 SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1639 if_name(fsw->fsw_ifp), port);
1640 return NULL;
1641 }
1642
1643 na = nx_port_get_na(nx, port);
1644 if (__improbable(na == NULL)) {
1645 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1646 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1647 if_name(fsw->fsw_ifp), port);
1648 return NULL;
1649 }
1650
1651 if (__improbable(!NA_IS_ACTIVE(na))) {
1652 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1653 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1654 if_name(fsw->fsw_ifp), port);
1655 return NULL;
1656 }
1657
1658 if (__improbable(nx_port_is_defunct(nx, port))) {
1659 FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1660 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1661 if_name(fsw->fsw_ifp), port);
1662 return NULL;
1663 }
1664
1665 return na;
1666 }
1667
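/*
 * Map a flow to ring 0 of its user channel adapter for the given
 * direction; returns NULL if the adapter is gone or the ring is in
 * drop mode.
 */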
1668 static inline struct __kern_channel_ring *
1669 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1670 {
1671 struct nexus_vp_adapter *na = NULL;
1672 struct __kern_channel_ring *__single r = NULL;
1673
1674 na = VPNA(flow_get_na(fsw, fe));
1675 if (__improbable(na == NULL)) {
1676 return NULL;
1677 }
1678
1679 switch (txrx) {
1680 case NR_RX:
1681 r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1682 break;
1683 case NR_TX:
1684 r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1685 break;
1686 default:
1687 VERIFY(0);
1688 __builtin_unreachable();
1689 }
1690
1691 if (__improbable(KR_DROP(r))) {
1692 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1693 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
1694 r->ckr_name, SK_KVA(r));
1695 return NULL;
1696 }
1697
1698 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1699
1700 #if (DEVELOPMENT || DEBUG)
1701 if (r != NULL) {
1702 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1703 }
1704 #endif /* DEVELOPMENT || DEBUG */
1705
1706 return r;
1707 }
1708
1709 struct __kern_channel_ring *
1710 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1711 {
1712 return flow_get_ring(fsw, fe, NR_RX);
1713 }
1714
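/*
 * Common viability check for a flow before moving packets.  If the
 * flow's local address is gone, schedule the reaper to mark the flow
 * nonviable and request that the flow route be reconfigured.  Returns
 * false if the flow is already (or about to become) nonviable.
 */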
1715 static bool
1716 dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1717 {
1718 struct flow_route *fr = fe->fe_route;
1719 struct ifnet *ifp = fsw->fsw_ifp;
1720
1721 if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1722 !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1723 fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1724 !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1725 /*
1726 * The source address is no longer around; we want this
1727 * flow to be nonviable, but that requires holding the lock
1728 * as writer (which isn't the case now).  Indicate that
1729 * we need to finalize the nonviable state later below.
1730 *
1731 * We also request that the flow route be re-configured,
1732 * if this is a connected-mode flow.
1734 */
1735 if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1736 /*
1737 * fsw_pending_nonviable is a hint for the reaper thread;
1738 * because setting fe_want_nonviable and incrementing the
1739 * fsw_pending_nonviable counter is not atomic, let the
1740 * increment happen first, and have the thread that loses
1741 * the CAS decrement it again.
1742 */
1743 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1744 if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1745 fsw_reap_sched(fsw);
1746 } else {
1747 os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1748 }
1749 }
1750 if (fr != NULL) {
1751 os_atomic_inc(&fr->fr_want_configure, relaxed);
1752 }
1753 }
1754
1755 /* if flow was (or is going to be) marked as nonviable, drop it */
1756 if (__improbable(fe->fe_want_nonviable ||
1757 (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1758 SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1759 SK_KVA(fe));
1760 return false;
1761 }
1762 return true;
1763 }
1764
1765 bool
1766 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1767 {
1768 bool okay;
1769 okay = dp_flow_route_process(fsw, fe);
1770 #if (DEVELOPMENT || DEBUG)
1771 if (okay) {
1772 _FSW_INJECT_ERROR(5, okay, false, null_func);
1773 }
1774 #endif /* DEVELOPMENT || DEBUG */
1775
1776 return okay;
1777 }
1778
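/*
 * Default per-flow Rx processing: packets headed for a user channel are
 * run through flow tracking, copied from the device pool into packets
 * (and extra buflets) allocated from the destination ring's pool, tagged
 * with flow metadata, and finally enqueued onto the flow's Rx ring.
 * Host-port flows short-circuit into fsw_host_rx().
 */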
1779 void
1780 dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1781 struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
1782 {
1783 #pragma unused(flags)
1784 struct pktq dpkts; /* dst pool alloc'ed packets */
1785 struct pktq disposed_pkts; /* done src packets */
1786 struct pktq dropped_pkts; /* dropped src packets */
1787 struct pktq transferred_pkts; /* dst packet ready for ring */
1788 struct __kern_packet *pkt, *tpkt;
1789 struct kern_pbufpool *dpp;
1790 uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
1791 uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1792 uint16_t buf_array_iter = 0;
1793 uint32_t cnt, buf_cnt = 0;
1794 int err;
1795 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
1796 uint16_t line = 0;
1797
1798 KPKTQ_INIT(&dpkts);
1799 KPKTQ_INIT(&dropped_pkts);
1800 KPKTQ_INIT(&disposed_pkts);
1801 KPKTQ_INIT(&transferred_pkts);
1802
1803 if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1804 SK_ERR("Rx route bad");
1805 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1806 FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1807 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
1808 line = __LINE__;
1809 goto done;
1810 }
1811
1812 if (fe->fe_nx_port == FSW_VP_HOST) {
1813 /*
1814 * The host ring does not exist anymore so we can't take
1815 * the enqueue path below. This path should only be hit
1816 * for the rare TCP fragmentation case.
1817 */
1818 fsw_host_rx(fsw, rx_pkts);
1819 return;
1820 }
1821
1822 /* find the ring */
1823 struct __kern_channel_ring *r;
1824 r = fsw_flow_get_rx_ring(fsw, fe);
1825 if (__improbable(r == NULL)) {
1826 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1827 reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
1828 line = __LINE__;
1829 goto done;
1830 }
1831
1832 /* snoop before L2 is stripped */
1833 if (__improbable(pktap_total_tap_count != 0)) {
1834 fsw_snoop(fsw, fe, rx_pkts, true);
1835 }
1836
1837 dpp = r->ckr_pp;
1838 /* batch allocate enough packets */
1839 err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1840 SKMEM_NOSLEEP);
1841 if (__improbable(err == ENOMEM)) {
1842 ASSERT(KPKTQ_EMPTY(&dpkts));
1843 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1844 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1845 SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
1846 r->ckr_name, SK_KVA(r));
1847 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1848 line = __LINE__;
1849 goto done;
1850 }
1851
1852 /*
1853 * estimate total number of buflets for the packet chain.
1854 */
1855 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1856 if (cnt > n_pkts) {
1857 ASSERT(dpp->pp_max_frags > 1);
1858 cnt -= n_pkts;
1859 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1860 err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1861 SKMEM_NOSLEEP, false);
1862 if (__improbable(buf_cnt == 0)) {
1863 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1864 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1865 SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
1866 "0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
1867 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1868 line = __LINE__;
1869 goto done;
1870 }
1871 err = 0;
1872 }
1873
1874 /* extra processing for user flow */
1875 KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1876 err = 0;
1877 KPKTQ_REMOVE(rx_pkts, pkt);
1878 if (rx_bytes > pkt->pkt_flow_ulen) {
1879 rx_bytes -= pkt->pkt_flow_ulen;
1880 } else {
1881 rx_bytes = 0;
1882 }
1883 err = flow_pkt_track(fe, pkt, true);
1884 _FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1885 if (__improbable(err != 0)) {
1886 SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1887 FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1888 /* trigger a RST if needed */
1889 if (err == ENETRESET) {
1890 flow_track_abort_tcp(fe, pkt, NULL);
1891 }
1892 dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
1893 DROPTAP_FLAG_L2_MISSING);
1894 continue;
1895 }
1896
1897 /* transfer to dpkt */
1898 if (pkt->pkt_qum.qum_pp != dpp) {
1899 struct __kern_buflet *bprev, *bnew;
1900 struct __kern_packet *dpkt = NULL;
1901 uint32_t n_bufs, i;
1902
1903 KPKTQ_DEQUEUE(&dpkts, dpkt);
1904 /* XXX Why would dpkt be NULL at this point? */
1905 if (__improbable(dpkt == NULL)) {
1906 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1907 dp_drop_pkt_single(fsw, pkt, 0,
1908 DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
1909 continue;
1910 }
1911 n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
1912 n_bufs--;
1913 for (i = 0; i < n_bufs; i++) {
1914 if (__improbable(buf_cnt == 0)) {
1915 ASSERT(dpp->pp_max_frags > 1);
1916 buf_array_iter = 0;
1917 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1918 n_pkts = KPKTQ_LEN(rx_pkts);
1919 if (cnt >= n_pkts) {
1920 cnt -= n_pkts;
1921 } else {
1922 cnt = 0;
1923 }
1924 cnt += (n_bufs - i);
1925 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1926 cnt);
1927 cnt = buf_cnt;
1928 err = pp_alloc_buflet_batch(dpp,
1929 buf_array, &buf_cnt,
1930 SKMEM_NOSLEEP, false);
1931 if (__improbable(buf_cnt == 0)) {
1932 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1933 dp_drop_pkt_single(fsw, pkt, 0,
1934 DROP_REASON_FSW_PP_ALLOC_FAILED,
1935 DROPTAP_FLAG_L2_MISSING);
1936 pkt = NULL;
1937 pp_free_packet_single(dpkt);
1938 dpkt = NULL;
1939 SK_ERR("failed to alloc %d "
1940 "buflets (err %d) for "
1941 "kr %s, 0x%llu", cnt, err,
1942 r->ckr_name, SK_KVA(r));
1943 break;
1944 }
1945 err = 0;
1946 }
1947 ASSERT(buf_cnt != 0);
1948 if (i == 0) {
1949 PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1950 }
1951 /*
1952 * XXX -fbounds-safety: can't avoid using forge
1953 * unless we change the signature of
1954 * pp_alloc_buflet_batch().
1955 */
1956 bnew = __unsafe_forge_single(kern_buflet_t,
1957 buf_array[buf_array_iter]);
1958 buf_array[buf_array_iter] = 0;
1959 buf_array_iter++;
1960 buf_cnt--;
1961 VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
1962 bprev, bnew) == 0);
1963 bprev = bnew;
1964 }
1965 if (__improbable(err != 0)) {
1966 continue;
1967 }
1968 err = copy_packet_from_dev(fsw, pkt, dpkt);
1969 _FSW_INJECT_ERROR(43, err, EINVAL, null_func);
1970 if (__improbable(err != 0)) {
1971 SK_ERR("copy packet failed (err %d)", err);
1972 dp_drop_pkt_single(fsw, pkt, 0,
1973 DROP_REASON_FSW_PKT_COPY_FAILED,
1974 DROPTAP_FLAG_L2_MISSING);
1975 pp_free_packet_single(dpkt);
1976 dpkt = NULL;
1977 continue;
1978 }
1979 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1980 pkt = dpkt;
1981 }
1982 _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
1983 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1984 pkt->pkt_policy_id = fe->fe_policy_id;
1985 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1986 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
1987 if (pkt->pkt_bufs_cnt > 1) {
1988 pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1989 }
1990 KPKTQ_ENQUEUE(&transferred_pkts, pkt);
1991 }
1992 KPKTQ_FINI(rx_pkts);
1993
1994 if (KPKTQ_LEN(&transferred_pkts) > 0) {
1995 fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
1996 }
1997 KPKTQ_FINI(&transferred_pkts);
1998
1999 done:
2000 /* Free unused buflets */
2001 while (buf_cnt > 0) {
2002 /*
2003 * XXX -fbounds-safety: can't avoid using forge unless we change
2004 * the signature of pp_alloc_buflet_batch().
2005 */
2006 pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
2007 (kern_buflet_t)(buf_array[buf_array_iter])));
2008 buf_array[buf_array_iter] = 0;
2009 buf_array_iter++;
2010 buf_cnt--;
2011 }
2012 dp_free_pktq(fsw, &dpkts);
2013 dp_free_pktq(fsw, &disposed_pkts);
2014 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
2015 }
2016
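/*
 * Drain a flow entry's Rx backlog.  The queue is re-checked under
 * fe_rx_pktq_lock on every pass so that batches appended by other
 * threads are picked up before the entry is removed from the list.
 */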
2017 static inline void
2018 rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
2019 struct flow_entry_list *fes)
2020 {
2021 struct pktq rx_pkts;
2022 uint32_t rx_bytes;
2023 uint32_t rx_proc_flags;
2024
2025 ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
2026 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
2027
2028 KPKTQ_INIT(&rx_pkts);
2029 for (;;) {
2030 lck_mtx_lock(&fe->fe_rx_pktq_lock);
2031 if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
2032 fe->fe_rx_worker_tid = 0;
2033 TAILQ_REMOVE(fes, fe, fe_rx_link);
2034 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2035 break;
2036 }
2037 KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
2038 KPKTQ_DISPOSE(&fe->fe_rx_pktq);
2039 rx_bytes = fe->fe_rx_pktq_bytes;
2040 rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
2041 fe->fe_rx_pktq_bytes = 0;
2042 fe->fe_rx_frag_count = 0;
2043 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2044 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
2045 KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
2046 /* flow related processing (default, agg, fpd, etc.) */
2047 fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, rx_proc_flags);
2048 }
2049 ASSERT(KPKTQ_EMPTY(&rx_pkts));
2050
2051 if (__improbable(fe->fe_want_withdraw)) {
2052 fsw_reap_sched(fsw);
2053 }
2054 }
2055
2056 static inline void
2057 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2058 {
2059 /*
2060 * We only care about wake packets of flows that belong to the
2061 * flowswitch; wake packets for the host stack are handled by the
2062 * host input function.
2063 */
2064 #if (DEBUG || DEVELOPMENT)
2065 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
2066 /*
2067 * This is a one shot command
2068 */
2069 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
2070
2071 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
2072 }
2073 #endif /* (DEBUG || DEVELOPMENT) */
2074 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2075 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
2076 }
2077 }
2078
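/*
 * Core Rx path, called with the flowswitch lock held as reader: demux
 * and classify each packet, divert anything unclassifiable to the host
 * stack, batch the rest per flow entry, then process each flow's batch
 * and hand leftover host packets to fsw_host_rx().
 */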
2079 static void
2080 _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
2081 {
2082 struct __kern_packet *__single pkt, *__single tpkt;
2083 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2084 struct flow_entry *__single fe, *__single prev_fe;
2085 sa_family_t af;
2086 struct pktq host_pkts, dropped_pkts;
2087 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
2088 uint16_t line = 0;
2089 int err;
2090 uint64_t thread_id;
2091
2092 KPKTQ_INIT(&host_pkts);
2093 KPKTQ_INIT(&dropped_pkts);
2094
2095 if (__improbable(FSW_QUIESCED(fsw))) {
2096 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
2097 KPKTQ_CONCAT(&dropped_pkts, pktq);
2098 reason = DROP_REASON_FSW_QUIESCED;
2099 line = __LINE__;
2100 goto done;
2101 }
2102 if (__improbable(fsw->fsw_demux == NULL)) {
2103 KPKTQ_CONCAT(&dropped_pkts, pktq);
2104 reason = DROP_REASON_FSW_DEMUX_FAILED;
2105 line = __LINE__;
2106 goto done;
2107 }
2108
2109 thread_id = thread_tid(current_thread());
2110 prev_fe = NULL;
2111 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2112 if (__probable(tpkt)) {
2113 void *baddr;
2114 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2115 SK_PREFETCH(baddr, 0);
2116 /* prefetch L3 and L4 flow structs */
2117 SK_PREFETCHW(tpkt->pkt_flow, 0);
2118 SK_PREFETCHW(tpkt->pkt_flow, 128);
2119 }
2120
2121 KPKTQ_REMOVE(pktq, pkt);
2122
2123 pkt = rx_prepare_packet(fsw, pkt);
2124
2125 af = fsw->fsw_demux(fsw, pkt);
2126 if (__improbable(af == AF_UNSPEC)) {
2127 KPKTQ_ENQUEUE(&host_pkts, pkt);
2128 continue;
2129 }
2130
2131 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
2132 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
2133 if (__improbable(err != 0)) {
2134 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
2135 KPKTQ_ENQUEUE(&host_pkts, pkt);
2136 continue;
2137 }
2138
2139 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
2140 pkt = rx_process_ip_frag(fsw, pkt);
2141 if (pkt == NULL) {
2142 continue;
2143 }
2144 }
2145
2146 prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
2147 if (__improbable(fe == NULL)) {
2148 KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
2149 continue;
2150 }
2151
2152 dp_rx_process_wake_packet(fsw, pkt);
2153
2154 rx_flow_batch_packets(&fes, fe, pkt, thread_id);
2155 prev_fe = fe;
2156 }
2157
2158 struct flow_entry *tfe = NULL;
2159 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2160 rx_flow_process(fsw, fe, &fes);
2161 flow_entry_release(&fe);
2162 }
2163
2164 if (!KPKTQ_EMPTY(&host_pkts)) {
2165 fsw_host_rx(fsw, &host_pkts);
2166 }
2167
2168 done:
2169 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
2170 }
2171
2172 #if (DEVELOPMENT || DEBUG)
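/*
 * Receive-side scaling for testing (DEVELOPMENT/DEBUG only): packets are
 * hashed on their 5-tuple and fanned out to per-flowswitch worker
 * threads, each of which runs the normal _fsw_receive_locked() path.
 */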
2173 static void
2174 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2175 struct __kern_packet *pkt)
2176 {
2177 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2178
2179 lck_mtx_lock_spin(&frt->frt_lock);
2180 KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2181 lck_mtx_unlock(&frt->frt_lock);
2182 }
2183
2184 static void
2185 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2186 {
2187 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2188
2189 ASSERT(frt->frt_thread != THREAD_NULL);
2190 lck_mtx_lock_spin(&frt->frt_lock);
2191 ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2192
2193 frt->frt_requests++;
2194 if (!(frt->frt_flags & FRT_RUNNING)) {
2195 thread_wakeup((caddr_t)frt);
2196 }
2197 lck_mtx_unlock(&frt->frt_lock);
2198 }
2199
2200 __attribute__((noreturn))
2201 static void
2202 fsw_rps_thread_cont(void *v, wait_result_t w)
2203 {
2204 struct fsw_rps_thread *__single frt = v;
2205 struct nx_flowswitch *fsw = frt->frt_fsw;
2206
2207 lck_mtx_lock(&frt->frt_lock);
2208 if (__improbable(w == THREAD_INTERRUPTIBLE ||
2209 (frt->frt_flags & FRT_TERMINATING) != 0)) {
2210 goto terminate;
2211 }
2212 if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2213 goto done;
2214 }
2215 frt->frt_flags |= FRT_RUNNING;
2216
2217 for (;;) {
2218 uint32_t requests = frt->frt_requests;
2219 struct pktq pkts;
2220
2221 KPKTQ_INIT(&pkts);
2222 KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2223 lck_mtx_unlock(&frt->frt_lock);
2224
2225 sk_protect_t protect;
2226 protect = sk_sync_protect();
2227 FSW_RLOCK(fsw);
2228 _fsw_receive_locked(fsw, &pkts);
2229 FSW_RUNLOCK(fsw);
2230 sk_sync_unprotect(protect);
2231
2232 lck_mtx_lock(&frt->frt_lock);
2233 if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2234 requests == frt->frt_requests) {
2235 frt->frt_requests = 0;
2236 break;
2237 }
2238 }
2239
2240 done:
2241 lck_mtx_unlock(&frt->frt_lock);
2242 if (!(frt->frt_flags & FRT_TERMINATING)) {
2243 frt->frt_flags &= ~FRT_RUNNING;
2244 assert_wait(frt, THREAD_UNINT);
2245 thread_block_parameter(fsw_rps_thread_cont, frt);
2246 __builtin_unreachable();
2247 } else {
2248 terminate:
2249 LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2250 frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2251 frt->frt_flags |= FRT_TERMINATED;
2252
2253 if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2254 thread_wakeup((caddr_t)&frt);
2255 }
2256 lck_mtx_unlock(&frt->frt_lock);
2257
2258 SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2259 frt->frt_idx);
2260
2261 /* for the extra refcnt from kernel_thread_start() */
2262 thread_deallocate(current_thread());
2263 /* this is the end */
2264 thread_terminate(current_thread());
2265 /* NOTREACHED */
2266 __builtin_unreachable();
2267 }
2268
2269 /* must never get here */
2270 VERIFY(0);
2271 /* NOTREACHED */
2272 __builtin_unreachable();
2273 }
2274
2275 __attribute__((noreturn))
2276 static void
2277 fsw_rps_thread_func(void *v, wait_result_t w)
2278 {
2279 #pragma unused(w)
2280 struct fsw_rps_thread *__single frt = v;
2281 struct nx_flowswitch *fsw = frt->frt_fsw;
2282 const char *__null_terminated tname = NULL;
2283
2284 char thread_name[MAXTHREADNAMESIZE];
2285 bzero(thread_name, sizeof(thread_name));
2286 tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2287 if_name(fsw->fsw_ifp), frt->frt_idx);
2288
2289 thread_set_thread_name(frt->frt_thread, tname);
2290 SK_D("%s spawned", tname);
2291
2292 net_thread_marks_push(NET_THREAD_SYNC_RX);
2293 assert_wait(frt, THREAD_UNINT);
2294 (void) thread_block_parameter(fsw_rps_thread_cont, frt);
2295
2296 __builtin_unreachable();
2297 }
2298
2299 static void
2300 fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2301 {
2302 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2303 uint64_t f = (1 * NSEC_PER_MSEC);
2304 uint64_t s = (1000 * NSEC_PER_SEC);
2305 uint32_t c = 0;
2306
2307 lck_mtx_lock(&frt->frt_lock);
2308 frt->frt_flags |= FRT_TERMINATING;
2309
2310 while (!(frt->frt_flags & FRT_TERMINATED)) {
2311 uint64_t t = 0;
2312 nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2313 clock_absolutetime_interval_to_deadline(t, &t);
2314 ASSERT(t != 0);
2315
2316 frt->frt_flags |= FRT_TERMINATEBLOCK;
2317 if (!(frt->frt_flags & FRT_RUNNING)) {
2318 thread_wakeup_one((caddr_t)frt);
2319 }
2320 (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2321 lck_mtx_unlock(&frt->frt_lock);
2322 thread_block(THREAD_CONTINUE_NULL);
2323 lck_mtx_lock(&frt->frt_lock);
2324 frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2325 }
2326 ASSERT(frt->frt_flags & FRT_TERMINATED);
2327 lck_mtx_unlock(&frt->frt_lock);
2328 frt->frt_thread = THREAD_NULL;
2329 }
2330
2331 static void
2332 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2333 {
2334 kern_return_t error;
2335 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2336
2337 lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2338 frt->frt_idx = i;
2339 frt->frt_fsw = fsw;
2340 error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2341 ASSERT(!error);
2342 KPKTQ_INIT(&frt->frt_pktq);
2343 }
2344
2345 int
2346 fsw_rps_set_nthreads(struct nx_flowswitch *fsw, uint32_t n)
2347 {
2348 if (n > FSW_RPS_MAX_NTHREADS) {
2349 SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2350 return EINVAL;
2351 }
2352
2353 FSW_WLOCK(fsw);
2354 if (n < fsw->fsw_rps_nthreads) {
2355 for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2356 fsw_rps_thread_join(fsw, i);
2357 }
2358 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2359 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2360 fsw->fsw_rps_nthreads = n;
2361 } else if (n > fsw->fsw_rps_nthreads) {
2362 uint32_t nthreads_old = fsw->fsw_rps_nthreads;
2363
2364 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2365 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2366 fsw->fsw_rps_nthreads = n;
2367 for (uint32_t i = nthreads_old; i < n; i++) {
2368 fsw_rps_thread_spawn(fsw, i);
2369 }
2370 }
2371 FSW_WUNLOCK(fsw);
2372 return 0;
2373 }
2374
2375 static uint32_t
2376 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2377 {
2378 sa_family_t af = fsw->fsw_demux(fsw, pkt);
2379 if (__improbable(af == AF_UNSPEC)) {
2380 return 0;
2381 }
2382
2383 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2384
2385 if (__improbable((pkt->pkt_qum_qflags &
2386 QUM_F_FLOW_CLASSIFIED) == 0)) {
2387 return 0;
2388 }
2389
2390 struct flow_key key;
2391 flow_pkt2key(pkt, true, &key);
2392 key.fk_mask = FKMASK_5TUPLE;
2393
2394 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2395
2396 return id;
2397 }
2398
2399 #endif /* DEVELOPMENT || DEBUG */
2400
2401 void
2402 fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2403 {
2404 FSW_RLOCK(fsw);
2405 #if (DEVELOPMENT || DEBUG)
2406 if (fsw->fsw_rps_nthreads != 0) {
2407 struct __kern_packet *pkt, *tpkt;
2408 bitmap_t map = 0;
2409
2410 _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2411 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2412 uint32_t id = get_rps_id(fsw, pkt);
2413 KPKTQ_REMOVE(pktq, pkt);
2414 fsw_rps_rx(fsw, id, pkt);
2415 bitmap_set(&map, id);
2416 }
2417 for (int i = bitmap_first(&map, 64); i >= 0;
2418 i = bitmap_next(&map, i)) {
2419 fsw_rps_thread_schedule(fsw, i);
2420 }
2421 } else
2422 #endif /* DEVELOPMENT || DEBUG */
2423 {
2424 _fsw_receive_locked(fsw, pktq);
2425 }
2426 FSW_RUNLOCK(fsw);
2427 }
2428
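/*
 * Dequeue callback for the input network emulator: packets released by
 * netem re-enter the normal fsw_receive() path under sync protection.
 */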
2429 int
2430 fsw_dev_input_netem_dequeue(void *handle,
2431 pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2432 {
2434 struct nx_flowswitch *__single fsw = handle;
2435 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2436 struct pktq pktq;
2437 sk_protect_t protect;
2438 uint32_t i;
2439
2440 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2441
2442 for (i = 0; i < n_pkts; i++) {
2443 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2444 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2445 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2446 }
2447
2448 protect = sk_sync_protect();
2449 KPKTQ_INIT(&pktq);
2450 pkts_to_pktq(kpkts, n_pkts, &pktq);
2451
2452 fsw_receive(fsw, &pktq);
2453 KPKTQ_FINI(&pktq);
2454 sk_sync_unprotect(protect);
2455
2456 return 0;
2457 }
2458
2459 static void
2460 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2461 {
2462 classq_pkt_t p;
2463 struct netem *__single ne;
2464 struct __kern_packet *pkt, *tpkt;
2465
2466 ASSERT(fsw->fsw_ifp != NULL);
2467 ne = fsw->fsw_ifp->if_input_netem;
2468 ASSERT(ne != NULL);
2469 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2470 bool pdrop;
2471 KPKTQ_REMOVE(q, pkt);
2472 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2473 netem_enqueue(ne, &p, &pdrop);
2474 }
2475 }
2476
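/*
 * Device adapter Rx entry point: walk the driver-supplied packet chain,
 * drop non-finalized packets, and flush batches of up to fsw_rx_batch
 * packets into the flowswitch (or into netem when input emulation is
 * enabled on the interface).
 */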
2477 void
2478 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2479 struct nexus_pkt_stats *out_stats)
2480 {
2481 struct __kern_packet *pkt = pkt_head, *next;
2482 struct nx_flowswitch *fsw;
2483 uint32_t n_bytes = 0, n_pkts = 0;
2484 uint64_t total_pkts = 0, total_bytes = 0;
2485 struct pktq q;
2486
2487 KPKTQ_INIT(&q);
2488 if (__improbable(devna->na_ifp == NULL ||
2489 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2490 SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2491 dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2492 return;
2493 }
2494 while (pkt != NULL) {
2495 if (__improbable(pkt->pkt_trace_id != 0)) {
2496 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2497 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2498 }
2499 next = pkt->pkt_nextpkt;
2500 pkt->pkt_nextpkt = NULL;
2501
2502 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2503 KPKTQ_ENQUEUE(&q, pkt);
2504 n_bytes += pkt->pkt_length;
2505 } else {
2506 DTRACE_SKYWALK1(non__finalized__drop,
2507 struct __kern_packet *, pkt);
2508 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2509 dp_drop_pkt_single(fsw, pkt, 0,
2510 DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2511 DROPTAP_FLAG_L2_MISSING);
2512 pkt = NULL;
2513 }
2514 n_pkts = KPKTQ_LEN(&q);
2515 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2516 if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2517 fsw_dev_input_netem_enqueue(fsw, &q);
2518 } else {
2519 fsw_receive(fsw, &q);
2520 }
2521 total_pkts += n_pkts;
2522 total_bytes += n_bytes;
2523 n_pkts = 0;
2524 n_bytes = 0;
2525 KPKTQ_FINI(&q);
2526 }
2527 pkt = next;
2528 }
2529 ASSERT(KPKTQ_LEN(&q) == 0);
2530 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2531 if (out_stats != NULL) {
2532 out_stats->nps_pkts += total_pkts;
2533 out_stats->nps_bytes += total_bytes;
2534 }
2535 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2536 }
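/*
 * Compat Tx path: copy the source packet into a freshly allocated mbuf
 * for the driver, and also copy the leading bytes into dpkt's buffer so
 * that classification can still look at the headers.
 */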
2537
2538 static int
2539 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2540 struct __kern_packet *dpkt)
2541 {
2542 struct mbuf *__single m = NULL;
2543 uint32_t bdlen, bdlim, bdoff;
2544 uint8_t *bdaddr;
2545 unsigned int one = 1;
2546 int err = 0;
2547
2548 err = mbuf_allocpacket(MBUF_DONTWAIT,
2549 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2550 #if (DEVELOPMENT || DEBUG)
2551 if (m != NULL) {
2552 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2553 }
2554 #endif /* DEVELOPMENT || DEBUG */
2555 if (__improbable(m == NULL)) {
2556 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2557 err = ENOBUFS;
2558 goto done;
2559 }
2560
2561 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2562 if (fsw->fsw_frame_headroom > bdlim) {
2563 SK_ERR("not enough space in buffer for headroom");
2564 err = EINVAL;
2565 goto done;
2566 }
2567
2568 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2569 dpkt->pkt_mbuf = m;
2570 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2571
2572 /* packet copy into mbuf */
2573 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2574 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2575 fsw->fsw_frame_headroom, spkt->pkt_length,
2576 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2577 spkt->pkt_csum_tx_start_off);
2578 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2579
2580 /* header copy into dpkt buffer for classification */
2581 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2582 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2583 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2584 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2585 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2586 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2587 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2588 if (copy_len < spkt->pkt_length) {
2589 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2590 }
2591
2592 /*
2593 * fsw->fsw_frame_headroom is applied after m_data, so we treat m_data
2594 * the same as the buflet baddr: m_data always points to the beginning
2595 * of the packet and should correspond to baddr + headroom.
2596 */
2597 ASSERT((uintptr_t)m->m_data ==
2598 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2599
2600 done:
2601 return err;
2602 }
2603
2604 static int
2605 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2606 struct __kern_packet *dpkt)
2607 {
2608 struct ifnet *ifp = fsw->fsw_ifp;
2609 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2610
2611 if (headroom > UINT8_MAX) {
2612 SK_ERR("headroom too large %d", headroom);
2613 return ERANGE;
2614 }
2615 dpkt->pkt_headroom = (uint8_t)headroom;
2616 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2617 dpkt->pkt_l2_len = 0;
2618 dpkt->pkt_link_flags = spkt->pkt_link_flags;
2619
2620 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2621 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2622 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2623 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2624 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2625 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2626 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2627 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2628 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2629 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2630
2631 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2632
2633 return 0;
2634 }
2635
2636 #if SK_LOG
2637 /* Hoisted out of line to reduce kernel stack footprint */
2638 SK_LOG_ATTRIBUTE
2639 static void
2640 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2641 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2642 {
2643 struct proc *p = current_proc();
2644 struct ifnet *ifp = fsw->fsw_ifp;
2645 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2646
2647 if (error == ERANGE) {
2648 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2649 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2650 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2651 (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2652 } else if (error == ENOBUFS) {
2653 SK_DF(logflags, "%s(%d) packet allocation failure",
2654 sk_proc_name_address(p), sk_proc_pid(p));
2655 } else if (error == 0) {
2656 ASSERT(dpkt != NULL);
2657 char *daddr;
2658 uint32_t pkt_len;
2659
2660 MD_BUFLET_ADDR_ABS(dpkt, daddr);
2661 pkt_len = __packet_get_real_data_length(dpkt);
2662 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2663 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
2664 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2665 (uint32_t)fsw->fsw_frame_headroom,
2666 (uint32_t)ifp->if_tx_headroom);
2667 SK_DF(logflags | SK_VERB_DUMP, "%s",
2668 sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
2669 } else {
2670 SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p), sk_proc_pid(p), error);
2671 }
2672 }
2673 #else
2674 #define dp_copy_to_dev_log(...)
2675 #endif /* SK_LOG */
2676
2677 static void
2678 fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2679 {
2680 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2681 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2682
2683 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2684 /* Copy packet metadata */
2685 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2686 _PKT_COPY(spkt, dpkt);
2687 _PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2688 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2689 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2690 ASSERT(dpkt->pkt_mbuf == NULL);
2691
2692 /* Copy AQM metadata */
2693 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2694 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2695 _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
2696 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2697 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2698 dpkt->pkt_policy_id = spkt->pkt_policy_id;
2699 dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2700 }
2701
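/*
 * Copy a packet from the user channel into the device-side packet,
 * taking the mbuf path on compat interfaces and the packet path on
 * native ones; packets too large for the device pool are rejected
 * with ERANGE.
 */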
2702 static int
2703 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2704 struct __kern_packet *dpkt)
2705 {
2706 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2707 struct ifnet *ifp = fsw->fsw_ifp;
2708 uint32_t dev_pkt_len;
2709 int err = 0;
2710
2711 fsw_pkt_copy_metadata(spkt, dpkt);
2712 switch (fsw->fsw_classq_enq_ptype) {
2713 case QP_MBUF:
2714 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2715 break;
2716
2717 case QP_PACKET:
2718 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2719 spkt->pkt_length;
2720 if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2721 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2722 err = ERANGE;
2723 goto done;
2724 }
2725 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2726 break;
2727
2728 default:
2729 VERIFY(0);
2730 __builtin_unreachable();
2731 }
2732 done:
2733 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2734 return err;
2735 }
2736
2737 static int
2738 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2739 struct __kern_packet *dpkt)
2740 {
2741 uint8_t *sbaddr, *dbaddr;
2742 uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2743 uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2744
2745 fsw_pkt_copy_metadata(spkt, dpkt);
2746
2747 MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2748 ASSERT(sbaddr != NULL);
2749 sbaddr += spkt->pkt_headroom;
2750
2751 MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2752 ASSERT(dbaddr != NULL);
2753 dpkt->pkt_headroom = (uint8_t)headroom;
2754 dbaddr += headroom;
2755
2756 pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2757 METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2758
2759 /* packet length is set to the full length */
2760 dpkt->pkt_length = spkt->pkt_length;
2761 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2762 return 0;
2763 }
2764
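/*
 * Detach the mbuf carried by a compat packet, migrate the flow and AQM
 * metadata the stack needs onto the mbuf header, and free the packet.
 */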
2765 static struct mbuf *
2766 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2767 {
2768 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2769 ASSERT(pkt->pkt_mbuf != NULL);
2770 struct mbuf *m = pkt->pkt_mbuf;
2771
2772 /* pass additional metadata generated from flow parse/lookup */
2773 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2774 sizeof(pkt->pkt_flow_token));
2775 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2776 sizeof(pkt->pkt_flowsrc_token));
2777 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2778 sizeof(pkt->pkt_flowsrc_fidx));
2779 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2780 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2781 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2782 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2783 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2784 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2785 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2786
2787 if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2788 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2789 }
2790
2791 /* The packet should have a timestamp by the time we get here. */
2792 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2793 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2794
2795 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2796 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2797 /* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2798 m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
2799
2800 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2801 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2802 }
2803 KPKT_CLEAR_MBUF_DATA(pkt);
2804
2805 /* mbuf has been consumed, release packet as well */
2806 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2807 pp_free_packet_single(pkt);
2808 return m;
2809 }
2810
2811 static void
2812 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2813 struct mbuf **head, struct mbuf **tail,
2814 uint32_t *cnt, uint32_t *bytes)
2815 {
2816 struct __kern_packet *pkt = pkt_list, *next;
2817 struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2818 struct mbuf *__single m = NULL;
2819 uint32_t c = 0, b = 0;
2820
2821 while (pkt != NULL) {
2822 next = pkt->pkt_nextpkt;
2823 pkt->pkt_nextpkt = NULL;
2824 m = convert_pkt_to_mbuf(pkt);
2825 ASSERT(m != NULL);
2826
2827 *m_tailp = m;
2828 m_tailp = &m->m_nextpkt;
2829 c++;
2830 b += m_pktlen(m);
2831 pkt = next;
2832 }
2833 if (head != NULL) {
2834 *head = m_head;
2835 }
2836 if (tail != NULL) {
2837 *tail = m;
2838 }
2839 if (cnt != NULL) {
2840 *cnt = c;
2841 }
2842 if (bytes != NULL) {
2843 *bytes = b;
2844 }
2845 }
2846
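/*
 * Enqueue a single packet into the interface output queues (AQM),
 * converting it to an mbuf first when the interface is a compat one.
 */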
2847 SK_NO_INLINE_ATTRIBUTE
2848 static int
2849 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2850 struct __kern_packet *pkt)
2851 {
2852 struct ifnet *ifp = fsw->fsw_ifp;
2853 boolean_t pkt_drop = FALSE;
2854 int err;
2855
2856 FSW_LOCK_ASSERT_HELD(fsw);
2857 ASSERT(fsw->fsw_classq_enabled);
2858 ASSERT(pkt->pkt_flow_token != 0);
2859 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2860 1, pkt->pkt_length);
2861
2862 if (__improbable(pkt->pkt_trace_id != 0)) {
2863 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2864 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2865 }
2866
2867 switch (fsw->fsw_classq_enq_ptype) {
2868 case QP_MBUF: { /* compat interface */
2869 struct mbuf *m;
2870
2871 m = convert_pkt_to_mbuf(pkt);
2872 ASSERT(m != NULL);
2873 pkt = NULL;
2874
2875 /* ifnet_enqueue consumes mbuf */
2876 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2877 m = NULL;
2878 #if (DEVELOPMENT || DEBUG)
2879 if (__improbable(!pkt_drop)) {
2880 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2881 }
2882 #endif /* DEVELOPMENT || DEBUG */
2883 if (pkt_drop) {
2884 FSW_STATS_INC(FSW_STATS_DROP);
2885 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2886 }
2887 break;
2888 }
2889 case QP_PACKET: { /* native interface */
2890 /* ifnet_enqueue consumes packet */
2891 err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2892 pkt = NULL;
2893 #if (DEVELOPMENT || DEBUG)
2894 if (__improbable(!pkt_drop)) {
2895 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2896 }
2897 #endif /* DEVELOPMENT || DEBUG */
2898 if (pkt_drop) {
2899 FSW_STATS_INC(FSW_STATS_DROP);
2900 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2901 }
2902 break;
2903 }
2904 default:
2905 err = EINVAL;
2906 VERIFY(0);
2907 /* NOTREACHED */
2908 __builtin_unreachable();
2909 }
2910
2911 return err;
2912 }
2913
2914 static int
2915 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2916 struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
2917 uint32_t cnt, uint32_t bytes)
2918 {
2919 struct ifnet *ifp = fsw->fsw_ifp;
2920 boolean_t pkt_drop = FALSE;
2921 uint32_t svc;
2922 int err;
2923
2924 FSW_LOCK_ASSERT_HELD(fsw);
2925 ASSERT(fsw->fsw_classq_enabled);
2926 ASSERT(pkt_head->pkt_flow_token != 0);
2927
2928 /*
2929 * All packets in the flow should have the same svc.
2930 */
2931 svc = pkt_head->pkt_svc_class;
2932 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2933
2934 switch (fsw->fsw_classq_enq_ptype) {
2935 case QP_MBUF: { /* compat interface */
2936 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
2937 uint32_t c = 0, b = 0;
2938
2939 convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2940 ASSERT(m_head != NULL && m_tail != NULL);
2941 ASSERT(c == cnt);
2942 ASSERT(b == bytes);
2943 pkt_head = NULL;
2944
2945 /* ifnet_enqueue consumes mbuf */
2946 err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
2947 bytes, FALSE, &pkt_drop);
2948 m_head = NULL;
2949 m_tail = NULL;
2950 #if (DEVELOPMENT || DEBUG)
2951 if (__improbable(!pkt_drop)) {
2952 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2953 }
2954 #endif /* DEVELOPMENT || DEBUG */
2955 if (pkt_drop) {
2956 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2957 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2958 cnt);
2959 }
2960 break;
2961 }
2962 case QP_PACKET: { /* native interface */
2963 /* ifnet_enqueue consumes packet */
2964 err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
2965 bytes, FALSE, &pkt_drop);
2966 pkt_head = NULL;
2967 #if (DEVELOPMENT || DEBUG)
2968 if (__improbable(!pkt_drop)) {
2969 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2970 }
2971 #endif /* DEVELOPMENT || DEBUG */
2972 if (pkt_drop) {
2973 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2974 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2975 cnt);
2976 }
2977 break;
2978 }
2979 default:
2980 err = EINVAL;
2981 VERIFY(0);
2982 /* NOTREACHED */
2983 __builtin_unreachable();
2984 }
2985
2986 return err;
2987 }
2988
2989 /*
2990 * This code path needs to be kept for interfaces without logical link support.
2991 */
2992 static void
2993 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2994 bool chain, uint32_t cnt, uint32_t bytes)
2995 {
2996 struct __kern_packet *pkt, *tail, *tpkt;
2997 flowadv_idx_t flow_adv_idx;
2998 bool flowadv_cap;
2999 flowadv_token_t flow_adv_token;
3000 int err;
3001
3002 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3003 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3004
3005 if (chain) {
3006 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3007 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3008 KPKTQ_INIT(&fe->fe_tx_pktq);
3009 if (pkt == NULL) {
3010 return;
3011 }
3012 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3013 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3014 flow_adv_token = pkt->pkt_flow_token;
3015
3016 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3017 DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, int, err);
3018 } else {
3019 uint32_t c = 0, b = 0;
3020
3021 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3022 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3023
3024 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3025 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3026 flow_adv_token = pkt->pkt_flow_token;
3027
3028 c++;
3029 b += pkt->pkt_length;
3030 err = classq_enqueue_flow_single(fsw, pkt);
3031 }
3032 ASSERT(c == cnt);
3033 ASSERT(b == bytes);
3034 DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3035 int, err);
3036 }
3037 }
3038
3039 /*
3040 * Logical link code path
3041 */
3042 static void
3043 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3044 bool chain, uint32_t cnt, uint32_t bytes)
3045 {
3046 struct __kern_packet *pkt, *tail;
3047 flowadv_idx_t flow_adv_idx;
3048 bool flowadv_cap;
3049 flowadv_token_t flow_adv_token;
3050 uint32_t flowctl = 0, dropped = 0;
3051 int err;
3052
3053 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3054 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3055
3056 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3057 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3058 KPKTQ_INIT(&fe->fe_tx_pktq);
3059 if (pkt == NULL) {
3060 return;
3061 }
3062 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3063 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3064 flow_adv_token = pkt->pkt_flow_token;
3065
3066 err = netif_qset_enqueue(fe->fe_qset, chain, pkt, tail, cnt, bytes,
3067 &flowctl, &dropped);
3068
3069 if (__improbable(err != 0) && dropped > 0) {
3070 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
3071 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, dropped);
3072 }
3073 }
3074
3075 static void
3076 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3077 {
3078 #pragma unused(fsw)
3079 /* finalize here; no more changes to buflets after classq */
3080 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3081 kern_packet_t ph = SK_PTR_ENCODE(pkt,
3082 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3083 int err = __packet_finalize(ph);
3084 VERIFY(err == 0);
3085 }
3086 }
3087
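/*
 * Tx-side route processing: on top of the common viability checks,
 * resolve the flow route (filling in L2 information) for each packet
 * and run the framing callback.  Packets held back by the resolver
 * (EJUSTRETURN) are released here; a stale cached llinfo is used if
 * resolution is still in progress.
 */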
3088 static bool
3089 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3090 {
3091 struct flow_route *fr = fe->fe_route;
3092 int err;
3093
3094 ASSERT(fr != NULL);
3095
3096 if (__improbable(!dp_flow_route_process(fsw, fe))) {
3097 return false;
3098 }
3099 if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
3100 flow_qset_select_dynamic(fsw, fe, TRUE);
3101 }
3102
3103 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
3104 _fsw_error35_handler, 1, fr, NULL, NULL);
3105 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3106 _fsw_error36_handler, 1, fr, NULL);
3107
3108 /*
3109 * See if we need to resolve the flow route; note the test against
3110 * fr_flags here is done without any lock for performance. Thus
3111 * it's possible that we race against the thread performing route
3112 * event updates for a packet (which is OK). In any case we should
3113 * not have any assertion on fr_flags value(s) due to the lack of
3114 * serialization.
3115 */
3116 if (fr->fr_flags & FLOWRTF_RESOLVED) {
3117 goto frame;
3118 }
3119
3120 struct __kern_packet *pkt, *tpkt;
3121 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3122 err = fsw->fsw_resolve(fsw, fr, pkt);
3123 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3124 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3125 /*
3126 * If resolver returns EJUSTRETURN then we drop the pkt as the
3127 * resolver should have converted the pkt into mbuf (or
3128 * detached the attached mbuf from pkt) and added it to the
3129 * llinfo queue. If we do have a cached llinfo, then proceed
3130 * to using it even though it may be stale (very unlikely)
3131 * while the resolution is in progress.
3132 * Otherwise, any other error results in dropping pkt.
3133 */
3134 if (err == EJUSTRETURN) {
3135 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3136 pp_free_packet_single(pkt);
3137 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3138 continue;
3139 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3140 /* use existing llinfo */
3141 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3142 } else if (err != 0) {
3143 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3144 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
3145 DROPTAP_FLAG_L2_MISSING);
3146 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3147 continue;
3148 }
3149 }
3150
3151 frame:
3152 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3153 if (fsw->fsw_frame != NULL) {
3154 fsw->fsw_frame(fsw, fr, pkt);
3155 }
3156 }
3157
3158 return true;
3159 }
3160
3161 static void
3162 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3163 {
3164 #pragma unused(fsw)
3165 struct __kern_packet *pkt, *tpkt;
3166 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3167 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3168 /* listener is only allowed TCP RST */
3169 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3170 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3171 flow_track_abort_tcp(fe, NULL, pkt);
3172 } else {
3173 char *addr;
3174
3175 MD_BUFLET_ADDR_ABS(pkt, addr);
3176 SK_ERR("listener flow sends non-RST packet %s",
3177 sk_dump(sk_proc_name_address(current_proc()),
3178 addr, __packet_get_real_data_length(pkt), 128, NULL, 0));
3179 }
3180 pp_free_packet_single(pkt);
3181 }
3182 }
3183
3184 static void
3185 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3186 volatile uint64_t *rt_ts, ifnet_t ifp, uint64_t now)
3187 {
3188 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3189 pkt->pkt_timestamp = now;
3190 }
3191 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3192
3193 /*
3194 * If the packet service class is not background,
3195 * update the timestamps on the interface, as well as
3196 * the ones in nexus-wide advisory to indicate recent
3197 * activity on a foreground flow.
3198 */
3199 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3200 ifp->if_fg_sendts = (uint32_t)_net_uptime;
3201 if (fg_ts != NULL) {
3202 *fg_ts = _net_uptime;
3203 }
3204 }
3205 if (pkt->pkt_pflags & PKT_F_REALTIME) {
3206 ifp->if_rt_sendts = (uint32_t)_net_uptime;
3207 if (rt_ts != NULL) {
3208 *rt_ts = _net_uptime;
3209 }
3210 }
3211 }
3212
3213 static bool
3214 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw)
3215 {
3216 return fsw_chain_enqueue != 0 &&
3217 fsw->fsw_ifp->if_output_netem == NULL &&
3218 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0;
3219 }
3220
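/*
 * Default per-flow Tx processing: run flow tracking, stamp AQM and flow
 * advisory metadata, finalize (and optionally snoop) each packet, then
 * enqueue the batch either as a single chain or packet by packet,
 * depending on interface capabilities and service-class uniformity.
 */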
3221 void
3222 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3223 uint32_t flags)
3224 {
3225 struct pktq dropped_pkts;
3226 bool chain, same_svc = true;
3227 bool gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3228 uint32_t cnt = 0, bytes = 0;
3229 volatile struct sk_nexusadv *nxadv = NULL;
3230 volatile uint64_t *fg_ts = NULL;
3231 volatile uint64_t *rt_ts = NULL;
3232 uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3233 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3234 uint16_t line = 0;
3235 uint32_t svc = 0;
3236 struct timespec now;
3237 uint64_t now_nsec = 0;
3238
3239 KPKTQ_INIT(&dropped_pkts);
3240 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3241 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3242 dp_listener_flow_tx_process(fsw, fe);
3243 return;
3244 }
3245 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3246 SK_RDERR(5, "Tx route bad");
3247 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3248 KPKTQ_LEN(&fe->fe_tx_pktq));
3249 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3250 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
3251 line = __LINE__;
3252 goto done;
3253 }
3254 chain = fsw_chain_enqueue_enabled(fsw) && KPKTQ_LEN(&fe->fe_tx_pktq) > 1;
3255 if (chain) {
3256 nanouptime(&now);
3257 net_timernsec(&now, &now_nsec);
3258 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3259 if (nxadv != NULL) {
3260 fg_ts = &nxadv->nxadv_fg_sendts;
3261 rt_ts = &nxadv->nxadv_rt_sendts;
3262 }
3263 }
3264
3265 struct __kern_packet *pkt, *tpkt;
3266 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3267 int err = 0;
3268 if (svc == 0) {
3269 svc = pkt->pkt_svc_class;
3270 }
3271
3272 err = flow_pkt_track(fe, pkt, false);
3273 if (__improbable(err != 0)) {
3274 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3275 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3276 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3277 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
3278 DROPTAP_FLAG_L2_MISSING);
3279 continue;
3280 }
3281 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3282 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3283
3284 /* set AQM related values for outgoing packet */
3285 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3286 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3287 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3288 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3289 } else {
3290 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3291 }
3292 _UUID_CLEAR(pkt->pkt_flow_id);
3293 pkt->pkt_flow_token = fe->fe_flowid;
3294 pkt->pkt_pflags |= PKT_F_FLOW_ID;
3295 pkt->pkt_qset_idx = qset_idx;
3296 pkt->pkt_policy_id = fe->fe_policy_id;
3297 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3298
3299 /*
3300 * The same code is exercised per packet for the non-chain case
3301 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3302 * re-walking the chain later.
3303 */
3304 if (chain && (gso || same_svc)) {
3305 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp, now_nsec);
3306 }
3307 /* mark packet tos/svc_class */
3308 fsw_qos_mark(fsw, fe, pkt);
3309
3310 tx_finalize_packet(fsw, pkt);
3311 bytes += pkt->pkt_length;
3312 cnt++;
3313
3314 same_svc = (same_svc && (svc == pkt->pkt_svc_class));
3315 /*
3316 * we are using the first 4 bytes of flow_id as the AQM flow
3317 * identifier.
3318 */
3319 ASSERT(!uuid_is_null(pkt->pkt_flow_id));
3320
3321 if (__improbable(pkt->pkt_trace_id != 0)) {
3322 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
3323 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
3324 }
3325 }
3326
3327 /* snoop after it's finalized */
3328 if (__improbable(pktap_total_tap_count != 0)) {
3329 fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
3330 }
3331
3332 chain = chain && (gso || same_svc);
3333 if (fe->fe_qset != NULL) {
3334 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3335 } else {
3336 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3337 }
3338 done:
3339 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
3340 }
3341
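/*
 * Non-first IP fragments carry no L4 header, so they cannot be looked
 * up by 5-tuple; they are matched to the previous packet's flow entry
 * using the IP fragment id instead.
 */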
3342 static struct flow_entry *
3343 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3344 struct flow_entry *prev_fe, struct __kern_packet *pkt)
3345 {
3346 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3347
3348 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3349 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3350 SK_ERR("%s(%d) invalid zero fragment id",
3351 sk_proc_name_address(current_proc()),
3352 sk_proc_pid(current_proc()));
3353 return NULL;
3354 }
3355
3356 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
3357 "%s(%d) continuation frag, id %u",
3358 sk_proc_name_address(current_proc()),
3359 sk_proc_pid(current_proc()),
3360 pkt->pkt_flow_ip_frag_id);
3361 if (__improbable(prev_fe == NULL ||
3362 !prev_fe->fe_tx_is_cont_frag)) {
3363 SK_ERR("%s(%d) unexpected continuation frag",
3364 sk_proc_name_address(current_proc()),
3365 sk_proc_pid(current_proc()),
3366 pkt->pkt_flow_ip_frag_id);
3367 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3368 return NULL;
3369 }
3370 if (__improbable(pkt->pkt_flow_ip_frag_id !=
3371 prev_fe->fe_tx_frag_id)) {
3372 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3373 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
3374 sk_proc_name_address(current_proc()),
3375 sk_proc_pid(current_proc()),
3376 pkt->pkt_flow_ip_frag_id,
3377 prev_fe->fe_tx_frag_id);
3378 return NULL;
3379 }
3380
3381 return prev_fe;
3382 }
3383
3384 static struct flow_entry *
3385 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3386 struct flow_entry *prev_fe)
3387 {
3388 struct flow_entry *__single fe;
3389
3390 fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3391 if (__improbable(fe == NULL)) {
3392 goto done;
3393 }
3394
3395 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3396 SK_RDERR(5, "Tx flow torn down");
3397 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3398 flow_entry_release(&fe);
3399 goto done;
3400 }
3401
3402 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3403 null_func);
3404
3405 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3406 uuid_string_t flow_id_str, pkt_id_str;
3407 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3408 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3409 SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
3410 flow_entry_release(&fe);
3411 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3412 }
3413
3414 done:
3415 return fe;
3416 }
3417
3418 static inline void
3419 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3420 uint32_t flags)
3421 {
3422 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3423 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3424
3425 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3426 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3427
3428 /* flow related processing (default, agg, etc.) */
3429 fe->fe_tx_process(fsw, fe, flags);
3430
3431 KPKTQ_FINI(&fe->fe_tx_pktq);
3432 }
3433
3434 #if SK_LOG
3435 static void
3436 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3437 {
3438 char *pkt_buf;
3439 uint32_t pkt_len;
3440
3441 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3442 pkt_len = __packet_get_real_data_length(pkt);
3443 SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
3444 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
3445 128, NULL, 0));
3446 }
3447 #else /* !SK_LOG */
3448 #define dp_tx_log_pkt(...)
3449 #endif /* !SK_LOG */
3450
3451 static inline struct ifnet *
3452 fsw_datamov_begin(struct nx_flowswitch *fsw)
3453 {
3454 struct ifnet *ifp;
3455
3456 ifp = fsw->fsw_ifp;
3457 if (!ifnet_datamov_begin(ifp)) {
3458 DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3459 return NULL;
3460 }
3461 return ifp;
3462 }
3463
3464 static inline void
3465 fsw_datamov_end(struct nx_flowswitch *fsw)
3466 {
3467 ifnet_datamov_end(fsw->fsw_ifp);
3468 }
3469
3470 static void
3471 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3472 {
3473 struct __kern_packet *spkt, *pkt;
3474 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3475 struct flow_entry *__single fe, *__single prev_fe;
3476 struct pktq dropped_pkts, dpktq;
3477 struct nexus_adapter *dev_na;
3478 struct kern_pbufpool *dev_pp;
3479 struct ifnet *ifp = NULL;
3480 sa_family_t af;
3481 uint32_t n_pkts, n_flows = 0;
3482 boolean_t do_pacing = FALSE;
3483 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3484 uint16_t line = 0;
3485
3486 int err;
3487 KPKTQ_INIT(&dpktq);
3488 KPKTQ_INIT(&dropped_pkts);
3489 n_pkts = KPKTQ_LEN(spktq);
3490
3491 FSW_RLOCK(fsw);
3492 if (__improbable(FSW_QUIESCED(fsw))) {
3493 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3494 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3495 KPKTQ_CONCAT(&dropped_pkts, spktq);
3496 reason = DROP_REASON_FSW_QUIESCED;
3497 line = __LINE__;
3498 goto done;
3499 }
3500 dev_na = fsw->fsw_dev_ch->ch_na;
3501 if (__improbable(dev_na == NULL)) {
3502 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3503 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3504 KPKTQ_CONCAT(&dropped_pkts, spktq);
3505 reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
3506 line = __LINE__;
3507 goto done;
3508 }
3509 ifp = fsw_datamov_begin(fsw);
3510 if (ifp == NULL) {
3511 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3512 KPKTQ_CONCAT(&dropped_pkts, spktq);
3513 reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
3514 line = __LINE__;
3515 goto done;
3516 }
3517
3518 /* batch allocate enough packets */
3519 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3520
3521 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3522 NULL, SKMEM_NOSLEEP);
3523 #if DEVELOPMENT || DEBUG
3524 if (__probable(err != ENOMEM)) {
3525 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3526 }
3527 #endif /* DEVELOPMENT || DEBUG */
3528 if (__improbable(err == ENOMEM)) {
3529 ASSERT(KPKTQ_EMPTY(&dpktq));
3530 KPKTQ_CONCAT(&dropped_pkts, spktq);
3531 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3532 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3533 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
3534 line = __LINE__;
3535 goto done;
3536 } else if (__improbable(err == EAGAIN)) {
3537 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3538 (n_pkts - KPKTQ_LEN(&dpktq)));
3539 FSW_STATS_ADD(FSW_STATS_DROP,
3540 (n_pkts - KPKTQ_LEN(&dpktq)));
3541 }
3542
3543 n_pkts = KPKTQ_LEN(&dpktq);
3544 prev_fe = NULL;
3545 KPKTQ_FOREACH(spkt, spktq) {
3546 if (n_pkts == 0) {
3547 break;
3548 }
3549 --n_pkts;
3550
3551 KPKTQ_DEQUEUE(&dpktq, pkt);
3552 ASSERT(pkt != NULL);
3553 err = dp_copy_to_dev(fsw, spkt, pkt);
3554 if (__improbable(err != 0)) {
3555 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
3556 DROPTAP_FLAG_L2_MISSING);
3557 continue;
3558 }
3559
3560 do_pacing |= __packet_get_tx_timestamp(SK_PKT2PH(pkt)) != 0;
3561 af = fsw_ip_demux(fsw, pkt);
3562 if (__improbable(af == AF_UNSPEC)) {
3563 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3564 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3565 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
3566 DROPTAP_FLAG_L2_MISSING);
3567 continue;
3568 }
3569
3570 err = flow_pkt_classify(pkt, ifp, af, false);
3571 if (__improbable(err != 0)) {
3572 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3573 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3574 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
3575 DROPTAP_FLAG_L2_MISSING);
3576 continue;
3577 }
3578
3579 if (__improbable(pkt->pkt_flow_ip_is_frag &&
3580 !pkt->pkt_flow_ip_is_first_frag)) {
3581 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3582 if (__probable(fe != NULL)) {
3583 flow_entry_retain(fe);
3584 goto flow_batch;
3585 } else {
3586 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3587 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
3588 DROPTAP_FLAG_L2_MISSING);
3589 continue;
3590 }
3591 }
3592
3593 fe = tx_lookup_flow(fsw, pkt, prev_fe);
3594 if (__improbable(fe == NULL)) {
3595 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3596 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
3597 DROPTAP_FLAG_L2_MISSING);
3598 prev_fe = NULL;
3599 continue;
3600 }
3601 flow_batch:
3602 tx_flow_batch_packet(&fes, fe, pkt);
3603 prev_fe = fe;
3604 }
3605
3606 struct flow_entry *tfe = NULL;
3607 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3608 tx_flow_process(fsw, fe, 0);
3609 TAILQ_REMOVE(&fes, fe, fe_tx_link);
3610 fe->fe_tx_is_cont_frag = false;
3611 fe->fe_tx_frag_id = 0;
3612 flow_entry_release(&fe);
3613 n_flows++;
3614 }
3615
3616 done:
3617 FSW_RUNLOCK(fsw);
3618 if (n_flows > 0) {
3619 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3620 }
3621 if (ifp != NULL) {
3622 fsw_datamov_end(fsw);
3623 }
3624 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
3625 KPKTQ_FINI(&dropped_pkts);
3626 KPKTQ_FINI(&dpktq);
3627 }
3628
3629 static sa_family_t
3630 get_tso_af(struct __kern_packet *pkt)
3631 {
3632 packet_tso_flags_t tso_flags;
3633
3634 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3635 if (tso_flags == PACKET_TSO_IPV4) {
3636 return AF_INET;
3637 } else if (tso_flags == PACKET_TSO_IPV6) {
3638 return AF_INET6;
3639 } else {
3640 panic("invalid tso flags: 0x%x\n", tso_flags);
3641 /* NOTREACHED */
3642 __builtin_unreachable();
3643 }
3644 }
3645
3646 static inline void
3647 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3648 {
3649 struct tcphdr *__single tcp = tcphdr;
3650
3651 DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3652 void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3653 pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3654 pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3655 pkt->pkt_flow_tcp_flags = tcp->th_flags;
3656 pkt->pkt_flow_tcp_seq = tcp->th_seq;
3657 pkt->pkt_flow_ulen = payload_sz;
3658 }
3659
3660 static int
3661 do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3662 struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3663 struct pktq *gso_pktq)
3664 {
3665 ifnet_t ifp = fsw->fsw_ifp;
3666 struct __kern_packet *pkt = first_pkt;
3667 uint8_t proto = pkt->pkt_flow_ip_proto;
3668 uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3669 uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3670 uint16_t total_hlen = ip_hlen + tcp_hlen;
3671 uint16_t mtu = (uint16_t)ifp->if_mtu;
3672 uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3673 uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3674 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3675 kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3676 uint8_t *orig_pkt_baddr;
3677 struct tcphdr *tcp;
3678 struct ip *ip;
3679 struct ip6_hdr *ip6;
3680 uint32_t tcp_seq;
3681 uint16_t ipid;
3682 uint32_t pseudo_hdr_csum, bufsz;
3683 uint64_t pkt_tx_timestamp = 0;
3684
3685 ASSERT(headroom <= UINT8_MAX);
3686 if (proto != IPPROTO_TCP) {
3687 SK_ERR("invalid proto: %d", proto);
3688 DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3689 fsw, ifnet_t, ifp, uint8_t, proto);
3690 return EINVAL;
3691 }
3692 if (mss == 0 || mss > (mtu - total_hlen)) {
3693 SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3694 mss, mtu, total_hlen);
3695 DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3696 fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3697 uint32_t, total_hlen);
3698 return EINVAL;
3699 }
3700 bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3701 if ((headroom + total_hlen + mss) > bufsz) {
3702 SK_ERR("invalid args: headroom %d, total_hlen %d, "
3703 "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3704 DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3705 fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3706 total_hlen, uint16_t, mss, uint32_t, bufsz);
3707 return EINVAL;
3708 }
3709 n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
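/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the expression above is a ceiling division over the TCP payload, e.g.
 * with (total_len - total_hlen) == 4000 and mss == 1448, SK_ROUNDUP()
 * yields 4344 and n_pkts == 3 (segments of 1448 + 1448 + 1104 bytes).
 */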
3710
3711 ASSERT(pkt->pkt_headroom == headroom);
3712 ASSERT(pkt->pkt_length == total_len);
3713 ASSERT(pkt->pkt_l2_len == 0);
3714 ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3715 ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3716 pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3717 pkt->pkt_proto_seg_sz = 0;
3718 pkt->pkt_csum_flags = 0;
3719 MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3720 orig_pkt_baddr += orig_pkt->pkt_headroom;
3721
3722 if (af == AF_INET) {
3723 /*
3724 * XXX -fbounds-safety: can't avoid using forge unless we change
3725 * the flow metadata definition.
3726 */
3727 ip = __unsafe_forge_bidi_indexable(struct ip *,
3728 pkt->pkt_flow_ip_hdr, pkt->pkt_length);
3729 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3730 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3731 ipid = ip->ip_id;
3732 pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3733 pkt->pkt_flow_ipv4_dst.s_addr, 0);
3734 } else {
3735 ASSERT(af == AF_INET6);
3736 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3737 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3738 pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3739 &pkt->pkt_flow_ipv6_dst, 0);
3740 }
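/*
 * Editor's note (illustrative sketch, not part of the original source):
 * in_pseudo()/in6_pseudo() are called with a zero length/protocol above,
 * so pseudo_hdr_csum covers only the source and destination addresses;
 * the per-segment length and IPPROTO_TCP terms are added inside the loop
 * below once each segment's payload size is known.
 */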
3741 tcp_seq = ntohl(tcp->th_seq);
3742
3743 pkt_tx_timestamp = __packet_get_tx_timestamp(orig_ph);
3744
3745 for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3746 off += payload_sz) {
3747 uint8_t *baddr, *baddr0;
3748 uint32_t partial;
3749
3750 if (pkt == NULL) {
3751 n++;
3752 KPKTQ_DEQUEUE(dev_pktq, pkt);
3753 ASSERT(pkt != NULL);
3754 }
3755 MD_BUFLET_ADDR_ABS(pkt, baddr0);
3756 baddr = baddr0;
3757 baddr += headroom;
3758
3759 /* Copy headers from the original packet */
3760 if (n != 1) {
3761 ASSERT(pkt != first_pkt);
3762 pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3763 fsw_pkt_copy_metadata(first_pkt, pkt);
3764
3765 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3766 /* flow info still needs to be updated below */
3767 bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3768 sizeof(*pkt->pkt_flow));
3769 pkt->pkt_trace_id = 0;
3770 ASSERT(pkt->pkt_headroom == headroom);
3771 } else {
3772 METADATA_SET_LEN(pkt, 0, 0);
3773 }
3774 baddr += total_hlen;
3775
3776 /* copy tx timestamp from the original packet */
3777 __packet_set_tx_timestamp(SK_PKT2PH(pkt), pkt_tx_timestamp);
3778
3779 /* Copy/checksum the payload from the original packet */
3780 if (off + payload_sz > total_len) {
3781 payload_sz = (uint16_t)(total_len - off);
3782 }
3783 pkt_copypkt_sum(orig_ph,
3784 (uint16_t)(orig_pkt->pkt_headroom + off),
3785 SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3786 &partial, TRUE);
3787
3788 DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3789 ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3790 uint16_t, mss, uint32_t, partial);
3791 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3792
3793 /*
3794 * Adjust header information and fill in the missing fields.
3795 */
3796 if (af == AF_INET) {
3797 ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3798 tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3799
3800 if (n != n_pkts) {
3801 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3802 }
3803 if (n != 1) {
3804 tcp->th_flags &= ~TH_CWR;
3805 tcp->th_seq = htonl(tcp_seq);
3806 }
3807 update_flow_info(pkt, ip, tcp, payload_sz);
3808
3809 ip->ip_id = htons((ipid)++);
3810 ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3811 ip->ip_sum = 0;
3812 ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
3813 tcp->th_sum = 0;
3814
3815 partial = __packet_cksum(tcp, tcp_hlen, partial);
3816 partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3817 partial += pseudo_hdr_csum;
3818 ADDCARRY(partial);
3819 tcp->th_sum = ~(uint16_t)partial;
3820 } else {
3821 ASSERT(af == AF_INET6);
3822 ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
3823 tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3824
3825 if (n != n_pkts) {
3826 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3827 }
3828 if (n != 1) {
3829 tcp->th_flags &= ~TH_CWR;
3830 tcp->th_seq = htonl(tcp_seq);
3831 }
3832 update_flow_info(pkt, ip6, tcp, payload_sz);
3833
3834 ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3835 tcp->th_sum = 0;
3836 partial = __packet_cksum(tcp, tcp_hlen, partial);
3837 partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3838 partial += pseudo_hdr_csum;
3839 ADDCARRY(partial);
3840 tcp->th_sum = ~(uint16_t)partial;
3841 }
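/*
 * Editor's note (illustrative sketch, not part of the original source):
 * at this point "partial" accumulates the one's-complement sum of the
 * payload (returned by pkt_copypkt_sum()), the rewritten TCP header, the
 * pseudo-header length/protocol term and the precomputed address sum;
 * ADDCARRY() folds the carries and the complement is stored in th_sum.
 */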
3842 tcp_seq += payload_sz;
3843 METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3844 #if (DEVELOPMENT || DEBUG)
3845 struct __kern_buflet *bft;
3846 uint32_t blen;
3847 PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3848 blen = __buflet_get_data_length(bft);
3849 if (blen != total_hlen + payload_sz) {
3850 panic("blen (%d) != total_len + payload_sz (%d)\n",
3851 blen, total_hlen + payload_sz);
3852 }
3853 #endif /* DEVELOPMENT || DEBUG */
3854
3855 pkt->pkt_length = total_hlen + payload_sz;
3856 KPKTQ_ENQUEUE(gso_pktq, pkt);
3857 pkt = NULL;
3858
3859 /*
3860 * Note that at this point the packet is not yet finalized.
3861 * The finalization happens in dp_flow_tx_process() after
3862 * the framing is done.
3863 */
3864 }
3865 ASSERT(n == n_pkts);
3866 ASSERT(off == total_len);
3867 DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3868 uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3869 uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3870 return 0;
3871 }
3872
3873 static void
3874 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3875 struct pktq *gso_pktq)
3876 {
3877 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3878 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3879 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3880 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3881 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3882 KPKTQ_INIT(gso_pktq);
3883 } else {
3884 ASSERT(!TAILQ_EMPTY(fes));
3885 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3886 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3887 KPKTQ_INIT(gso_pktq);
3888 flow_entry_release(&fe);
3889 }
3890 }
3891
3892 static void
3893 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3894 uint32_t gso_pkts_estimate)
3895 {
3896 struct __kern_packet *spkt, *pkt;
3897 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3898 struct flow_entry *__single fe, *__single prev_fe;
3899 struct pktq dpktq;
3900 struct nexus_adapter *dev_na;
3901 struct kern_pbufpool *dev_pp;
3902 struct ifnet *ifp = NULL;
3903 sa_family_t af;
3904 uint32_t n_pkts, n_flows = 0;
3905 int err;
3906
3907 KPKTQ_INIT(&dpktq);
3908 n_pkts = KPKTQ_LEN(spktq);
3909
3910 FSW_RLOCK(fsw);
3911 if (__improbable(FSW_QUIESCED(fsw))) {
3912 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3913 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3914 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
3915 DROPTAP_FLAG_L2_MISSING);
3916 goto done;
3917 }
3918 dev_na = fsw->fsw_dev_ch->ch_na;
3919 if (__improbable(dev_na == NULL)) {
3920 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3921 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3922 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
3923 __LINE__, DROPTAP_FLAG_L2_MISSING);
3924 goto done;
3925 }
3926 ifp = fsw_datamov_begin(fsw);
3927 if (ifp == NULL) {
3928 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3929 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
3930 __LINE__, DROPTAP_FLAG_L2_MISSING);
3931 goto done;
3932 }
3933
3934 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3935
3936 /*
3937 * Batch allocate enough packets to perform GSO on all
3938 * packets in spktq.
3939 */
3940 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
3941 gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
3942 #if DEVELOPMENT || DEBUG
3943 if (__probable(err != ENOMEM)) {
3944 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3945 }
3946 #endif /* DEVELOPMENT || DEBUG */
3947 /*
3948 * We either get all packets or none. No partial allocations.
3949 */
3950 if (__improbable(err != 0)) {
3951 if (err == ENOMEM) {
3952 ASSERT(KPKTQ_EMPTY(&dpktq));
3953 } else {
3954 dp_free_pktq(fsw, &dpktq);
3955 }
3956 DTRACE_SKYWALK1(gso__no__mem, int, err);
3957 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
3958 __LINE__, DROPTAP_FLAG_L2_MISSING);
3959 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3960 SK_ERR("failed to alloc %u pkts from device pool",
3961 gso_pkts_estimate);
3962 goto done;
3963 }
3964 prev_fe = NULL;
3965 KPKTQ_FOREACH(spkt, spktq) {
3966 KPKTQ_DEQUEUE(&dpktq, pkt);
3967 ASSERT(pkt != NULL);
3968 /*
3969 * Copy only headers to the first packet of the GSO chain.
3970 * The headers will be used for classification below.
3971 */
3972 err = dp_copy_headers_to_dev(fsw, spkt, pkt);
3973 if (__improbable(err != 0)) {
3974 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
3975 DROPTAP_FLAG_L2_MISSING);
3976 DTRACE_SKYWALK2(copy__headers__failed,
3977 struct nx_flowswitch *, fsw,
3978 struct __kern_packet *, spkt);
3979 continue;
3980 }
3981 af = get_tso_af(pkt);
3982 ASSERT(af == AF_INET || af == AF_INET6);
3983
3984 err = flow_pkt_classify(pkt, ifp, af, false);
3985 if (__improbable(err != 0)) {
3986 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3987 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3988 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
3989 DROPTAP_FLAG_L2_MISSING);
3990 DTRACE_SKYWALK4(classify__failed,
3991 struct nx_flowswitch *, fsw,
3992 struct __kern_packet *, spkt,
3993 struct __kern_packet *, pkt,
3994 int, err);
3995 continue;
3996 }
3997 /*
3998 * GSO cannot be done on a fragment and it's a bug in user
3999 * space to mark a fragment as needing GSO.
4000 */
4001 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
4002 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
4003 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4004 DROPTAP_FLAG_L2_MISSING);
4005 DTRACE_SKYWALK3(is__frag,
4006 struct nx_flowswitch *, fsw,
4007 struct __kern_packet *, spkt,
4008 struct __kern_packet *, pkt);
4009 continue;
4010 }
4011 fe = tx_lookup_flow(fsw, pkt, prev_fe);
4012 if (__improbable(fe == NULL)) {
4013 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
4014 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
4015 DROPTAP_FLAG_L2_MISSING);
4016 DTRACE_SKYWALK3(lookup__failed,
4017 struct nx_flowswitch *, fsw,
4018 struct __kern_packet *, spkt,
4019 struct __kern_packet *, pkt);
4020 prev_fe = NULL;
4021 continue;
4022 }
4023 /*
4024 * Perform GSO on spkt using the flow information
4025 * obtained above.
4026 */
4027 struct pktq gso_pktq;
4028 KPKTQ_INIT(&gso_pktq);
4029 err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
4030 if (__probable(err == 0)) {
4031 tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
4032 prev_fe = fe;
4033 } else {
4034 DTRACE_SKYWALK1(gso__error, int, err);
4035 /* TODO: increment error stat */
4036 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
4037 DROPTAP_FLAG_L2_MISSING);
4038 flow_entry_release(&fe);
4039 prev_fe = NULL;
4040 }
4041 KPKTQ_FINI(&gso_pktq);
4042 }
4043 struct flow_entry *tfe = NULL;
4044 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
4045 /* Chain-enqueue can be used for GSO chains */
4046 tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
4047 TAILQ_REMOVE(&fes, fe, fe_tx_link);
4048 flow_entry_release(&fe);
4049 n_flows++;
4050 }
4051 done:
4052 FSW_RUNLOCK(fsw);
4053 if (n_flows > 0) {
4054 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
4055 }
4056 if (ifp != NULL) {
4057 fsw_datamov_end(fsw);
4058 }
4059
4060 /*
4061 * It's possible for packets to be left in dpktq because
4062 * gso_pkts_estimate is only an estimate. The actual number
4063 * of packets needed could be less.
4064 */
4065 uint32_t dpktq_len;
4066 if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
4067 DTRACE_SKYWALK2(leftover__dev__pkts,
4068 struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
4069 dp_free_pktq(fsw, &dpktq);
4070 }
4071 KPKTQ_FINI(&dpktq);
4072 }
4073
4074 static inline void
4075 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4076 struct proc *p)
4077 {
4078 #pragma unused(p)
4079 uint32_t total_pkts = 0, total_bytes = 0;
4080
4081 for (;;) {
4082 struct pktq pktq;
4083 KPKTQ_INIT(&pktq);
4084 uint32_t n_bytes;
4085 fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4086 if (n_bytes == 0) {
4087 break;
4088 }
4089 total_pkts += KPKTQ_LEN(&pktq);
4090 total_bytes += n_bytes;
4091
4092 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4093 fsw_receive(fsw, &pktq);
4094 } else {
4095 fsw_dev_input_netem_enqueue(fsw, &pktq);
4096 }
4097 KPKTQ_FINI(&pktq);
4098 }
4099
4100 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4101 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4102 uint32_t, total_bytes);
4103
4104 /* compute mitigation rate for delivered traffic */
4105 if (__probable(r->ckr_netif_mit_stats != NULL)) {
4106 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4107 }
4108 }
4109
4110 static inline void
4111 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4112 struct proc *p)
4113 {
4114 #pragma unused(p)
4115 static packet_trace_id_t trace_id = 0;
4116 uint32_t total_pkts = 0, total_bytes = 0;
4117
4118 for (;;) {
4119 struct pktq pktq;
4120 KPKTQ_INIT(&pktq);
4121 uint32_t n_bytes;
4122 uint32_t gso_pkts_estimate = 0;
4123
4124 fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4125 &gso_pkts_estimate);
4126 if (n_bytes == 0) {
4127 break;
4128 }
4129 total_pkts += KPKTQ_LEN(&pktq);
4130 total_bytes += n_bytes;
4131
4132 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
4133 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4134 KPKTQ_FIRST(&pktq)->pkt_trace_id);
4135
4136 if (gso_pkts_estimate > 0) {
4137 dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4138 } else {
4139 dp_tx_pktq(fsw, &pktq);
4140 }
4141 dp_free_pktq(fsw, &pktq);
4142 KPKTQ_FINI(&pktq);
4143 }
4144 kr_update_stats(r, total_pkts, total_bytes);
4145
4146 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4147 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4148 uint32_t, total_bytes);
4149 }
4150
4151 void
4152 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4153 struct proc *p)
4154 {
4155 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4156
4157 ASSERT(sk_is_sync_protected());
4158 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4159 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4160
4161 if (vpna->vpna_nx_port == FSW_VP_DEV) {
4162 fsw_dev_ring_flush(fsw, r, p);
4163 } else {
4164 fsw_user_ring_flush(fsw, r, p);
4165 }
4166 }
4167
4168 int
4169 fsw_dp_ctor(struct nx_flowswitch *fsw)
4170 {
4171 uint32_t fe_cnt = fsw_fe_table_size;
4172 uint32_t fob_cnt = fsw_flow_owner_buckets;
4173 uint32_t frb_cnt = fsw_flow_route_buckets;
4174 uint32_t frib_cnt = fsw_flow_route_id_buckets;
4175 struct kern_nexus *nx = fsw->fsw_nx;
4176 char name[64];
4177 const char *__null_terminated fsw_name = NULL;
4178 int error = 0;
4179
4180 /* just in case */
4181 if (fe_cnt == 0) {
4182 fe_cnt = NX_FSW_FE_TABLESZ;
4183 ASSERT(fe_cnt != 0);
4184 }
4185 if (fob_cnt == 0) {
4186 fob_cnt = NX_FSW_FOB_HASHSZ;
4187 ASSERT(fob_cnt != 0);
4188 }
4189 if (frb_cnt == 0) {
4190 frb_cnt = NX_FSW_FRB_HASHSZ;
4191 ASSERT(frb_cnt != 0);
4192 }
4193 if (frib_cnt == 0) {
4194 frib_cnt = NX_FSW_FRIB_HASHSZ;
4195 ASSERT(frib_cnt != 0);
4196 }
4197
4198 /* make sure fe_cnt is a power of two, else round up */
4199 if ((fe_cnt & (fe_cnt - 1)) != 0) {
4200 fe_cnt--;
4201 fe_cnt |= (fe_cnt >> 1);
4202 fe_cnt |= (fe_cnt >> 2);
4203 fe_cnt |= (fe_cnt >> 4);
4204 fe_cnt |= (fe_cnt >> 8);
4205 fe_cnt |= (fe_cnt >> 16);
4206 fe_cnt++;
4207 }
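/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the decrement/OR-shift/increment sequence above rounds a value up to
 * the next power of two, e.g. fe_cnt == 1500 -> 1499 (0x5db) -> 0x7ff
 * after the shifts smear the top bit downward -> 2048 after the final
 * increment. The same idiom is repeated for frb_cnt below.
 */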
4208
4209 /* make sure frb_cnt is a power of two, else round up */
4210 if ((frb_cnt & (frb_cnt - 1)) != 0) {
4211 frb_cnt--;
4212 frb_cnt |= (frb_cnt >> 1);
4213 frb_cnt |= (frb_cnt >> 2);
4214 frb_cnt |= (frb_cnt >> 4);
4215 frb_cnt |= (frb_cnt >> 8);
4216 frb_cnt |= (frb_cnt >> 16);
4217 frb_cnt++;
4218 }
4219
4220 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4221 &nexus_lock_attr);
4222 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4223 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4224 TAILQ_INIT(&fsw->fsw_linger_head);
4225
4226 fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4227 error = nx_advisory_alloc(nx, fsw_name,
4228 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4229 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4230 if (error != 0) {
4231 fsw_dp_dtor(fsw);
4232 return error;
4233 }
4234
4235 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4236 if (fsw->fsw_flow_mgr == NULL) {
4237 fsw_dp_dtor(fsw);
4238 return ENOMEM;
4239 }
4240
4241 /* generic name; will be customized upon ifattach */
4242 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4243 FSW_REAP_THREADNAME, name, "");
4244
4245 if (kernel_thread_start(fsw_reap_thread_func, fsw,
4246 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4247 panic_plain("%s: can't create thread", __func__);
4248 /* NOTREACHED */
4249 __builtin_unreachable();
4250 }
4251 /* this must not fail */
4252 VERIFY(fsw->fsw_reap_thread != NULL);
4253
4254 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
4255
4256
4257 return error;
4258 }
4259
4260 void
4261 fsw_dp_dtor(struct nx_flowswitch *fsw)
4262 {
4263 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
4264 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
4265 uint32_t i = 0;
4266
4267 #if (DEVELOPMENT || DEBUG)
4268 if (fsw->fsw_rps_threads != NULL) {
4269 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4270 fsw_rps_thread_join(fsw, i);
4271 }
4272 kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4273 fsw->fsw_rps_threads);
4274 }
4275 #endif /* DEVELOPMENT || DEBUG */
4276
4277 nx_advisory_free(fsw->fsw_nx);
4278
4279 if (fsw->fsw_reap_thread != THREAD_NULL) {
4280 /* signal thread to begin self-termination */
4281 lck_mtx_lock(&fsw->fsw_reap_lock);
4282 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4283
4284 /*
4285 * And wait for thread to terminate; use another
4286 * wait channel here other than fsw_reap_flags to
4287 * make it more explicit. In the event the reaper
4288 * thread misses a wakeup, we'll try again once
4289 * every second (except for the first time).
4290 */
4291 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4292 uint64_t t = 0;
4293
4294 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4295 clock_absolutetime_interval_to_deadline(t, &t);
4296 ASSERT(t != 0);
4297
4298 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4299 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4300 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4301 }
4302 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
4303 THREAD_UNINT, t);
4304 lck_mtx_unlock(&fsw->fsw_reap_lock);
4305 thread_block(THREAD_CONTINUE_NULL);
4306 lck_mtx_lock(&fsw->fsw_reap_lock);
4307 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4308 }
4309 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4310 lck_mtx_unlock(&fsw->fsw_reap_lock);
4311 fsw->fsw_reap_thread = THREAD_NULL;
4312 }
4313
4314 /* free any remaining flow entries in the linger list */
4315 fsw_linger_purge(fsw);
4316
4317 if (fsw->fsw_flow_mgr != NULL) {
4318 flow_mgr_destroy(fsw->fsw_flow_mgr);
4319 fsw->fsw_flow_mgr = NULL;
4320 }
4321
4322
4323 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4324 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4325 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4326 }
4327
4328 void
4329 fsw_linger_insert(struct flow_entry *fe)
4330 {
4331 struct nx_flowswitch *fsw = fe->fe_fsw;
4332 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4333 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4334 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4335 fe->fe_flags, FLOWENTF_BITS);
4336
4337 net_update_uptime();
4338
4339 ASSERT(flow_entry_refcnt(fe) >= 1);
4340 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4341 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4342 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4343 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4344 ASSERT(fe->fe_linger_wait != 0);
4345 fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
4346 os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4347
4348 lck_mtx_lock_spin(&fsw->fsw_linger_lock);
4349 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4350 fsw->fsw_linger_cnt++;
4351 VERIFY(fsw->fsw_linger_cnt != 0);
4352 lck_mtx_unlock(&fsw->fsw_linger_lock);
4353
4354 fsw_reap_sched(fsw);
4355 }
4356
4357 static void
4358 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4359 struct flow_entry *fe)
4360 {
4361 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4362 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4363 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4364 fe->fe_flags, FLOWENTF_BITS);
4365
4366 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4367 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4368 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4369 os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4370
4371 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4372 flow_entry_release(&fe);
4373 }
4374
4375 static void
4376 fsw_linger_remove(struct flow_entry *fe)
4377 {
4378 struct nx_flowswitch *fsw = fe->fe_fsw;
4379
4380 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4381
4382 fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4383 VERIFY(fsw->fsw_linger_cnt != 0);
4384 fsw->fsw_linger_cnt--;
4385 }
4386
4387 void
4388 fsw_linger_purge(struct nx_flowswitch *fsw)
4389 {
4390 struct flow_entry *fe, *tfe;
4391
4392 lck_mtx_lock(&fsw->fsw_linger_lock);
4393 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4394 fsw_linger_remove(fe);
4395 }
4396 ASSERT(fsw->fsw_linger_cnt == 0);
4397 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4398 lck_mtx_unlock(&fsw->fsw_linger_lock);
4399 }
4400
4401 static void
4402 fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
4403 {
4404 struct kern_nexus *nx;
4405 uint64_t now = _net_uptime;
4406
4407 nx = fsw->fsw_nx;
4408
4409 /* Walk through all channels and check for Rx stall condition */
4410 /* uncrustify doesn't handle C blocks properly */
4411 /* BEGIN IGNORE CODESTYLE */
4412 nx_port_foreach(nx, ^(nexus_port_t nxport) {
4413 struct nexus_adapter *na = nx_port_get_na(nx, nxport);
4414 uint64_t elapsed, enqueue_ts, dequeue_ts;
4415 struct __kern_channel_ring *ring;
4416 struct kern_channel *ch;
4417 struct proc *p;
4418
4419 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4420 return;
4421 }
4422 ch = (struct kern_channel *)na->na_private;
4423 if (ch == NULL) {
4424 return;
4425 }
4426 ring = KR_SINGLE(na->na_rx_rings);
4427 enqueue_ts = ring->ckr_rx_enqueue_ts;
4428 dequeue_ts = ring->ckr_rx_dequeue_ts;
4429 /* Elapsed time since last Rx enqueue */
4430 elapsed = now - enqueue_ts;
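/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the ring is considered stalled when packets were enqueued to the user
 * channel's Rx ring (enqueue_ts advanced) but the owning process has not
 * dequeued since then (dequeue_ts < enqueue_ts) for longer than
 * fsw_rx_stall_thresh seconds of net uptime.
 */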
4431 if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
4432 p = proc_find(ch->ch_pid);
4433 if (p == NULL) {
4434 return;
4435 }
4436 if (fsw_rx_stall_defunct) {
4437 kern_channel_defunct(p, ch);
4438 }
4439 proc_rele(p);
4440 DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
4441 struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
4442 FSW_STATS_INC(FSW_STATS_RX_STALL);
4443 SK_ERR("Rx stall detected in proc %s(%llu) (%s): "
4444 "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
4445 "defunct: %s",
4446 ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
4447 elapsed, now, enqueue_ts, dequeue_ts,
4448 fsw_rx_stall_defunct ? "yes" : "no");
4449 }
4450 });
4451 /* END IGNORE CODESTYLE */
4452 }
4453
4454 void
4455 fsw_reap_sched(struct nx_flowswitch *fsw)
4456 {
4457 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4458 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4459 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4460 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4461 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4462 }
4463 lck_mtx_unlock(&fsw->fsw_reap_lock);
4464 }
4465
4466 __attribute__((noreturn))
4467 static void
4468 fsw_reap_thread_func(void *v, wait_result_t w)
4469 {
4470 #pragma unused(w)
4471 struct nx_flowswitch *__single fsw = v;
4472
4473 ASSERT(fsw->fsw_reap_thread == current_thread());
4474 /*
4475 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
4476 * checks to ensure source contains the null terminator, by doing a
4477 * linear scan of the string.
4478 */
4479 thread_set_thread_name(current_thread(),
4480 __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));
4481
4482 net_update_uptime();
4483
4484 lck_mtx_lock(&fsw->fsw_reap_lock);
4485 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4486 (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
4487 lck_mtx_unlock(&fsw->fsw_reap_lock);
4488 thread_block_parameter(fsw_reap_thread_cont, fsw);
4489 /* NOTREACHED */
4490 __builtin_unreachable();
4491 }
4492
4493 __attribute__((noreturn))
4494 static void
4495 fsw_reap_thread_cont(void *v, wait_result_t wres)
4496 {
4497 struct nx_flowswitch *__single fsw = v;
4498 boolean_t low;
4499 uint64_t t = 0;
4500
4501 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4502
4503 lck_mtx_lock(&fsw->fsw_reap_lock);
4504 if (__improbable(wres == THREAD_INTERRUPTED ||
4505 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4506 goto terminate;
4507 }
4508
4509 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4510 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4511 lck_mtx_unlock(&fsw->fsw_reap_lock);
4512
4513 net_update_uptime();
4514
4515 /* prevent detach from happening while we're here */
4516 if (!fsw_detach_barrier_add(fsw)) {
4517 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4518 t = 0;
4519 } else {
4520 uint32_t fe_nonviable, fe_freed, fe_aborted;
4521 uint32_t fr_freed, fr_resid = 0;
4522 struct ifnet *ifp = fsw->fsw_ifp;
4523 uint64_t i = FSW_REAP_IVAL;
4524 uint64_t now = _net_uptime;
4525 uint64_t last;
4526
4527 ASSERT(fsw->fsw_ifp != NULL);
4528
4529 /*
4530 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4531 */
4532 fe_nonviable = fsw_process_deferred(fsw);
4533
4534 /*
4535 * Pass 2: remove any expired lingering flows.
4536 */
4537 fe_freed = fsw_process_linger(fsw, &fe_aborted);
4538
4539 /*
4540 * Pass 3: prune idle flow routes.
4541 */
4542 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4543 ifp, &fr_resid);
4544
4545 /*
4546 * Pass 4: prune flow table
4547 *
4548 */
4549 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4550
4551 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4552 "fe_aborted %u fr_freed %u/%u",
4553 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4554 (fe_nonviable + fsw->fsw_pending_nonviable),
4555 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
4556 (fr_freed + fr_resid));
4557
4558 /* see if VM memory level is critical */
4559 low = skmem_lowmem_check();
4560
4561 /*
4562 * If things appear to be idle, we can prune away cached
4563 * objects that have fallen out of the working sets (this
4564 * is different than purging). Every once in a while, we
4565 * also purge the caches. Note that this is done across
4566 * all flowswitch instances, and so we limit this to no
4567 * more than once every FSW_REAP_SK_THRES seconds.
4568 */
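/*
 * Editor's note (illustrative sketch, not part of the original source):
 * fsw_reap_last is shared across all flowswitch instances, so the
 * compare-and-swap below lets exactly one reaper thread claim the
 * current window; the others skip the global cache reap until
 * FSW_REAP_SK_THRES seconds have elapsed again.
 */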
4569 last = os_atomic_load(&fsw_reap_last, relaxed);
4570 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4571 os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4572 fsw_purge_cache(fsw, low);
4573
4574 /* increase sleep interval if idle */
4575 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4576 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4577 i <<= 3;
4578 }
4579 } else if (last == 0) {
4580 os_atomic_store(&fsw_reap_last, now, release);
4581 }
4582
4583 /*
4584 * Additionally, run thru the list of channels and prune
4585 * or purge away cached objects on "idle" channels. This
4586 * check is rate limited to no more than once every
4587 * FSW_DRAIN_CH_THRES seconds.
4588 */
4589 last = fsw->fsw_drain_channel_chk_last;
4590 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4591 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4592 fsw->fsw_flow_mgr->fm_name);
4593
4594 fsw->fsw_drain_channel_chk_last = now;
4595 fsw_drain_channels(fsw, now, low);
4596 } else if (__improbable(last == 0)) {
4597 fsw->fsw_drain_channel_chk_last = now;
4598 }
4599
4600 /*
4601 * Finally, invoke the interface's reap callback to
4602 * tell it to prune or purge away cached objects if
4603 * it is idle. This check is rate limited to no more
4604 * than once every FSW_REAP_IF_THRES seconds.
4605 */
4606 last = fsw->fsw_drain_netif_chk_last;
4607 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4608 ASSERT(fsw->fsw_nifna != NULL);
4609
4610 if (ifp->if_na_ops != NULL &&
4611 ifp->if_na_ops->ni_reap != NULL) {
4612 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4613 fsw->fsw_flow_mgr->fm_name);
4614 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4615 FSW_REAP_IF_THRES, low);
4616 }
4617
4618 fsw->fsw_drain_netif_chk_last = now;
4619 } else if (__improbable(last == 0)) {
4620 fsw->fsw_drain_netif_chk_last = now;
4621 }
4622
4623 /* emit periodic interface stats ktrace */
4624 last = fsw->fsw_reap_last;
4625 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4626 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4627 ifp->if_data.ifi_ibytes * 8,
4628 ifp->if_data.ifi_opackets,
4629 ifp->if_data.ifi_obytes * 8);
4630
4631 fsw->fsw_reap_last = now;
4632 } else if (__improbable(last == 0)) {
4633 fsw->fsw_reap_last = now;
4634 }
4635
4636 /* Check for Rx stall condition every NX_FSW_RX_STALL_THRES seconds */
4637 last = fsw->fsw_rx_stall_chk_last;
4638 if (last != 0 && (now - last) >= NX_FSW_RX_STALL_THRES) {
4639 fsw_defunct_rx_stall_channel(fsw);
4640 fsw->fsw_rx_stall_chk_last = now;
4641 } else if (__improbable(last == 0)) {
4642 fsw->fsw_rx_stall_chk_last = now;
4643 }
4644
4645 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4646 clock_absolutetime_interval_to_deadline(t, &t);
4647 ASSERT(t != 0);
4648
4649 /* allow any pending detach to proceed */
4650 fsw_detach_barrier_remove(fsw);
4651 }
4652
4653 lck_mtx_lock(&fsw->fsw_reap_lock);
4654 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4655 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4656 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
4657 THREAD_UNINT, t);
4658 lck_mtx_unlock(&fsw->fsw_reap_lock);
4659 thread_block_parameter(fsw_reap_thread_cont, fsw);
4660 /* NOTREACHED */
4661 __builtin_unreachable();
4662 } else {
4663 terminate:
4664 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4665 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4666 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4667 /*
4668 * And signal any thread waiting for us to terminate;
4669 * use another wait channel here other than fsw_reap_flags
4670 * to make it more explicit.
4671 */
4672 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4673 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4674 }
4675 lck_mtx_unlock(&fsw->fsw_reap_lock);
4676
4677 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4678
4679 /* for the extra refcnt from kernel_thread_start() */
4680 thread_deallocate(current_thread());
4681 /* this is the end */
4682 thread_terminate(current_thread());
4683 /* NOTREACHED */
4684 __builtin_unreachable();
4685 }
4686
4687 /* must never get here */
4688 VERIFY(0);
4689 /* NOTREACHED */
4690 __builtin_unreachable();
4691 }
4692
4693 static void
4694 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4695 {
4696 struct kern_nexus *nx = fsw->fsw_nx;
4697
4698 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4699 FSW_RLOCK(fsw);
4700
4701 /* uncrustify doesn't handle C blocks properly */
4702 /* BEGIN IGNORE CODESTYLE */
4703 nx_port_foreach(nx, ^(nexus_port_t p) {
4704 struct nexus_adapter *na = nx_port_get_na(nx, p);
4705 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4706 return;
4707 }
4708
4709 boolean_t purge;
4710
4711 /*
4712 * If some activity happened in the last FSW_DRAIN_CH_THRES
4713 * seconds on this channel, we reclaim memory if the channel
4714 * throughput is less than the reap threshold value.
4715 */
4716 if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4717 struct __kern_channel_ring *__single ring;
4718 channel_ring_stats *stats;
4719 uint64_t bps;
4720
4721 ring = KR_SINGLE(na->na_rx_rings);
4722 stats = &ring->ckr_stats;
4723 bps = stats->crs_bytes_per_second;
4724
4725 if (bps < fsw_channel_reap_thresh) {
4726 purge = FALSE;
4727 na_drain(na, purge);
4728 }
4729 return;
4730 }
4731
4732 /*
4733 * If NA has been inactive for some time (twice the drain
4734 * threshold), we clear the work timestamp to temporarily skip
4735 * this channel until it's active again. Purging cached objects
4736 * can be expensive since we'd need to allocate and construct
4737 * them again, so we do it only when necessary.
4738 */
4739 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4740 na->na_work_ts = 0;
4741 purge = TRUE;
4742 } else {
4743 purge = FALSE;
4744 }
4745
4746 na_drain(na, purge); /* purge/prune caches */
4747 });
4748 /* END IGNORE CODESTYLE */
4749
4750 FSW_RUNLOCK(fsw);
4751 }
4752
4753 static void
4754 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4755 {
4756 #pragma unused(fsw)
4757 uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4758 uint32_t p = fsw_flow_purge_thresh;
4759 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4760
4761 SK_DF(SK_VERB_FLOW, "%s: %s caches",
4762 fsw->fsw_flow_mgr->fm_name,
4763 (purge ? "purge" : "prune"));
4764
4765 skmem_cache_reap_now(sk_fo_cache, purge);
4766 skmem_cache_reap_now(sk_fe_cache, purge);
4767 skmem_cache_reap_now(sk_fab_cache, purge);
4768 skmem_cache_reap_now(flow_route_cache, purge);
4769 skmem_cache_reap_now(flow_stats_cache, purge);
4770 netns_reap_caches(purge);
4771 skmem_reap_caches(purge);
4772
4773 #if CONFIG_MBUF_MCACHE
4774 if (if_is_fsw_transport_netagent_enabled() && purge) {
4775 mbuf_drain(FALSE);
4776 }
4777 #endif /* CONFIG_MBUF_MCACHE */
4778 }
4779
4780 static void
4781 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4782 {
4783 /* When the interface is in low power mode, the flow is nonviable */
4784 if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4785 os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4786 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4787 }
4788 }
4789
4790 static uint32_t
4791 fsw_process_deferred(struct nx_flowswitch *fsw)
4792 {
4793 struct flow_entry_dead sfed __sk_aligned(8);
4794 struct flow_mgr *fm = fsw->fsw_flow_mgr;
4795 struct flow_entry_dead *fed, *tfed;
4796 LIST_HEAD(, flow_entry_dead) fed_head =
4797 LIST_HEAD_INITIALIZER(fed_head);
4798 uint32_t i, nonviable = 0;
4799 boolean_t lowpowermode = FALSE;
4800
4801 bzero(&sfed, sizeof(sfed));
4802
4803 /*
4804 * The flows become nonviable when the interface
4805 * is in low power mode (edge trigger)
4806 */
4807 if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4808 fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4809 lowpowermode = TRUE;
4810 fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4811 }
4812
4813 /*
4814 * Scan thru the flow entry tree, and commit any pending withdraw or
4815 * nonviable requests. We may need to push stats and/or unassign the
4816 * nexus from NECP, but we cannot do that while holding the locks;
4817 * build a temporary list for those entries.
4818 */
4819 for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
4820 struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
4821 struct flow_owner *fo;
4822
4823 /*
4824 * Grab the lock at all costs when handling low power mode
4825 */
4826 if (__probable(!lowpowermode)) {
4827 if (!FOB_TRY_LOCK(fob)) {
4828 continue;
4829 }
4830 } else {
4831 FOB_LOCK(fob);
4832 }
4833
4834 FOB_LOCK_ASSERT_HELD(fob);
4835 RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
4836 struct flow_entry *fe;
4837
4838 RB_FOREACH(fe, flow_entry_id_tree,
4839 &fo->fo_flow_entry_id_head) {
4840 /* try first as reader; skip if we can't */
4841 if (__improbable(lowpowermode)) {
4842 fsw_flow_handle_low_power(fsw, fe);
4843 }
4844 if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
4845 os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
4846 flow_namespace_half_close(&fe->fe_port_reservation);
4847 }
4848
4849 /* if not withdrawn/nonviable, skip */
4850 if (!fe->fe_want_withdraw &&
4851 !fe->fe_want_nonviable) {
4852 continue;
4853 }
4854 /*
4855 * Here we're holding the lock as writer;
4856 * don't spend too much time as we're
4857 * blocking the data path now.
4858 */
4859 ASSERT(!uuid_is_null(fe->fe_uuid));
4860 /* only need flow UUID and booleans */
4861 uuid_copy(sfed.fed_uuid, fe->fe_uuid);
4862 sfed.fed_want_clonotify =
4863 (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
4864 sfed.fed_want_nonviable = fe->fe_want_nonviable;
4865 flow_entry_teardown(fo, fe);
4866
4867 /* do this outside the flow bucket lock */
4868 fed = flow_entry_dead_alloc(Z_WAITOK);
4869 ASSERT(fed != NULL);
4870 *fed = sfed;
4871 LIST_INSERT_HEAD(&fed_head, fed, fed_link);
4872 }
4873 }
4874 FOB_UNLOCK(fob);
4875 }
4876
4877 /*
4878 * These nonviable flows are no longer useful since we've lost
4879 * the source IP address; in the event the client monitors the
4880 * viability of the flow, explicitly mark it as nonviable so
4881 * that a new flow can be created.
4882 */
4883 LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
4884 LIST_REMOVE(fed, fed_link);
4885 ASSERT(fsw->fsw_agent_session != NULL);
4886
4887 /* if flow is closed early */
4888 if (fed->fed_want_clonotify) {
4889 necp_client_early_close(fed->fed_uuid);
4890 }
4891
4892 /* if nonviable, unassign nexus attributes */
4893 if (fed->fed_want_nonviable) {
4894 (void) netagent_assign_nexus(fsw->fsw_agent_session,
4895 fed->fed_uuid, NULL, 0);
4896 }
4897
4898 flow_entry_dead_free(fed);
4899 ++nonviable;
4900 }
4901 ASSERT(LIST_EMPTY(&fed_head));
4902
4903 return nonviable;
4904 }
4905
4906 static uint32_t
4907 fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
4908 {
4909 struct flow_entry_linger_head linger_head =
4910 TAILQ_HEAD_INITIALIZER(linger_head);
4911 struct flow_entry *fe, *tfe;
4912 uint64_t now = _net_uptime;
4913 uint32_t i = 0, cnt = 0, freed = 0;
4914
4915 ASSERT(fsw->fsw_ifp != NULL);
4916 ASSERT(abort != NULL);
4917 *abort = 0;
4918
4919 /*
4920 * We don't want to contend with the datapath, so move
4921 * everything that's in the linger list into a local list.
4922 * This allows us to generate RSTs or free the flow entry
4923 * outside the lock. Any remaining flow entry in the local
4924 * list will get re-added back to the head of the linger
4925 * list, in front of any new ones added since then.
4926 */
4927 lck_mtx_lock(&fsw->fsw_linger_lock);
4928 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4929 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4930 cnt = fsw->fsw_linger_cnt;
4931 fsw->fsw_linger_cnt = 0;
4932 lck_mtx_unlock(&fsw->fsw_linger_lock);
4933
4934 TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
4935 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4936 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4937 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4938
4939 /*
4940 * See if this is a TCP flow that needs to generate
4941 * a RST to the remote peer (if not already).
4942 */
4943 if (flow_track_tcp_want_abort(fe)) {
4944 VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
4945 ASSERT(!uuid_is_null(fe->fe_uuid));
4946 flow_track_abort_tcp(fe, NULL, NULL);
4947 (*abort)++;
4948 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4949 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
4950 "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
4951 sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
4952 FLOWENTF_BITS);
4953 }
4954
4955 /*
4956 * If flow has expired, remove from list and free;
4957 * otherwise leave it around in the linger list.
4958 */
4959 if (fe->fe_linger_expire <= now) {
4960 freed++;
4961 fsw_linger_remove_internal(&linger_head, fe);
4962 fe = NULL;
4963 }
4964 ++i;
4965 }
4966 VERIFY(i == cnt && cnt >= freed);
4967
4968 /*
4969 * Add any remaining ones back into the linger list.
4970 */
4971 lck_mtx_lock(&fsw->fsw_linger_lock);
4972 if (!TAILQ_EMPTY(&linger_head)) {
4973 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
4974 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4975 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4976 TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
4977 fsw->fsw_linger_cnt += (cnt - freed);
4978 }
4979 ASSERT(TAILQ_EMPTY(&linger_head));
4980 lck_mtx_unlock(&fsw->fsw_linger_lock);
4981
4982 return freed;
4983 }
4984
4985 __attribute__((always_inline))
4986 static inline void
4987 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
4988 {
4989 switch (__packet_get_traffic_class(ph)) {
4990 case PKT_TC_BE:
4991 ifp->if_tc.ifi_ibepackets++;
4992 ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4993 break;
4994 case PKT_TC_BK:
4995 ifp->if_tc.ifi_ibkpackets++;
4996 ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4997 break;
4998 case PKT_TC_VI:
4999 ifp->if_tc.ifi_ivipackets++;
5000 ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5001 break;
5002 case PKT_TC_VO:
5003 ifp->if_tc.ifi_ivopackets++;
5004 ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5005 break;
5006 default:
5007 break;
5008 }
5009 }
5010
5011 __attribute__((always_inline))
5012 static inline void
5013 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5014 uint32_t cnt, uint32_t len)
5015 {
5016 switch (svc) {
5017 case PKT_TC_BE:
5018 ifp->if_tc.ifi_obepackets += cnt;
5019 ifp->if_tc.ifi_obebytes += len;
5020 break;
5021 case PKT_TC_BK:
5022 ifp->if_tc.ifi_obkpackets += cnt;
5023 ifp->if_tc.ifi_obkbytes += len;
5024 break;
5025 case PKT_TC_VI:
5026 ifp->if_tc.ifi_ovipackets += cnt;
5027 ifp->if_tc.ifi_ovibytes += len;
5028 break;
5029 case PKT_TC_VO:
5030 ifp->if_tc.ifi_ovopackets += cnt;
5031 ifp->if_tc.ifi_ovobytes += len;
5032 break;
5033 default:
5034 break;
5035 }
5036 }
5037