1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54 /*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <sys/kdebug.h>
94 #include <sys/sdt.h>
95 #include <net/bpf.h>
96 #include <net/if_ports_used.h>
97 #include <net/pktap.h>
98 #include <net/droptap.h>
99 #include <net/pktsched/pktsched_netem.h>
100 #include <netinet/tcp.h>
101 #include <netinet/udp.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/in_var.h>
105
106 extern kern_return_t thread_terminate(thread_t);
107
108 #define FSW_ZONE_MAX 256
109 #define FSW_ZONE_NAME "skywalk.nx.fsw"
110
111 static uint64_t fsw_reap_last __sk_aligned(8);
112 static uint64_t fsw_want_purge __sk_aligned(8);
113
114 #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
115 static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
116
117 #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
118 static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
119
120 #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
121 static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
122
123 #define NX_FSW_FRIB_HASHSZ             13      /* some prime */
124 static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
125
126 #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
127 static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
128
129 #define NX_FSW_RX_STALL_THRES 10 /* seconds */
130 static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;
131
132 #define NX_FSW_RX_STALL_DEFUNCT 1 /* defunct Rx-stalled channel (0 = disable) */
133 static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;
134
135 #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
136 static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
137
138 #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
139 #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
140 #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
141 #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
142 #define FSW_IFSTATS_THRES 1
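/* e.g. with the default 1-second reap interval, FSW_REAP_SK_THRES, FSW_REAP_IF_THRES and FSW_DRAIN_CH_THRES each evaluate to 32 */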
143
144 #define NX_FSW_CHANNEL_REAP_THRES 1000  /* threshold (bytes/sec) for reaping */
145 uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
146
147 #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
148
149 uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
150 uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
151 uint32_t fsw_gso_batch = 8;
152 #if (DEVELOPMENT || DEBUG)
153 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
154 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
155 "flowswitch Rx batch size");
156 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
157 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
158 "flowswitch Tx batch size");
159 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
160 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
161 "flowswitch GSO batch size");
162 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
163 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
164 "flowswitch channel reap threshold throughput (bytes/sec)");
165 #endif /* !DEVELOPMENT && !DEBUG */
166
167 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
168 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
169 "flowswitch RX aggregation for tcp flows (enable/disable)");
170 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
171 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
172 "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
173 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
174 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
175 "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
176
177 /*
178 * IP reassembly
179 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
180 * enable/disable the reassembly routine regardless of whether the
181 * transport netagent is enabled or not.
182 *
183 * 'fsw_ip_reass' is a tri-state:
184 * 0 means force IP reassembly off
185 * 1 means force IP reassembly on
186 * 2 means don't force the value, use what's appropriate for this flowswitch
187 */
188 #define FSW_IP_REASS_FORCE_OFF 0
189 #define FSW_IP_REASS_FORCE_ON 1
190 #define FSW_IP_REASS_NO_FORCE 2
191
192 uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
193
194 static int
195 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
196 {
197 #pragma unused(oidp, arg1, arg2)
198 unsigned int new_value;
199 int changed;
200 int error;
201
202 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
203 &new_value, &changed);
204 if (error == 0 && changed != 0) {
205 if (new_value > FSW_IP_REASS_NO_FORCE) {
206 return EINVAL;
207 }
208 fsw_ip_reass = new_value;
209 }
210 return error;
211 }
212
213 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
214 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
215 0, 0, fsw_ip_reass_sysctl, "IU",
216 "adjust flowswitch IP reassembly");
217
218 #if (DEVELOPMENT || DEBUG)
219 static uint64_t _fsw_inject_error = 0;
220 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
221 _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
222 &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
223
224 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
225 if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
226 SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
227 if ((_f) != NULL) \
228 (_f)(__VA_ARGS__); \
229 } \
230 } while (0)
231
232 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
233 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
234 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
235 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
236 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
238 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
239 flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
240 &fsw_flow_route_id_buckets, 0, "");
241 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
242 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
243 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
244 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
245 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
246 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
247 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
249 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
250 CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
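/*
 * Example (development builds only): each bit in fsw_inject_error selects an
 * injection point checked by the macros above; e.g. setting bit 35
 * (1ULL << 35) in kern.skywalk.flowswitch.fsw_inject_error is intended to
 * exercise the error-35 path handled by _fsw_error35_handler() below.
 */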
251 #else
252 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
253 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
254 #endif /* !DEVELOPMENT && !DEBUG */
255
256 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
257 struct flow_entry *);
258 static void fsw_reap_thread_func(void *, wait_result_t);
259 static void fsw_reap_thread_cont(void *, wait_result_t);
260 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
261 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
262 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
263 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
264
265 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
266 struct __kern_packet *);
267
268 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
269 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
270 uint32_t, uint32_t);
271
272 static int __fsw_dp_inited = 0;
273
274 int
275 fsw_dp_init(void)
276 {
277 _CASSERT(FSW_VP_DEV == 0);
278 _CASSERT(FSW_VP_HOST == 1);
279 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
280 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
281
282 ASSERT(!__fsw_dp_inited);
283
284 flow_mgr_init();
285 flow_init();
286
287 __fsw_dp_inited = 1;
288
289 return 0;
290 }
291
292 void
293 fsw_dp_uninit(void)
294 {
295 if (__fsw_dp_inited) {
296 flow_fini();
297 flow_mgr_fini();
298
299 __fsw_dp_inited = 0;
300 }
301 }
302
303 static void
304 dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
305 {
306 pp_free_pktq(pktq);
307 }
308
309 #define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do { \
310 uint32_t _len = KPKTQ_LEN(pktq); \
311 if (KPKTQ_EMPTY(pktq)) { \
312 ASSERT(_len == 0); \
313 return; \
314 } \
315 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
316 FSW_STATS_ADD(FSW_STATS_DROP, _len); \
317 DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
318 if (__probable(droptap_total_tap_count == 0)) { \
319 dp_free_pktq(fsw, pktq); \
320 break; \
321 } \
322 drop_func_t dropfunc; \
323 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
324 struct __kern_packet *kpkt = KPKTQ_FIRST(pktq); \
325 struct __kern_packet *next_pkt; \
326 for (; kpkt != NULL; kpkt = next_pkt) { \
327 next_pkt = kpkt->pkt_nextpkt; \
328 dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags, \
329 fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, \
330 0, 0); \
331 } \
332 dp_free_pktq(fsw, pktq); \
333 } while (0)
334
335 #define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do { \
336 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \
337 FSW_STATS_ADD(FSW_STATS_DROP, 1); \
338 if (__probable(droptap_total_tap_count == 0)) { \
339 pp_free_packet_single(pkt); \
340 break; \
341 } \
342 drop_func_t dropfunc; \
343 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
344 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
345 fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \
346 pp_free_packet_single(pkt); \
347 } while (0)
348
349 #define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do { \
350 if (__probable(droptap_total_tap_count == 0)) { \
351 pp_free_packet_chain(pkt, NULL); \
352 break; \
353 } \
354 drop_func_t dropfunc; \
355 dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
356 struct __kern_packet *next_pkt; \
357 for (; pkt != NULL; pkt = next_pkt) { \
358 next_pkt = pkt->pkt_nextpkt; \
359 dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
360 NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL, \
361 0, 0); \
362 } \
363 pp_free_packet_chain(pkt, NULL); \
364 } while (0)
365
366
367 SK_NO_INLINE_ATTRIBUTE
368 void
369 fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
370 bool input)
371 {
372 pid_t pid;
373 char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
374 const char *__null_terminated proc_name = NULL;
375 pid_t epid;
376 char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
377 const char *__null_terminated eproc_name = NULL;
378 sa_family_t af;
379 bool tap_early = false;
380 struct __kern_packet *pkt;
381
382 ASSERT(fe != NULL);
383 ASSERT(fsw->fsw_ifp != NULL);
384
385 if (fe->fe_nx_port == FSW_VP_HOST) {
386 /* allow packets to be tapped before aggregation happens */
387 tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
388 if (!tap_early) {
389 /* all other traffic will be tapped in the dlil input path */
390 return;
391 }
392 }
393 if (fe->fe_key.fk_ipver == IPVERSION) {
394 af = AF_INET;
395 } else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
396 af = AF_INET6;
397 } else {
398 return;
399 }
400
401 pid = fe->fe_pid;
402 if (fe->fe_proc_name[0] != '\0') {
403 proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
404 fe->fe_proc_name, sizeof(fe->fe_proc_name));
405 }
406 epid = fe->fe_epid;
407 if (fe->fe_eproc_name[0] != '\0') {
408 eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
409 fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
410 }
411 if (input) {
412 KPKTQ_FOREACH(pkt, pktq) {
413 pktap_input_packet(fsw->fsw_ifp, af,
414 fsw->fsw_ifp_dlt, pid, proc_name, epid,
415 eproc_name, SK_PKT2PH(pkt), NULL, 0,
416 IPPROTO_TCP, fe->fe_flowid,
417 tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
418 }
419 } else {
420 KPKTQ_FOREACH(pkt, pktq) {
421 pktap_output_packet(fsw->fsw_ifp, af,
422 fsw->fsw_ifp_dlt, pid, proc_name, epid,
423 eproc_name, SK_PKT2PH(pkt), NULL, 0,
424 0, 0, PTH_FLAG_NEXUS_CHAN);
425 }
426 }
427 }
428
429 #if (DEVELOPMENT || DEBUG)
430 static void
431 _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
432 int *ret)
433 {
434 static boolean_t _err35_flag_modified = FALSE;
435
436 switch (step) {
437 case 1:
438 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
439 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
440 fr->fr_flags &= ~FLOWRTF_RESOLVED;
441 _err35_flag_modified = TRUE;
442 }
443 break;
444
445 case 2:
446 if (!_err35_flag_modified) {
447 return;
448 }
449 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
450 m_freem(pkt->pkt_mbuf);
451 pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
452 pkt->pkt_mbuf = NULL;
453 }
454 *ret = EJUSTRETURN;
455 fr->fr_flags |= FLOWRTF_RESOLVED;
456 _err35_flag_modified = FALSE;
457 break;
458
459 default:
460 VERIFY(0);
461 /* not reached */
462 }
463 }
464
465 static void
466 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
467 {
468 static boolean_t _err36_flag_modified = FALSE;
469
470 switch (step) {
471 case 1:
472 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
473 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
474 fr->fr_flags &= ~FLOWRTF_RESOLVED;
475 _err36_flag_modified = TRUE;
476 }
477 break;
478
479 case 2:
480 if (!_err36_flag_modified) {
481 return;
482 }
483 *ret = ENETUNREACH;
484 fr->fr_flags |= FLOWRTF_RESOLVED;
485 _err36_flag_modified = FALSE;
486 break;
487
488 default:
489 VERIFY(0);
490 /* not reached */
491 }
492 }
493 #else /* !DEVELOPMENT && !DEBUG */
494 #define _fsw_error35_handler(...)
495 #define _fsw_error36_handler(...)
496 #endif /* DEVELOPMENT || DEBUG */
497
498 /*
499 * Check if the source packet content can fit into the destination
500 * ring's packet. Returns TRUE if the source packet can fit.
501 * Note: Failures could be caused by misconfigured packet pool sizes,
502 * a missing packet size check against the MTU, or a source packet from
503 * a compat netif whose attached mbuf is larger than the MTU due to LRO.
504 */
505 static inline boolean_t
506 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
507 uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
508 uint32_t *copy_len)
509 {
510 uint32_t tlen = 0;
511 uint32_t splen = spkt->pkt_length - skip_l2hlen;
512
513 if (l2hlen != 0) {
514 VERIFY(skip_l2hlen == 0);
515 tlen += l2hlen;
516 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
517 splen -= ETHER_CRC_LEN;
518 }
519
520 tlen += splen;
521 *copy_len = splen;
522
523 return tlen <= ((__packet_get_buflet_count(dph) *
524 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
525 headroom);
526 }
527
528 #if SK_LOG
529 /* Hoisted out of line to reduce kernel stack footprint */
530 SK_LOG_ATTRIBUTE
531 static void
532 copy_packet_from_dev_log(struct __kern_packet *spkt,
533 struct __kern_packet *dpkt, struct proc *p)
534 {
535 uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
536 ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
537 SK_VERB_COPY_MBUF : SK_VERB_COPY));
538 char *daddr;
539 uint32_t pkt_len;
540
541 MD_BUFLET_ADDR_ABS(dpkt, daddr);
542 pkt_len = __packet_get_real_data_length(dpkt);
543 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
544 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
545 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
546 (uint32_t)dpkt->pkt_l2_len);
547 SK_DF(logflags | SK_VERB_DUMP, "%s",
548 sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
549 }
550 #else
551 #define copy_packet_from_dev_log(...)
552 #endif /* SK_LOG */
553
554
555 static inline int
556 copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
557 struct __kern_packet *dpkt)
558 {
559 /*
560 * Source and destination nexus don't share the packet pool, so the
561 * sync operation here is to:
562 * - alloc packet for the rx(dst) ring
563 * - copy data/metadata from src packet to dst packet
564 * - attach alloc'd packet to rx(dst) ring
565 */
566 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
567 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
568 kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
569 METADATA_SUBTYPE(spkt));
570 boolean_t do_cksum_rx;
571 uint16_t skip_l2h_len = spkt->pkt_l2_len;
572 uint16_t iphlen;
573 uint32_t dlen;
574 int err;
575
576 if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
577 &dlen))) {
578 SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
579 PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
580 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
581 return EINVAL;
582 }
583
584 /* Copy packet metadata */
585 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
586 _PKT_COPY(spkt, dpkt);
587 ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
588 PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
589 ASSERT(dpkt->pkt_mbuf == NULL);
590
591 dpkt->pkt_headroom = 0;
592 dpkt->pkt_l2_len = 0;
593
594 /* don't include IP header from partial sum */
595 if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
596 iphlen = spkt->pkt_flow_ip_hlen;
597 do_cksum_rx = sk_cksum_rx;
598 } else {
599 iphlen = 0;
600 do_cksum_rx = FALSE;
601 }
602
603 /* Copy packet payload */
604 if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
605 (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
606 FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
607 /*
608 * Source packet has truncated contents (just enough for
609 * the classifier) of an mbuf from the compat driver; copy
610 * the entire mbuf contents to the destination packet.
611 */
612 m_adj(spkt->pkt_mbuf, skip_l2h_len);
613 ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
614 fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
615 spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
616 } else {
617 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
618 /*
619 * Source packet has full contents, either from an mbuf
620 * that came up from the compat driver, or because it
621 * originated on the native driver; copy to destination.
622 */
623 fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
624 (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
625 iphlen, 0, FALSE);
626 }
627
628 #if DEBUG || DEVELOPMENT
629 if (__improbable(pkt_trailers > 0)) {
630 dlen += pkt_add_trailers(dph, dlen, iphlen);
631 }
632 #endif /* DEBUG || DEVELOPMENT */
633
634 /* Finalize and attach packet to Rx ring */
635 METADATA_ADJUST_LEN(dpkt, 0, 0);
636 err = __packet_finalize(dph);
637 VERIFY(err == 0);
638
639 copy_packet_from_dev_log(spkt, dpkt, kernproc);
640
641 if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
642 ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
643 mbuf_free(spkt->pkt_mbuf);
644 KPKT_CLEAR_MBUF_DATA(spkt);
645 } else {
646 fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
647 }
648
649 if (__probable(do_cksum_rx != 0)) {
650 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
651 }
652
653 return 0;
654 }
655
656 SK_NO_INLINE_ATTRIBUTE
657 static struct __kern_packet *
658 rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
659 {
660 char *pkt_buf;
661 void *l3_hdr;
662 uint16_t nfrags, tlen;
663 int err = 0;
664
665 switch (fsw_ip_reass) {
666 case FSW_IP_REASS_FORCE_OFF:
667 return pkt;
668 case FSW_IP_REASS_FORCE_ON:
669 break;
670 default:
671 if (!FSW_NETAGENT_ENABLED(fsw) ||
672 flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
673 return pkt;
674 }
675 break;
676 }
677
678 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
679 l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
680
681 ASSERT(fsw->fsw_ipfm != NULL);
682 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
683
684 if (pkt->pkt_flow_ip_ver == IPVERSION) {
685 struct ip *ip = l3_hdr;
686 err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
687 } else {
688 struct ip6_hdr *ip6_hdr = l3_hdr;
689 struct ip6_frag *__single ip6_frag =
690 (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));
691
692 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
693 /* we only handle frag header immediately after v6 header */
694 err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
695 &nfrags, &tlen);
696 }
697 if (__improbable(err != 0)) {
698 /* if we get a bad fragment, free it */
699 pp_free_packet_single(pkt);
700 pkt = NULL;
701 } else {
702 ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
703 }
704
705 return pkt;
706 }
707
708 SK_NO_INLINE_ATTRIBUTE
709 static void
710 rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
711 {
712 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
713 uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
714 kern_packet_t ph = SK_PTR_ENCODE(pkt,
715 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
716 /*
717 * This is the case when the packet is coming in from
718 * compat-netif. This packet only has valid metadata
719 * and an attached mbuf. We need to copy enough data
720 * from the mbuf to the packet buffer for the
721 * classifier. Compat netif packet pool is configured
722 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
723 * which is just enough to hold the protocol headers
724 * for the flowswitch classifier.
725 */
726
727 pkt->pkt_headroom = 0;
728 METADATA_ADJUST_LEN(pkt, 0, 0);
729 /*
730 * Copy the initial 128 bytes of the packet for
731 * classification.
732 * Ethernet(14) + IPv6 header(40) +
733 * IPv6 fragment header(8) +
734 * TCP header with options(60) = 122 bytes.
735 */
736 fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
737 pkt->pkt_headroom, pkt->pkt_mbuf, 0,
738 MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
739 FALSE, 0);
740
741 int err = __packet_finalize_with_mbuf(pkt);
742 VERIFY(err == 0);
743 }
744
745 static struct __kern_packet *
746 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
747 {
748 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
749
750 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
751 rx_prepare_packet_mbuf(fsw, pkt);
752 }
753
754 return pkt;
755 }
756
757 static struct flow_entry *
758 lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
759 bool input, struct flow_entry *prev_fe)
760 {
761 struct flow_key key __sk_aligned(16);
762 struct flow_entry *__single fe = NULL;
763
764 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
765 flow_pkt2key(pkt, input, &key);
766
767 if (__probable(prev_fe != NULL &&
768 prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
769 uint16_t saved_mask = key.fk_mask;
770 key.fk_mask = FKMASK_5TUPLE;
771 if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
772 flow_entry_retain(prev_fe);
773 fe = prev_fe;
774 } else {
775 key.fk_mask = saved_mask;
776 }
777 }
778
779 top:
780 if (__improbable(fe == NULL)) {
781 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
782 }
783
784 if (__improbable(fe != NULL &&
785 (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
786 /* Rx */
787 if (input) {
788 if (fe->fe_flags & FLOWENTF_PARENT) {
789 struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
790 if (child_fe != NULL) {
791 flow_entry_release(&fe);
792 fe = child_fe;
793 }
794 } else {
795 if (!rx_flow_demux_match(fsw, fe, pkt)) {
796 flow_entry_release(&fe);
797 fe = NULL;
798 goto top;
799 }
800 }
801 } else {
802 /* Tx */
803 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
804 if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
805 struct flow_entry *__single parent_fe = fe;
806 fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
807 flow_entry_release(&parent_fe);
808 } else {
809 flow_entry_release(&fe);
810 fe = NULL;
811 goto top;
812 }
813 }
814 }
815 }
816
817 SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
818 SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
819 "%s %s %s \"%s\" fe 0x%llx",
820 input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
821 sk_proc_name_address(current_proc()),
822 fk_as_string(&key, fkbuf, sizeof(fkbuf)),
823 SK_KVA(fe));
824
825 return fe;
826 }
827
828 SK_NO_INLINE_ATTRIBUTE
829 static bool
830 pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
831 {
832 struct nx_flowswitch *fsw = fe->fe_fsw;
833 struct ifnet *ifp = fsw->fsw_ifp;
834 struct in_ifaddr *ia = NULL;
835 struct in_ifaddr *best_ia = NULL;
836 struct in6_ifaddr *ia6 = NULL;
837 struct in6_ifaddr *best_ia6 = NULL;
838 struct ifnet *match_ifp = NULL;
839 struct __flow *flow = pkt->pkt_flow;
840 bool result = false;
841
842 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
843
844 if (flow->flow_ip_ver == IPVERSION) {
845 if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
846 IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
847 IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
848 IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
849 IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
850 IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
851 INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
852 result = true;
853 goto done;
854 }
855
856 /*
857 * Check for a match in the hash bucket.
858 */
859 lck_rw_lock_shared(&in_ifaddr_rwlock);
860 TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
861 if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
862 best_ia = ia;
863 match_ifp = ia->ia_ifp;
864
865 if (match_ifp == ifp) {
866 break;
867 }
868 /*
869 * Continue the loop in case there's an exact match with another
870 * interface.
871 */
872 }
873 }
874
875 if (best_ia != NULL) {
876 if (match_ifp != ifp && ipforwarding == 0 &&
877 (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
878 match_ifp->if_family == IFNET_FAMILY_UTUN)) {
879 /*
880 * Drop when interface address check is strict and forwarding
881 * is disabled
882 */
883 } else {
884 lck_rw_done(&in_ifaddr_rwlock);
885 result = true;
886 goto done;
887 }
888 }
889 lck_rw_done(&in_ifaddr_rwlock);
890
891 if (ifp->if_flags & IFF_BROADCAST) {
892 /*
893 * Check for broadcast addresses.
894 *
895 * Only accept broadcast packets that arrive via the matching
896 * interface. Reception of forwarded directed broadcasts would be
897 * handled via ip_forward() and ether_frameout(); for SIMPLEX
898 * interfaces, ether_frameout() also loops them back into the stack.
899 */
900 struct ifaddr *ifa;
901
902 ifnet_lock_shared(ifp);
903 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
904 if (ifa->ifa_addr->sa_family != AF_INET) {
905 continue;
906 }
907 ia = ifatoia(ifa);
908 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
909 ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
910 ifnet_lock_done(ifp);
911 result = true;
912 goto done;
913 }
914 }
915 ifnet_lock_done(ifp);
916 }
917 } else {
918 struct in6_ifaddrhashhead *ia6_hash_head;
919
920 if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
921 IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
922 IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
923 result = true;
924 goto done;
925 }
926
927 /*
928 * Check for exact addresses in the hash bucket.
929 */
930 lck_rw_lock_shared(&in6_ifaddr_rwlock);
931 /* XXX -fbounds-safety: external dependency on ip6_input.c */
932 ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
933 in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
934 ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];
935
936 TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
937 if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
938 ia6->ia_ifp->if_index, ifp->if_index)) {
939 if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
940 continue;
941 }
942 best_ia6 = ia6;
943 if (ia6->ia_ifp == ifp) {
944 break;
945 }
946 /*
947 * Continue the loop in case there's an exact match with another
948 * interface.
949 */
950 }
951 }
952 if (best_ia6 != NULL) {
953 if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
954 (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
955 best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
956 /*
957 * Drop when interface address check is strict and forwarding
958 * is disabled
959 */
960 } else {
961 lck_rw_done(&in6_ifaddr_rwlock);
962 result = true;
963 goto done;
964 }
965 }
966 lck_rw_done(&in6_ifaddr_rwlock);
967 }
968
969 /*
970 * In forwarding mode, if the destination address
971 * of the packet does not match any interface
972 * address, it may be destined for the client device.
973 */
974 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
975 "Rx flow does not match interface address");
976 done:
977 return result;
978 }
979
980 static struct flow_entry *
981 rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
982 struct flow_entry *prev_fe)
983 {
984 struct flow_entry *__single fe;
985
986 fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
987 _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
988 if (fe == NULL) {
989 FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
990 return NULL;
991 }
992
993 if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
994 fe->fe_flags & FLOWENTF_LISTENER) &&
995 !pkt_is_for_listener(fe, pkt)) {
996 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
997 flow_entry_release(&fe);
998 return NULL;
999 }
1000
1001 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
1002 FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
1003 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
1004 "Rx flow torn down");
1005 flow_entry_release(&fe);
1006 fe = NULL;
1007 }
1008
1009 return fe;
1010 }
1011
1012 static inline void
1013 rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
1014 struct __kern_packet *pkt, uint64_t tid)
1015 {
1016 /*
1017 * Among threads working on the same fe, the first thread to reach here
1018 * is responsible for processing packets until it no longer sees
1019 * new packets in fe_rx_pktq. Other threads only enqueue their
1020 * packets and do not add the flow entry to their own flow entry list.
1021 */
1022 lck_mtx_lock(&fe->fe_rx_pktq_lock);
1023
1024 if (fe->fe_rx_worker_tid == 0) {
1025 fe->fe_rx_worker_tid = tid;
1026 } else if (__improbable(fe->fe_rx_worker_tid != tid)) {
1027 STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
1028 }
1029
1030 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1031 fe->fe_rx_frag_count++;
1032 }
1033
1034 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
1035 /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
1036 if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
1037 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
1038 TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
1039 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1040 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1041 } else {
1042 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
1043 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
1044 flow_entry_release(&fe);
1045 }
1046 }
1047
1048 static void
1049 tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
1050 struct __kern_packet *pkt)
1051 {
1052 /* record frag continuation */
1053 if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
1054 ASSERT(pkt->pkt_flow_ip_is_frag);
1055 fe->fe_tx_is_cont_frag = true;
1056 fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
1057 } else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
1058 fe->fe_tx_is_cont_frag = false;
1059 fe->fe_tx_frag_id = 0;
1060 }
1061
1062 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
1063 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
1064 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
1065 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1066 } else {
1067 ASSERT(!TAILQ_EMPTY(fes));
1068 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
1069 flow_entry_release(&fe);
1070 }
1071 }
1072
1073 static inline void
1074 fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
1075 uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
1076 {
1077 uint32_t n_pkts = 0;
1078 slot_idx_t idx, idx_end;
1079 idx = r->ckr_khead;
1080 idx_end = r->ckr_rhead;
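/*
 * Example (assuming ckr_lim is the last valid slot index): with
 * ckr_lim = 1023, khead = 1020 and rhead = 4, SLOT_NEXT() wraps past the
 * ring limit and at most 8 slots (1020..1023, then 0..3) are visited.
 */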
1081
1082 ASSERT(KPKTQ_EMPTY(pktq));
1083 *n_bytes = 0;
1084 for (; n_pkts < n_pkts_max && idx != idx_end;
1085 idx = SLOT_NEXT(idx, r->ckr_lim)) {
1086 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1087 struct __kern_packet *pkt = ksd->sd_pkt;
1088
1089 ASSERT(pkt->pkt_nextpkt == NULL);
1090 KR_SLOT_DETACH_METADATA(r, ksd);
1091
1092 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1093 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1094 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1095 || (pkt->pkt_length == 0)) {
1096 FSW_STATS_INC(FSW_STATS_DROP);
1097 pp_free_packet_single(pkt);
1098 continue;
1099 }
1100 n_pkts++;
1101 *n_bytes += pkt->pkt_length;
1102
1103 KPKTQ_ENQUEUE(pktq, pkt);
1104 }
1105 r->ckr_khead = idx;
1106 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1107 }
1108
1109 /*
1110 * This is only for estimating how many packets each GSO packet will need.
1111 * The number does not need to be exact because any leftover packets allocated
1112 * will be freed.
1113 */
1114 static uint32_t
1115 estimate_gso_pkts(struct __kern_packet *pkt)
1116 {
1117 packet_tso_flags_t tso_flags;
1118 uint16_t mss;
1119 uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1120
1121 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1122 mss = pkt->pkt_proto_seg_sz;
1123
1124 if (tso_flags == PACKET_TSO_IPV4) {
1125 total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1126 } else if (tso_flags == PACKET_TSO_IPV6) {
1127 total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1128 }
1129 if (total_hlen != 0 && mss != 0) {
1130 total_len = pkt->pkt_length;
1131 n_pkts = (uint32_t)
1132 (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1133 }
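/*
 * Worked example: a 45,040-byte IPv4 TSO packet with an MSS of 1448 has
 * total_hlen 40, so n_pkts = SK_ROUNDUP(45000, 1448) / 1448 = 32.
 */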
1134 DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1135 uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1136 uint32_t, n_pkts);
1137 return n_pkts;
1138 }
1139
1140 /*
1141 * This function retrieves a chain of packets of the same type only
1142 * (GSO or non-GSO).
1143 */
1144 static inline void
1145 fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1146 struct __kern_channel_ring *r, uint32_t n_pkts_max,
1147 struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1148 {
1149 uint32_t n_pkts = 0;
1150 slot_idx_t idx, idx_end;
1151 idx = r->ckr_khead;
1152 idx_end = r->ckr_rhead;
1153 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1154 boolean_t gso_enabled, gso_required;
1155 uint32_t gso_pkts;
1156
1157 gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1158 ASSERT(KPKTQ_EMPTY(pktq));
1159 *n_bytes = 0;
1160 for (; n_pkts < n_pkts_max &&
1161 (!gso_enabled || fsw_gso_batch == 0 ||
1162 *gso_pkts_estimate < fsw_gso_batch) &&
1163 idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1164 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1165 struct __kern_packet *pkt = ksd->sd_pkt;
1166
1167 ASSERT(pkt->pkt_nextpkt == NULL);
1168
1169 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1170 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1171 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1172 || (pkt->pkt_length == 0)) {
1173 KR_SLOT_DETACH_METADATA(r, ksd);
1174 FSW_STATS_INC(FSW_STATS_DROP);
1175 pp_free_packet_single(pkt);
1176 continue;
1177 }
1178 if (gso_enabled) {
1179 gso_pkts = estimate_gso_pkts(pkt);
1180
1181 /*
1182 * We use the first packet to determine what
1183 * type the subsequent ones need to be (GSO or
1184 * non-GSO).
1185 */
1186 if (n_pkts == 0) {
1187 gso_required = (gso_pkts != 0);
1188 } else {
1189 if (gso_required != (gso_pkts != 0)) {
1190 break;
1191 }
1192 }
1193 *gso_pkts_estimate += gso_pkts;
1194 }
1195 KR_SLOT_DETACH_METADATA(r, ksd);
1196 if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1197 __packet_set_tx_nx_port(SK_PKT2PH(pkt),
1198 vpna->vpna_nx_port, vpna->vpna_gencnt);
1199 }
1200 n_pkts++;
1201 *n_bytes += pkt->pkt_length;
1202 KPKTQ_ENQUEUE(pktq, pkt);
1203 }
1204 r->ckr_khead = idx;
1205 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1206 DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1207 ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1208 uint32_t, *gso_pkts_estimate);
1209 }
1210
1211 static void
1212 fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
1213 struct pktq *pktq)
1214 {
1215 #pragma unused(fsw)
1216 struct __kern_packet *pkt;
1217 struct __kern_quantum *kqum;
1218 uint32_t kr_space_avail = 0;
1219 uint32_t n, n_pkts = 0, n_bytes = 0;
1220 slot_idx_t idx = 0, idx_start = 0, idx_end = 0;
1221
1222 kr_enter(r, TRUE);
1223
1224 idx_start = r->ckr_ktail;
1225 kr_space_avail = kr_available_slots_rxring(r);
1226 _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
1227 n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
1228 _FSW_INJECT_ERROR(41, n, 0, null_func);
1229 idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1230
1231 idx = idx_start;
1232 while (idx != idx_end) {
1233 KPKTQ_DEQUEUE(pktq, pkt);
1234 kqum = SK_PTR_ADDR_KQUM(pkt);
1235 kqum->qum_qflags |= QUM_F_FINALIZED;
1236 n_pkts++;
1237 n_bytes += pkt->pkt_length;
1238 KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1239 if (__improbable(pkt->pkt_trace_id != 0)) {
1240 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
1241 KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
1242 }
1243 idx = SLOT_NEXT(idx, r->ckr_lim);
1244 }
1245
1246 kr_update_stats(r, n_pkts, n_bytes);
1247
1248 /*
1249 * ensure slot attachments are visible before updating the
1250 * tail pointer
1251 */
1252 os_atomic_thread_fence(seq_cst);
1253
1254 r->ckr_ktail = idx_end;
1255
1256 kr_exit(r);
1257
1258 r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);
1259
1260 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
1261 r->ckr_name, n_pkts);
1262 }
1263
1264 static void
1265 pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1266 {
1267 ASSERT(KPKTQ_EMPTY(pktq));
1268
1269 for (uint32_t i = 0; i < n_pkts; i++) {
1270 struct __kern_packet *__single pkt = pkts[i];
1271 ASSERT(pkt->pkt_nextpkt == NULL);
1272 KPKTQ_ENQUEUE(pktq, pkt);
1273 }
1274 }
1275
1276 /*
1277 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1278 */
1279 SK_NO_INLINE_ATTRIBUTE
1280 static void
1281 convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1282 struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1283 {
1284 uint32_t tot_cnt;
1285 unsigned int num_segs = 1;
1286 struct mbuf *__single mhead, *__single head = NULL;
1287 struct mbuf *__single tail = NULL, **__single tailp = &head;
1288 uint32_t mhead_cnt, mhead_bufsize;
1289 uint32_t mhead_waste = 0;
1290 uint32_t mcnt = 0, mbytes = 0;
1291 uint32_t largest, max_pkt_len;
1292 struct __kern_packet *__single pkt;
1293 struct kern_pbufpool *pp;
1294
1295 tot_cnt = KPKTQ_LEN(pktq);
1296 ASSERT(tot_cnt > 0);
1297 mhead_cnt = tot_cnt;
1298
1299 /*
1300 * Opportunistically batch-allocate the mbufs based on the largest
1301 * packet size we've seen in the recent past. Note that we reset
1302 * fe_rx_largest_size below if we notice that we're under-utilizing the
1303 * allocated buffers (thus disabling this batch allocation).
1304 */
1305 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1306 if (__probable(largest != 0)) {
1307 if (largest <= MCLBYTES) {
1308 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1309 &num_segs, M_NOWAIT, 1, 0);
1310 mhead_bufsize = MCLBYTES;
1311 } else if (largest <= MBIGCLBYTES) {
1312 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1313 &num_segs, M_NOWAIT, 1, 0);
1314 mhead_bufsize = MBIGCLBYTES;
1315 } else if (largest <= M16KCLBYTES) {
1316 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1317 &num_segs, M_NOWAIT, 1, 0);
1318 mhead_bufsize = M16KCLBYTES;
1319 } else if (largest <= M16KCLBYTES * 2) {
1320 num_segs = 2;
1321 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1322 &num_segs, M_NOWAIT, 1, 0);
1323 mhead_bufsize = M16KCLBYTES * 2;
1324 } else {
1325 mhead = NULL;
1326 mhead_bufsize = mhead_cnt = 0;
1327 }
1328 } else {
1329 mhead = NULL;
1330 mhead_bufsize = mhead_cnt = 0;
1331 }
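/*
 * e.g. if the largest packet seen recently was 1500 bytes, the batch is
 * drawn from 2 KB (MCLBYTES) clusters; a recent 9 KB packet would select
 * 16 KB (M16KCLBYTES) clusters instead.
 */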
1332 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1333 uint32_t, mhead_cnt, uint32_t, tot_cnt);
1334
1335 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1336 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1337
1338 KPKTQ_FOREACH(pkt, pktq) {
1339 uint32_t tot_len, len;
1340 uint16_t pad, llhlen, iphlen;
1341 boolean_t do_cksum_rx;
1342 struct mbuf *__single m;
1343 int error;
1344
1345 llhlen = pkt->pkt_l2_len;
1346 len = pkt->pkt_length;
1347 if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
1348 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1349 struct __kern_packet *, pkt);
1350 FSW_STATS_INC(FSW_STATS_DROP);
1351 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1352 continue;
1353 }
1354 /* begin payload on 32-bit boundary; figure out the padding */
1355 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1356 tot_len = pad + len;
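/* e.g. a 14-byte Ethernet header gives pad = 2, so the payload left after m_adj(m, llhlen) below starts on a 32-bit boundary */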
1357
1358 /* remember largest packet size */
1359 if (__improbable(largest < tot_len)) {
1360 largest = MAX(tot_len, MCLBYTES);
1361 }
1362
1363 /*
1364 * If the above batch allocation returned partial
1365 * success, we retry with a per-packet (non-blocking) allocation here.
1366 */
1367 m = mhead;
1368 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1369 ASSERT(mhead != NULL || mhead_cnt == 0);
1370 num_segs = 1;
1371 if (tot_len > M16KCLBYTES) {
1372 num_segs = 0;
1373 }
1374 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1375 &num_segs, &m)) != 0) {
1376 DTRACE_SKYWALK2(bad__len,
1377 struct nx_flowswitch *, fsw,
1378 struct __kern_packet *, pkt);
1379 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1380 FSW_STATS_INC(FSW_STATS_DROP);
1381 continue;
1382 }
1383 } else {
1384 mhead = m->m_nextpkt;
1385 m->m_nextpkt = NULL;
1386 ASSERT(mhead_cnt != 0);
1387 --mhead_cnt;
1388
1389 /* check if we're underutilizing large buffers */
1390 if (__improbable(mhead_bufsize > MCLBYTES &&
1391 tot_len < (mhead_bufsize >> 1))) {
1392 ++mhead_waste;
1393 }
1394 /*
1395 * Clean up unused mbuf.
1396 * Only needed when we pre-allocate 2x16K mbufs.
1397 */
1398 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1399 ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1400 struct mbuf *m_extra = m->m_next;
1401 ASSERT(m_extra != NULL);
1402 ASSERT(m_extra->m_len == 0);
1403 ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1404 m->m_next = NULL;
1405 m_freem(m_extra);
1406 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1407 }
1408 }
1409 m->m_data += pad;
1410 /*
1411 * XXX -fbounds-safety: external dependency
1412 * mtod does not work because m_len is 0
1413 */
1414 m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1415
1416 /* don't include IP header from partial sum */
1417 if (__probable((pkt->pkt_qum_qflags &
1418 QUM_F_FLOW_CLASSIFIED) != 0)) {
1419 iphlen = pkt->pkt_flow_ip_hlen;
1420 do_cksum_rx = sk_cksum_rx;
1421 } else {
1422 iphlen = 0;
1423 do_cksum_rx = FALSE;
1424 }
1425
1426 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1427 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1428 llhlen + iphlen);
1429
1430 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1431 if (do_cksum_rx) {
1432 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1433 }
1434 #if DEBUG || DEVELOPMENT
1435 if (__improbable(pkt_trailers > 0)) {
1436 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1437 }
1438 #endif /* DEBUG || DEVELOPMENT */
1439 m_adj(m, llhlen);
1440
1441 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1442 if (__improbable((pkt->pkt_link_flags &
1443 PKT_LINKF_ETHFCS) != 0)) {
1444 m->m_flags |= M_HASFCS;
1445 }
1446 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1447 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1448 }
1449 ASSERT(m->m_nextpkt == NULL);
1450 tail = m;
1451 *tailp = m;
1452 tailp = &m->m_nextpkt;
1453 mcnt++;
1454 mbytes += m_pktlen(m);
1455 }
1456 /* free any leftovers */
1457 if (__improbable(mhead != NULL)) {
1458 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1459 ASSERT(mhead_cnt != 0);
1460 (void) m_freem_list(mhead);
1461 mhead = NULL;
1462 mhead_cnt = 0;
1463 }
1464
1465 /* reset if most packets (>50%) are smaller than our batch buffers */
1466 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1467 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1468 struct flow_entry *, NULL, uint32_t, mhead_waste,
1469 uint32_t, tot_cnt);
1470 largest = 0;
1471 }
1472
1473 if (largest != fsw->fsw_rx_largest_size) {
1474 os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1475 }
1476
1477 pp_free_pktq(pktq);
1478 *m_headp = head;
1479 *m_tailp = tail;
1480 *cnt = mcnt;
1481 *bytes = mbytes;
1482 }
1483
1484 /*
1485 * This function only extracts the mbuf from the packet. The caller frees
1486 * the packet.
1487 */
1488 static inline struct mbuf *
1489 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1490 {
1491 struct mbuf *m;
1492 struct pkthdr *mhdr;
1493 uint16_t llhlen;
1494
1495 m = pkt->pkt_mbuf;
1496 ASSERT(m != NULL);
1497
1498 llhlen = pkt->pkt_l2_len;
1499 if (llhlen > pkt->pkt_length) {
1500 m_freem(m);
1501 KPKT_CLEAR_MBUF_DATA(pkt);
1502 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1503 struct __kern_packet *, pkt);
1504 FSW_STATS_INC(FSW_STATS_DROP);
1505 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1506 return NULL;
1507 }
1508 mhdr = &m->m_pkthdr;
1509 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1510 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1511 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1512 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1513 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1514 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1515 }
1516 #if DEBUG || DEVELOPMENT
1517 uint32_t extra = 0;
1518 if (__improbable(pkt_trailers > 0)) {
1519 extra = pkt_add_trailers_mbuf(m, llhlen);
1520 }
1521 #endif /* DEBUG || DEVELOPMENT */
1522 m_adj(m, llhlen);
1523 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1524 KPKT_CLEAR_MBUF_DATA(pkt);
1525 return m;
1526 }
1527
1528 SK_NO_INLINE_ATTRIBUTE
1529 static void
1530 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1531 struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1532 {
1533 struct __kern_packet *pkt;
1534 struct mbuf *__single m, *__single head = NULL;
1535 struct mbuf *__single tail = NULL, **__single tailp = &head;
1536 uint32_t c = 0, b = 0;
1537
1538 KPKTQ_FOREACH(pkt, pktq) {
1539 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1540 if (__improbable(m == NULL)) {
1541 continue;
1542 }
1543 tail = m;
1544 *tailp = m;
1545 tailp = &m->m_nextpkt;
1546 c++;
1547 b += m_pktlen(m);
1548 }
1549 pp_free_pktq(pktq);
1550 *m_head = head;
1551 *m_tail = tail;
1552 *cnt = c;
1553 *bytes = b;
1554 }
1555
1556 void
1557 fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
1558 uint32_t cnt, uint32_t bytes)
1559 {
1560 struct ifnet_stat_increment_param s;
1561
1562 bzero(&s, sizeof(s));
1563 s.packets_in = cnt;
1564 s.bytes_in = bytes;
1565 dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1566 }
1567
1568 void
1569 fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
1570 {
1571 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
1572 uint32_t cnt = 0, bytes = 0;
1573 ifnet_fsw_rx_cb_t __single cb;
1574 void *__single cb_arg;
1575 boolean_t compat;
1576
1577 ASSERT(!KPKTQ_EMPTY(pktq));
1578 if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1579 ASSERT(cb != NULL);
1580 ASSERT(cb_arg != NULL);
1581 (*cb)(cb_arg, pktq);
1582 ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1583 if (KPKTQ_EMPTY(pktq)) {
1584 return;
1585 } else {
1586 DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
1587 struct pktq *, pktq);
1588 }
1589 }
1590
1591 /* All packets in the pktq must have the same type */
1592 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1593 if (compat) {
1594 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1595 &bytes);
1596 } else {
1597 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1598 &bytes);
1599 }
1600 if (__improbable(m_head == NULL)) {
1601 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1602 return;
1603 }
1604 fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1605 }
1606
1607 void
1608 fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
1609 struct __kern_channel_ring *r, struct pktq *pktq)
1610 {
1611 fsw_ring_enqueue_pktq(fsw, r, pktq);
1612 /*
1613 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
1614 * This is to ensure we use the timestamp of the earliest enqueue without
1615 * a dequeue.
1616 */
1617 if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
1618 r->ckr_rx_enqueue_ts = _net_uptime;
1619 }
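/*
 * Example: if packets were first enqueued at t = 100s and the channel never
 * dequeues them, ckr_rx_enqueue_ts stays at 100 while ckr_rx_dequeue_ts
 * lags behind it; that widening gap is presumably what the Rx-stall
 * machinery (fsw_rx_stall_thresh/fsw_rx_stall_defunct above) measures.
 */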
1620 FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1621 dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
1622 DROPTAP_FLAG_L2_MISSING);
1623 }
1624
1625 static struct nexus_adapter *
1626 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1627 {
1628 struct kern_nexus *nx = fsw->fsw_nx;
1629 struct nexus_adapter *na = NULL;
1630 nexus_port_t port = fe->fe_nx_port;
1631
1632 if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1633 SK_ERR("dev or host ports have no NA");
1634 return NULL;
1635 }
1636
1637 if (__improbable(!nx_port_is_valid(nx, port))) {
1638 SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1639 if_name(fsw->fsw_ifp), port);
1640 return NULL;
1641 }
1642
1643 na = nx_port_get_na(nx, port);
1644 if (__improbable(na == NULL)) {
1645 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1646 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1647 if_name(fsw->fsw_ifp), port);
1648 return NULL;
1649 }
1650
1651 if (__improbable(!NA_IS_ACTIVE(na))) {
1652 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1653 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1654 if_name(fsw->fsw_ifp), port);
1655 return NULL;
1656 }
1657
1658 if (__improbable(nx_port_is_defunct(nx, port))) {
1659 FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1660 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1661 if_name(fsw->fsw_ifp), port);
1662 return NULL;
1663 }
1664
1665 return na;
1666 }
1667
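/*
 * Return ring 0 of the flow's user channel (VP adapter) for the given
 * direction, or NULL if the adapter is gone or the ring is in drop mode.
 */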
1668 static inline struct __kern_channel_ring *
1669 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1670 {
1671 struct nexus_vp_adapter *na = NULL;
1672 struct __kern_channel_ring *__single r = NULL;
1673
1674 na = VPNA(flow_get_na(fsw, fe));
1675 if (__improbable(na == NULL)) {
1676 return NULL;
1677 }
1678
1679 switch (txrx) {
1680 case NR_RX:
1681 r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1682 break;
1683 case NR_TX:
1684 r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1685 break;
1686 	default:
1687 		VERIFY(0);
1688 		__builtin_unreachable();
1689 }
1690
1691 if (__improbable(KR_DROP(r))) {
1692 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1693 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
1694 		    r->ckr_name, SK_KVA(r));
1695 return NULL;
1696 }
1697
1698 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1699
1700 #if (DEVELOPMENT || DEBUG)
1701 if (r != NULL) {
1702 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1703 }
1704 #endif /* DEVELOPMENT || DEBUG */
1705
1706 return r;
1707 }
1708
1709 struct __kern_channel_ring *
1710 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1711 {
1712 return flow_get_ring(fsw, fe, NR_RX);
1713 }
1714
1715 static inline struct __kern_channel_ring *
1716 fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1717 {
1718 return flow_get_ring(fsw, fe, NR_TX);
1719 }
1720
1721 static bool
1722 dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1723 {
1724 struct flow_route *fr = fe->fe_route;
1725 struct ifnet *ifp = fsw->fsw_ifp;
1726
1727 if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1728 !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1729 fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1730 !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1731 		/*
1732 		 * The source address is no longer around; we want this
1733 		 * flow to be nonviable, but that requires holding the lock
1734 		 * as writer (which isn't the case now).  Mark the flow so
1735 		 * that the nonviable state gets finalized later, below.
1736 		 *
1737 		 * We also request that the flow route be re-configured
1738 		 * if this is a connected-mode flow.
1739 		 */
1741 if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1742 			/*
1743 			 * fsw_pending_nonviable is a hint for the reaper
1744 			 * thread; because setting fe_want_nonviable and
1745 			 * incrementing the fsw_pending_nonviable counter is
1746 			 * not atomic, let the increment happen first, and
1747 			 * have the thread that loses the CAS decrement it.
1748 			 */
1749 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1750 if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1751 fsw_reap_sched(fsw);
1752 } else {
1753 os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1754 }
1755 }
1756 if (fr != NULL) {
1757 os_atomic_inc(&fr->fr_want_configure, relaxed);
1758 }
1759 }
1760
1761 /* if flow was (or is going to be) marked as nonviable, drop it */
1762 if (__improbable(fe->fe_want_nonviable ||
1763 (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1764 SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1765 SK_KVA(fe));
1766 return false;
1767 }
1768 return true;
1769 }
1770
1771 bool
1772 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1773 {
1774 bool okay;
1775 okay = dp_flow_route_process(fsw, fe);
1776 #if (DEVELOPMENT || DEBUG)
1777 if (okay) {
1778 _FSW_INJECT_ERROR(5, okay, false, null_func);
1779 }
1780 #endif /* DEVELOPMENT || DEBUG */
1781
1782 return okay;
1783 }
1784
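/*
 * Default Rx processing for a user flow: snoop, allocate packets (and
 * extra buflets) from the destination pool, copy and flow-track each
 * source packet, then enqueue the transferred packets onto the flow's
 * Rx ring.  Packets for the host port are diverted to fsw_host_rx().
 */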
1785 void
1786 dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1787 struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
1788 {
1789 #pragma unused(flags)
1790 struct pktq dpkts; /* dst pool alloc'ed packets */
1791 struct pktq disposed_pkts; /* done src packets */
1792 struct pktq dropped_pkts; /* dropped src packets */
1793 struct pktq transferred_pkts; /* dst packet ready for ring */
1794 struct __kern_packet *pkt, *tpkt;
1795 struct kern_pbufpool *dpp;
1796 uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
1797 uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1798 uint16_t buf_array_iter = 0;
1799 uint32_t cnt, buf_cnt = 0;
1800 int err;
1801 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
1802 uint16_t line = 0;
1803
1804 KPKTQ_INIT(&dpkts);
1805 KPKTQ_INIT(&dropped_pkts);
1806 KPKTQ_INIT(&disposed_pkts);
1807 KPKTQ_INIT(&transferred_pkts);
1808
1809 if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1810 SK_ERR("Rx route bad");
1811 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1812 FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1813 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
1814 line = __LINE__;
1815 goto done;
1816 }
1817
1818 if (fe->fe_nx_port == FSW_VP_HOST) {
1819 		/*
1820 		 * The host ring does not exist anymore, so we can't take
1821 		 * the enqueue path below.  This path should only be hit
1822 		 * for the rare TCP fragmentation case.
1823 		 */
1824 fsw_host_rx(fsw, rx_pkts);
1825 return;
1826 }
1827
1828 /* find the ring */
1829 struct __kern_channel_ring *r;
1830 r = fsw_flow_get_rx_ring(fsw, fe);
1831 if (__improbable(r == NULL)) {
1832 fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
1833 reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
1834 line = __LINE__;
1835 goto done;
1836 }
1837
1838 /* snoop before L2 is stripped */
1839 if (__improbable(pktap_total_tap_count != 0)) {
1840 fsw_snoop(fsw, fe, rx_pkts, true);
1841 }
1842
1843 dpp = r->ckr_pp;
1844 /* batch allocate enough packets */
1845 err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1846 SKMEM_NOSLEEP);
1847 if (__improbable(err == ENOMEM)) {
1848 ASSERT(KPKTQ_EMPTY(&dpkts));
1849 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1850 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1851 		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
1852 		    r->ckr_name, SK_KVA(r));
1853 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1854 line = __LINE__;
1855 goto done;
1856 }
1857
1858 /*
1859 * estimate total number of buflets for the packet chain.
1860 */
1861 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1862 if (cnt > n_pkts) {
1863 ASSERT(dpp->pp_max_frags > 1);
1864 cnt -= n_pkts;
1865 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1866 err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1867 SKMEM_NOSLEEP, false);
1868 if (__improbable(buf_cnt == 0)) {
1869 KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
1870 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1871 			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
1872 			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
1873 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
1874 line = __LINE__;
1875 goto done;
1876 }
1877 err = 0;
1878 }
1879
1880 /* extra processing for user flow */
1881 KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
1882 err = 0;
1883 KPKTQ_REMOVE(rx_pkts, pkt);
1884 if (rx_bytes > pkt->pkt_flow_ulen) {
1885 rx_bytes -= pkt->pkt_flow_ulen;
1886 } else {
1887 rx_bytes = 0;
1888 }
1889 err = flow_pkt_track(fe, pkt, true);
1890 _FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1891 if (__improbable(err != 0)) {
1892 SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
1893 FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1894 /* if need to trigger RST */
1895 if (err == ENETRESET) {
1896 flow_track_abort_tcp(fe, pkt, NULL);
1897 }
1898 dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
1899 DROPTAP_FLAG_L2_MISSING);
1900 continue;
1901 }
1902
1903 /* transfer to dpkt */
1904 if (pkt->pkt_qum.qum_pp != dpp) {
1905 struct __kern_buflet *bprev, *bnew;
1906 struct __kern_packet *dpkt = NULL;
1907 uint32_t n_bufs, i;
1908
1909 KPKTQ_DEQUEUE(&dpkts, dpkt);
1910 /* XXX Why would dpkt be NULL at this point? */
1911 if (__improbable(dpkt == NULL)) {
1912 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1913 dp_drop_pkt_single(fsw, pkt, 0,
1914 DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
1915 continue;
1916 }
1917 n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
1918 n_bufs--;
1919 for (i = 0; i < n_bufs; i++) {
1920 if (__improbable(buf_cnt == 0)) {
1921 ASSERT(dpp->pp_max_frags > 1);
1922 buf_array_iter = 0;
1923 cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
1924 n_pkts = KPKTQ_LEN(rx_pkts);
1925 if (cnt >= n_pkts) {
1926 cnt -= n_pkts;
1927 } else {
1928 cnt = 0;
1929 }
1930 cnt += (n_bufs - i);
1931 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1932 cnt);
1933 cnt = buf_cnt;
1934 err = pp_alloc_buflet_batch(dpp,
1935 buf_array, &buf_cnt,
1936 SKMEM_NOSLEEP, false);
1937 if (__improbable(buf_cnt == 0)) {
1938 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1939 dp_drop_pkt_single(fsw, pkt, 0,
1940 DROP_REASON_FSW_PP_ALLOC_FAILED,
1941 DROPTAP_FLAG_L2_MISSING);
1942 pkt = NULL;
1943 pp_free_packet_single(dpkt);
1944 dpkt = NULL;
1945 					SK_ERR("failed to alloc %d "
1946 					    "buflets (err %d) for "
1947 					    "kr %s, 0x%llx", cnt, err,
1948 					    r->ckr_name, SK_KVA(r));
1949 break;
1950 }
1951 err = 0;
1952 }
1953 ASSERT(buf_cnt != 0);
1954 if (i == 0) {
1955 PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1956 }
1957 /*
1958 * XXX -fbounds-safety: can't avoid using forge
1959 * unless we change the signature of
1960 * pp_alloc_buflet_batch().
1961 */
1962 bnew = __unsafe_forge_single(kern_buflet_t,
1963 buf_array[buf_array_iter]);
1964 buf_array[buf_array_iter] = 0;
1965 buf_array_iter++;
1966 buf_cnt--;
1967 VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
1968 bprev, bnew) == 0);
1969 bprev = bnew;
1970 }
1971 if (__improbable(err != 0)) {
1972 continue;
1973 }
1974 err = copy_packet_from_dev(fsw, pkt, dpkt);
1975 _FSW_INJECT_ERROR(43, err, EINVAL, null_func);
1976 if (__improbable(err != 0)) {
1977 SK_ERR("copy packet failed (err %d)", err);
1978 dp_drop_pkt_single(fsw, pkt, 0,
1979 DROP_REASON_FSW_PKT_COPY_FAILED,
1980 DROPTAP_FLAG_L2_MISSING);
1981 pp_free_packet_single(dpkt);
1982 dpkt = NULL;
1983 continue;
1984 }
1985 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1986 pkt = dpkt;
1987 }
1988 _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
1989 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1990 pkt->pkt_policy_id = fe->fe_policy_id;
1991 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1992 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
1993 if (pkt->pkt_bufs_cnt > 1) {
1994 pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1995 pkt->pkt_seg_cnt = 1;
1996 }
1997 KPKTQ_ENQUEUE(&transferred_pkts, pkt);
1998 }
1999 KPKTQ_FINI(rx_pkts);
2000
2001 if (KPKTQ_LEN(&transferred_pkts) > 0) {
2002 fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
2003 }
2004 KPKTQ_FINI(&transferred_pkts);
2005
2006 done:
2007 /* Free unused buflets */
2008 while (buf_cnt > 0) {
2009 /*
2010 * XXX -fbounds-safety: can't avoid using forge unless we change
2011 * the signature of pp_alloc_buflet_batch().
2012 */
2013 pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
2014 (kern_buflet_t)(buf_array[buf_array_iter])));
2015 buf_array[buf_array_iter] = 0;
2016 buf_array_iter++;
2017 buf_cnt--;
2018 }
2019 dp_free_pktq(fsw, &dpkts);
2020 dp_free_pktq(fsw, &disposed_pkts);
2021 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
2022 }
2023
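/*
 * Drain the flow entry's Rx queue in batches under fe_rx_pktq_lock and
 * hand each batch to the flow's Rx processing routine (default, agg,
 * fpd, etc.); once the queue is empty, detach the entry from the
 * pending list and clear its Rx worker thread id.
 */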
2024 static inline void
2025 rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
2026 struct flow_entry_list *fes)
2027 {
2028 struct pktq rx_pkts;
2029 uint32_t rx_bytes;
2030 uint32_t rx_proc_flags;
2031
2032 ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
2033 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
2034
2035 KPKTQ_INIT(&rx_pkts);
2036 for (;;) {
2037 lck_mtx_lock(&fe->fe_rx_pktq_lock);
2038 if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
2039 fe->fe_rx_worker_tid = 0;
2040 TAILQ_REMOVE(fes, fe, fe_rx_link);
2041 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2042 break;
2043 }
2044 KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
2045 KPKTQ_DISPOSE(&fe->fe_rx_pktq);
2046 rx_bytes = fe->fe_rx_pktq_bytes;
2047 rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
2048 fe->fe_rx_pktq_bytes = 0;
2049 fe->fe_rx_frag_count = 0;
2050 lck_mtx_unlock(&fe->fe_rx_pktq_lock);
2051 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
2052 KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
2053 /* flow related processing (default, agg, fpd, etc.) */
2054 fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, rx_proc_flags);
2055 }
2056 ASSERT(KPKTQ_EMPTY(&rx_pkts));
2057
2058 if (__improbable(fe->fe_want_withdraw)) {
2059 fsw_reap_sched(fsw);
2060 }
2061 }
2062
2063 static inline void
2064 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2065 {
2066 	/*
2067 	 * We only care about wake packets of flows that belong to the
2068 	 * flowswitch; wake packets for the host stack are handled by the
2069 	 * host input function.
2070 	 */
2071 #if (DEBUG || DEVELOPMENT)
2072 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
2073 /*
2074 * This is a one shot command
2075 */
2076 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
2077
2078 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
2079 }
2080 #endif /* (DEBUG || DEVELOPMENT) */
2081 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2082 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
2083 }
2084 }
2085
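/*
 * Core Rx path, called with the flowswitch lock held as reader: demux
 * and classify each packet, look up its flow entry and batch the packet
 * there; packets that don't match a flow are sent up to the host stack.
 */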
2086 static void
2087 _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
2088 {
2089 struct __kern_packet *__single pkt, *__single tpkt;
2090 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
2091 struct flow_entry *__single fe, *__single prev_fe;
2092 sa_family_t af;
2093 struct pktq host_pkts, dropped_pkts;
2094 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
2095 uint16_t line = 0;
2096 int err;
2097 uint64_t thread_id;
2098
2099 KPKTQ_INIT(&host_pkts);
2100 KPKTQ_INIT(&dropped_pkts);
2101
2102 if (__improbable(FSW_QUIESCED(fsw))) {
2103 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
2104 KPKTQ_CONCAT(&dropped_pkts, pktq);
2105 reason = DROP_REASON_FSW_QUIESCED;
2106 line = __LINE__;
2107 goto done;
2108 }
2109 if (__improbable(fsw->fsw_demux == NULL)) {
2110 KPKTQ_CONCAT(&dropped_pkts, pktq);
2111 reason = DROP_REASON_FSW_DEMUX_FAILED;
2112 line = __LINE__;
2113 goto done;
2114 }
2115
2116 thread_id = thread_tid(current_thread());
2117 prev_fe = NULL;
2118 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2119 if (__probable(tpkt)) {
2120 void *baddr;
2121 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2122 SK_PREFETCH(baddr, 0);
2123 /* prefetch L3 and L4 flow structs */
2124 SK_PREFETCHW(tpkt->pkt_flow, 0);
2125 SK_PREFETCHW(tpkt->pkt_flow, 128);
2126 }
2127
2128 KPKTQ_REMOVE(pktq, pkt);
2129
2130 pkt = rx_prepare_packet(fsw, pkt);
2131
2132 af = fsw->fsw_demux(fsw, pkt);
2133 if (__improbable(af == AF_UNSPEC)) {
2134 KPKTQ_ENQUEUE(&host_pkts, pkt);
2135 continue;
2136 }
2137
2138 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
2139 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
2140 if (__improbable(err != 0)) {
2141 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
2142 KPKTQ_ENQUEUE(&host_pkts, pkt);
2143 continue;
2144 }
2145
2146 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
2147 pkt = rx_process_ip_frag(fsw, pkt);
2148 if (pkt == NULL) {
2149 continue;
2150 }
2151 }
2152
2153 prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
2154 if (__improbable(fe == NULL)) {
2155 KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
2156 continue;
2157 }
2158
2159 dp_rx_process_wake_packet(fsw, pkt);
2160
2161 rx_flow_batch_packets(&fes, fe, pkt, thread_id);
2162 prev_fe = fe;
2163 }
2164
2165 struct flow_entry *tfe = NULL;
2166 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2167 rx_flow_process(fsw, fe, &fes);
2168 flow_entry_release(&fe);
2169 }
2170
2171 if (!KPKTQ_EMPTY(&host_pkts)) {
2172 fsw_host_rx(fsw, &host_pkts);
2173 }
2174
2175 done:
2176 dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
2177 }
2178
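/*
 * Receive packet steering (RPS) support, available on DEVELOPMENT and
 * DEBUG kernels only: Rx packets are hashed on their 5-tuple and fanned
 * out to a configurable set of worker threads, each of which runs the
 * regular _fsw_receive_locked() path.
 */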
2179 #if (DEVELOPMENT || DEBUG)
2180 static void
2181 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2182 struct __kern_packet *pkt)
2183 {
2184 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2185
2186 lck_mtx_lock_spin(&frt->frt_lock);
2187 KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2188 lck_mtx_unlock(&frt->frt_lock);
2189 }
2190
2191 static void
2192 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2193 {
2194 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2195
2196 ASSERT(frt->frt_thread != THREAD_NULL);
2197 lck_mtx_lock_spin(&frt->frt_lock);
2198 ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2199
2200 frt->frt_requests++;
2201 if (!(frt->frt_flags & FRT_RUNNING)) {
2202 thread_wakeup((caddr_t)frt);
2203 }
2204 lck_mtx_unlock(&frt->frt_lock);
2205 }
2206
2207 __attribute__((noreturn))
2208 static void
2209 fsw_rps_thread_cont(void *v, wait_result_t w)
2210 {
2211 struct fsw_rps_thread *__single frt = v;
2212 struct nx_flowswitch *fsw = frt->frt_fsw;
2213
2214 lck_mtx_lock(&frt->frt_lock);
2215 if (__improbable(w == THREAD_INTERRUPTIBLE ||
2216 (frt->frt_flags & FRT_TERMINATING) != 0)) {
2217 goto terminate;
2218 }
2219 if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2220 goto done;
2221 }
2222 frt->frt_flags |= FRT_RUNNING;
2223
2224 for (;;) {
2225 uint32_t requests = frt->frt_requests;
2226 struct pktq pkts;
2227
2228 KPKTQ_INIT(&pkts);
2229 KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2230 lck_mtx_unlock(&frt->frt_lock);
2231
2232 sk_protect_t protect;
2233 protect = sk_sync_protect();
2234 FSW_RLOCK(fsw);
2235 _fsw_receive_locked(fsw, &pkts);
2236 FSW_RUNLOCK(fsw);
2237 sk_sync_unprotect(protect);
2238
2239 lck_mtx_lock(&frt->frt_lock);
2240 if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2241 requests == frt->frt_requests) {
2242 frt->frt_requests = 0;
2243 break;
2244 }
2245 }
2246
2247 done:
2248 lck_mtx_unlock(&frt->frt_lock);
2249 if (!(frt->frt_flags & FRT_TERMINATING)) {
2250 frt->frt_flags &= ~FRT_RUNNING;
2251 assert_wait(frt, THREAD_UNINT);
2252 thread_block_parameter(fsw_rps_thread_cont, frt);
2253 __builtin_unreachable();
2254 } else {
2255 terminate:
2256 LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2257 frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2258 frt->frt_flags |= FRT_TERMINATED;
2259
2260 if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2261 			thread_wakeup((caddr_t)&frt->frt_thread);
2262 }
2263 lck_mtx_unlock(&frt->frt_lock);
2264
2265 SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2266 frt->frt_idx);
2267
2268 /* for the extra refcnt from kernel_thread_start() */
2269 thread_deallocate(current_thread());
2270 /* this is the end */
2271 thread_terminate(current_thread());
2272 /* NOTREACHED */
2273 __builtin_unreachable();
2274 }
2275
2276 /* must never get here */
2277 VERIFY(0);
2278 /* NOTREACHED */
2279 __builtin_unreachable();
2280 }
2281
2282 __attribute__((noreturn))
2283 static void
2284 fsw_rps_thread_func(void *v, wait_result_t w)
2285 {
2286 #pragma unused(w)
2287 struct fsw_rps_thread *__single frt = v;
2288 struct nx_flowswitch *fsw = frt->frt_fsw;
2289 const char *__null_terminated tname = NULL;
2290
2291 char thread_name[MAXTHREADNAMESIZE];
2292 bzero(thread_name, sizeof(thread_name));
2293 tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2294 if_name(fsw->fsw_ifp), frt->frt_idx);
2295
2296 thread_set_thread_name(frt->frt_thread, tname);
2297 SK_D("%s spawned", tname);
2298
2299 net_thread_marks_push(NET_THREAD_SYNC_RX);
2300 assert_wait(frt, THREAD_UNINT);
2301 (void) thread_block_parameter(fsw_rps_thread_cont, frt);
2302
2303 __builtin_unreachable();
2304 }
2305
2306 static void
2307 fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2308 {
2309 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2310 uint64_t f = (1 * NSEC_PER_MSEC);
2311 uint64_t s = (1000 * NSEC_PER_SEC);
2312 uint32_t c = 0;
2313
2314 lck_mtx_lock(&frt->frt_lock);
2315 frt->frt_flags |= FRT_TERMINATING;
2316
2317 while (!(frt->frt_flags & FRT_TERMINATED)) {
2318 uint64_t t = 0;
2319 nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2320 clock_absolutetime_interval_to_deadline(t, &t);
2321 ASSERT(t != 0);
2322
2323 frt->frt_flags |= FRT_TERMINATEBLOCK;
2324 if (!(frt->frt_flags & FRT_RUNNING)) {
2325 thread_wakeup_one((caddr_t)frt);
2326 }
2327 (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2328 lck_mtx_unlock(&frt->frt_lock);
2329 thread_block(THREAD_CONTINUE_NULL);
2330 lck_mtx_lock(&frt->frt_lock);
2331 frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2332 }
2333 ASSERT(frt->frt_flags & FRT_TERMINATED);
2334 lck_mtx_unlock(&frt->frt_lock);
2335 frt->frt_thread = THREAD_NULL;
2336 }
2337
2338 static void
2339 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2340 {
2341 kern_return_t error;
2342 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2343
2344 lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2345 frt->frt_idx = i;
2346 frt->frt_fsw = fsw;
2347 error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2348 ASSERT(!error);
2349 KPKTQ_INIT(&frt->frt_pktq);
2350 }
2351
2352 int
2353 fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
2354 {
2355 if (n > FSW_RPS_MAX_NTHREADS) {
2356 SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2357 return EINVAL;
2358 }
2359
2360 FSW_WLOCK(fsw);
2361 if (n < fsw->fsw_rps_nthreads) {
2362 for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2363 fsw_rps_thread_join(fsw, i);
2364 }
2365 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2366 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2367 fsw->fsw_rps_nthreads = n;
2368 } else if (n > fsw->fsw_rps_nthreads) {
2369 uint32_t nthreads_old = fsw->fsw_rps_nthreads;
2370
2371 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2372 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2373 fsw->fsw_rps_nthreads = n;
2374 for (uint32_t i = nthreads_old; i < n; i++) {
2375 fsw_rps_thread_spawn(fsw, i);
2376 }
2377 }
2378 FSW_WUNLOCK(fsw);
2379 return 0;
2380 }
2381
2382 static uint32_t
2383 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2384 {
2385 sa_family_t af = fsw->fsw_demux(fsw, pkt);
2386 if (__improbable(af == AF_UNSPEC)) {
2387 return 0;
2388 }
2389
2390 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2391
2392 if (__improbable((pkt->pkt_qum_qflags &
2393 QUM_F_FLOW_CLASSIFIED) == 0)) {
2394 return 0;
2395 }
2396
2397 struct flow_key key;
2398 flow_pkt2key(pkt, true, &key);
2399 key.fk_mask = FKMASK_5TUPLE;
2400
2401 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2402
2403 return id;
2404 }
2405
2406 #endif /* DEVELOPMENT || DEBUG */
2407
2408 void
2409 fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2410 {
2411 FSW_RLOCK(fsw);
2412 #if (DEVELOPMENT || DEBUG)
2413 if (fsw->fsw_rps_nthreads != 0) {
2414 struct __kern_packet *pkt, *tpkt;
2415 bitmap_t map = 0;
2416
2417 _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2418 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2419 uint32_t id = get_rps_id(fsw, pkt);
2420 KPKTQ_REMOVE(pktq, pkt);
2421 fsw_rps_rx(fsw, id, pkt);
2422 bitmap_set(&map, id);
2423 }
2424 for (int i = bitmap_first(&map, 64); i >= 0;
2425 i = bitmap_next(&map, i)) {
2426 fsw_rps_thread_schedule(fsw, i);
2427 }
2428 } else
2429 #endif /* DEVELOPMENT || DEBUG */
2430 {
2431 _fsw_receive_locked(fsw, pktq);
2432 }
2433 FSW_RUNLOCK(fsw);
2434 }
2435
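/*
 * Dequeue handler used when an input netem (network emulation) scheduler
 * is configured: repackage the dequeued pktsched packets into a pktq and
 * run them through the normal fsw_receive() path under sync protection.
 */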
2436 int
2437 fsw_dev_input_netem_dequeue(void *handle,
2438 pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2439 {
2440 #pragma unused(handle)
2441 struct nx_flowswitch *__single fsw = handle;
2442 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2443 struct pktq pktq;
2444 sk_protect_t protect;
2445 uint32_t i;
2446
2447 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2448
2449 for (i = 0; i < n_pkts; i++) {
2450 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2451 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2452 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2453 }
2454
2455 protect = sk_sync_protect();
2456 KPKTQ_INIT(&pktq);
2457 pkts_to_pktq(kpkts, n_pkts, &pktq);
2458
2459 fsw_receive(fsw, &pktq);
2460 KPKTQ_FINI(&pktq);
2461 sk_sync_unprotect(protect);
2462
2463 return 0;
2464 }
2465
2466 static void
2467 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2468 {
2469 classq_pkt_t p;
2470 struct netem *__single ne;
2471 struct __kern_packet *pkt, *tpkt;
2472
2473 ASSERT(fsw->fsw_ifp != NULL);
2474 ne = fsw->fsw_ifp->if_input_netem;
2475 ASSERT(ne != NULL);
2476 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2477 bool pdrop;
2478 KPKTQ_REMOVE(q, pkt);
2479 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2480 netem_enqueue(ne, &p, &pdrop);
2481 }
2482 }
2483
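/*
 * Rx entry point from the device nexus adapter: walk the packet chain,
 * drop packets that were never finalized, and flush batches of up to
 * fsw_rx_batch packets either into netem (if configured on the
 * interface) or directly into fsw_receive().
 */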
2484 void
2485 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2486 struct nexus_pkt_stats *out_stats)
2487 {
2488 struct __kern_packet *pkt = pkt_head, *next;
2489 struct nx_flowswitch *fsw;
2490 uint32_t n_bytes = 0, n_pkts = 0;
2491 uint64_t total_pkts = 0, total_bytes = 0;
2492 struct pktq q;
2493
2494 KPKTQ_INIT(&q);
2495 if (__improbable(devna->na_ifp == NULL ||
2496 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2497 		SK_ERR("fsw not attached, dropping pkt chain");
2498 dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2499 return;
2500 }
2501 while (pkt != NULL) {
2502 if (__improbable(pkt->pkt_trace_id != 0)) {
2503 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2504 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2505 }
2506 next = pkt->pkt_nextpkt;
2507 pkt->pkt_nextpkt = NULL;
2508
2509 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2510 KPKTQ_ENQUEUE(&q, pkt);
2511 n_bytes += pkt->pkt_length;
2512 } else {
2513 DTRACE_SKYWALK1(non__finalized__drop,
2514 struct __kern_packet *, pkt);
2515 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2516 dp_drop_pkt_single(fsw, pkt, 0,
2517 DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2518 DROPTAP_FLAG_L2_MISSING);
2519 pkt = NULL;
2520 }
2521 n_pkts = KPKTQ_LEN(&q);
2522 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2523 if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2524 fsw_dev_input_netem_enqueue(fsw, &q);
2525 } else {
2526 fsw_receive(fsw, &q);
2527 }
2528 total_pkts += n_pkts;
2529 total_bytes += n_bytes;
2530 n_pkts = 0;
2531 n_bytes = 0;
2532 KPKTQ_FINI(&q);
2533 }
2534 pkt = next;
2535 }
2536 ASSERT(KPKTQ_LEN(&q) == 0);
2537 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2538 if (out_stats != NULL) {
2539 out_stats->nps_pkts += total_pkts;
2540 out_stats->nps_bytes += total_bytes;
2541 }
2542 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2543 }
2544
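/*
 * Tx copy for compat (mbuf) drivers: copy the source packet into a
 * freshly allocated mbuf attached to dpkt, and also copy the leading
 * bytes into dpkt's own buffer so classification can still parse the
 * headers.
 */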
2545 static int
2546 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2547 struct __kern_packet *dpkt)
2548 {
2549 struct mbuf *__single m = NULL;
2550 uint32_t bdlen, bdlim, bdoff;
2551 uint8_t *bdaddr;
2552 unsigned int one = 1;
2553 int err = 0;
2554
2555 err = mbuf_allocpacket(MBUF_DONTWAIT,
2556 (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2557 #if (DEVELOPMENT || DEBUG)
2558 if (m != NULL) {
2559 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2560 }
2561 #endif /* DEVELOPMENT || DEBUG */
2562 if (__improbable(m == NULL)) {
2563 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2564 err = ENOBUFS;
2565 goto done;
2566 }
2567
2568 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2569 if (fsw->fsw_frame_headroom > bdlim) {
2570 SK_ERR("not enough space in buffer for headroom");
2571 err = EINVAL;
2572 goto done;
2573 }
2574
2575 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2576 dpkt->pkt_mbuf = m;
2577 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2578
2579 /* packet copy into mbuf */
2580 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2581 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2582 fsw->fsw_frame_headroom, spkt->pkt_length,
2583 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2584 spkt->pkt_csum_tx_start_off);
2585 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2586
2587 /* header copy into dpkt buffer for classification */
2588 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2589 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2590 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2591 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2592 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2593 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2594 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2595 if (copy_len < spkt->pkt_length) {
2596 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2597 }
2598
2599 	/*
2600 	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data the
2601 	 * same as the buflet baddr; m_data always points to the beginning of
2602 	 * the packet and should correspond to baddr + headroom.
2603 	 */
2604 ASSERT((uintptr_t)m->m_data ==
2605 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2606
2607 done:
2608 return err;
2609 }
2610
2611 static int
2612 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2613 struct __kern_packet *dpkt)
2614 {
2615 struct ifnet *ifp = fsw->fsw_ifp;
2616 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2617
2618 if (headroom > UINT8_MAX) {
2619 SK_ERR("headroom too large %d", headroom);
2620 return ERANGE;
2621 }
2622 dpkt->pkt_headroom = (uint8_t)headroom;
2623 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2624 dpkt->pkt_l2_len = 0;
2625 dpkt->pkt_link_flags = spkt->pkt_link_flags;
2626
2627 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2628 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2629 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2630 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2631 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2632 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2633 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2634 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2635 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2636 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2637
2638 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2639
2640 return 0;
2641 }
2642
2643 #if SK_LOG
2644 /* Hoisted out of line to reduce kernel stack footprint */
2645 SK_LOG_ATTRIBUTE
2646 static void
2647 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2648 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2649 {
2650 struct proc *p = current_proc();
2651 struct ifnet *ifp = fsw->fsw_ifp;
2652 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2653
2654 if (error == ERANGE) {
2655 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2656 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2657 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2658 (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2659 } else if (error == ENOBUFS) {
2660 SK_DF(logflags, "%s(%d) packet allocation failure",
2661 sk_proc_name_address(p), sk_proc_pid(p));
2662 } else if (error == 0) {
2663 ASSERT(dpkt != NULL);
2664 char *daddr;
2665 uint32_t pkt_len;
2666
2667 MD_BUFLET_ADDR_ABS(dpkt, daddr);
2668 pkt_len = __packet_get_real_data_length(dpkt);
2669 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2670 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
2671 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2672 (uint32_t)fsw->fsw_frame_headroom,
2673 (uint32_t)ifp->if_tx_headroom);
2674 SK_DF(logflags | SK_VERB_DUMP, "%s",
2675 sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
2676 } else {
2677 		SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p), sk_proc_pid(p), error);
2678 }
2679 }
2680 #else
2681 #define dp_copy_to_dev_log(...)
2682 #endif /* SK_LOG */
2683
2684 static void
2685 fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2686 {
2687 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2688 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2689
2690 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2691 /* Copy packet metadata */
2692 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2693 _PKT_COPY(spkt, dpkt);
2694 _PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2695 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2696 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2697 ASSERT(dpkt->pkt_mbuf == NULL);
2698
2699 /* Copy AQM metadata */
2700 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2701 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2702 _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
2703 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2704 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2705 dpkt->pkt_policy_id = spkt->pkt_policy_id;
2706 dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2707 }
2708
2709 static int
2710 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2711 struct __kern_packet *dpkt)
2712 {
2713 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2714 struct ifnet *ifp = fsw->fsw_ifp;
2715 uint32_t dev_pkt_len;
2716 int err = 0;
2717
2718 fsw_pkt_copy_metadata(spkt, dpkt);
2719 switch (fsw->fsw_classq_enq_ptype) {
2720 case QP_MBUF:
2721 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2722 break;
2723
2724 case QP_PACKET:
2725 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2726 spkt->pkt_length;
2727 if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2728 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2729 err = ERANGE;
2730 goto done;
2731 }
2732 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2733 break;
2734
2735 default:
2736 VERIFY(0);
2737 __builtin_unreachable();
2738 }
2739 done:
2740 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2741 return err;
2742 }
2743
2744 static int
2745 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2746 struct __kern_packet *dpkt)
2747 {
2748 uint8_t *sbaddr, *dbaddr;
2749 uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2750 uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2751
2752 fsw_pkt_copy_metadata(spkt, dpkt);
2753
2754 MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2755 ASSERT(sbaddr != NULL);
2756 sbaddr += spkt->pkt_headroom;
2757
2758 MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2759 ASSERT(dbaddr != NULL);
2760 dpkt->pkt_headroom = (uint8_t)headroom;
2761 dbaddr += headroom;
2762
2763 pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2764 METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2765
2766 /* packet length is set to the full length */
2767 dpkt->pkt_length = spkt->pkt_length;
2768 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2769 return 0;
2770 }
2771
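/*
 * Detach the mbuf carried by a compat packet, transfer the flow metadata
 * gathered during classification into its pkthdr, then free the packet
 * and return the mbuf.
 */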
2772 static struct mbuf *
2773 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2774 {
2775 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2776 ASSERT(pkt->pkt_mbuf != NULL);
2777 struct mbuf *m = pkt->pkt_mbuf;
2778
2779 /* pass additional metadata generated from flow parse/lookup */
2780 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2781 sizeof(pkt->pkt_flow_token));
2782 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2783 sizeof(pkt->pkt_flowsrc_token));
2784 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2785 sizeof(pkt->pkt_flowsrc_fidx));
2786 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2787 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2788 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2789 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2790 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2791 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2792 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2793
2794 if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2795 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2796 }
2797
2798 /* The packet should have a timestamp by the time we get here. */
2799 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2800 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2801
2802 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2803 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2804 /* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2805 m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
2806
2807 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2808 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2809 }
2810 KPKT_CLEAR_MBUF_DATA(pkt);
2811
2812 /* mbuf has been consumed, release packet as well */
2813 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2814 pp_free_packet_single(pkt);
2815 return m;
2816 }
2817
2818 static void
2819 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2820 struct mbuf **head, struct mbuf **tail,
2821 uint32_t *cnt, uint32_t *bytes)
2822 {
2823 struct __kern_packet *pkt = pkt_list, *next;
2824 struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2825 struct mbuf *__single m = NULL;
2826 uint32_t c = 0, b = 0;
2827
2828 while (pkt != NULL) {
2829 next = pkt->pkt_nextpkt;
2830 pkt->pkt_nextpkt = NULL;
2831 m = convert_pkt_to_mbuf(pkt);
2832 ASSERT(m != NULL);
2833
2834 *m_tailp = m;
2835 m_tailp = &m->m_nextpkt;
2836 c++;
2837 b += m_pktlen(m);
2838 pkt = next;
2839 }
2840 if (head != NULL) {
2841 *head = m_head;
2842 }
2843 if (tail != NULL) {
2844 *tail = m;
2845 }
2846 if (cnt != NULL) {
2847 *cnt = c;
2848 }
2849 if (bytes != NULL) {
2850 *bytes = b;
2851 }
2852 }
2853
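/*
 * Enqueue a single packet into the interface output queue (AQM),
 * converting it to an mbuf first for compat interfaces; the packet or
 * mbuf is consumed by ifnet_enqueue regardless of the outcome.
 */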
2854 SK_NO_INLINE_ATTRIBUTE
2855 static int
2856 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2857 struct __kern_packet *pkt)
2858 {
2859 struct ifnet *ifp = fsw->fsw_ifp;
2860 boolean_t pkt_drop = FALSE;
2861 int err;
2862
2863 FSW_LOCK_ASSERT_HELD(fsw);
2864 ASSERT(fsw->fsw_classq_enabled);
2865 ASSERT(pkt->pkt_flow_token != 0);
2866 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2867 1, pkt->pkt_length);
2868
2869 if (__improbable(pkt->pkt_trace_id != 0)) {
2870 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2871 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2872 }
2873
2874 switch (fsw->fsw_classq_enq_ptype) {
2875 case QP_MBUF: { /* compat interface */
2876 struct mbuf *m;
2877
2878 m = convert_pkt_to_mbuf(pkt);
2879 ASSERT(m != NULL);
2880 pkt = NULL;
2881
2882 /* ifnet_enqueue consumes mbuf */
2883 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2884 m = NULL;
2885 #if (DEVELOPMENT || DEBUG)
2886 if (__improbable(!pkt_drop)) {
2887 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2888 }
2889 #endif /* DEVELOPMENT || DEBUG */
2890 if (pkt_drop) {
2891 FSW_STATS_INC(FSW_STATS_DROP);
2892 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2893 }
2894 break;
2895 }
2896 case QP_PACKET: { /* native interface */
2897 /* ifnet_enqueue consumes packet */
2898 err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2899 pkt = NULL;
2900 #if (DEVELOPMENT || DEBUG)
2901 if (__improbable(!pkt_drop)) {
2902 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2903 }
2904 #endif /* DEVELOPMENT || DEBUG */
2905 if (pkt_drop) {
2906 FSW_STATS_INC(FSW_STATS_DROP);
2907 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2908 }
2909 break;
2910 }
2911 default:
2912 err = EINVAL;
2913 VERIFY(0);
2914 /* NOTREACHED */
2915 __builtin_unreachable();
2916 }
2917
2918 return err;
2919 }
2920
2921 static int
2922 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2923 struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
2924 uint32_t cnt, uint32_t bytes)
2925 {
2926 struct ifnet *ifp = fsw->fsw_ifp;
2927 boolean_t pkt_drop = FALSE;
2928 uint32_t svc;
2929 int err;
2930
2931 FSW_LOCK_ASSERT_HELD(fsw);
2932 ASSERT(fsw->fsw_classq_enabled);
2933 ASSERT(pkt_head->pkt_flow_token != 0);
2934
2935 /*
2936 * All packets in the flow should have the same svc.
2937 */
2938 svc = pkt_head->pkt_svc_class;
2939 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2940
2941 switch (fsw->fsw_classq_enq_ptype) {
2942 case QP_MBUF: { /* compat interface */
2943 struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
2944 uint32_t c = 0, b = 0;
2945
2946 convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2947 ASSERT(m_head != NULL && m_tail != NULL);
2948 ASSERT(c == cnt);
2949 ASSERT(b == bytes);
2950 pkt_head = NULL;
2951
2952 /* ifnet_enqueue consumes mbuf */
2953 err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
2954 bytes, FALSE, &pkt_drop);
2955 m_head = NULL;
2956 m_tail = NULL;
2957 #if (DEVELOPMENT || DEBUG)
2958 if (__improbable(!pkt_drop)) {
2959 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2960 }
2961 #endif /* DEVELOPMENT || DEBUG */
2962 if (pkt_drop) {
2963 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2964 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2965 cnt);
2966 }
2967 break;
2968 }
2969 case QP_PACKET: { /* native interface */
2970 /* ifnet_enqueue consumes packet */
2971 err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
2972 bytes, FALSE, &pkt_drop);
2973 pkt_head = NULL;
2974 #if (DEVELOPMENT || DEBUG)
2975 if (__improbable(!pkt_drop)) {
2976 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2977 }
2978 #endif /* DEVELOPMENT || DEBUG */
2979 if (pkt_drop) {
2980 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2981 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2982 cnt);
2983 }
2984 break;
2985 }
2986 default:
2987 err = EINVAL;
2988 VERIFY(0);
2989 /* NOTREACHED */
2990 __builtin_unreachable();
2991 }
2992
2993 return err;
2994 }
2995
2996 /*
2997 * This code path needs to be kept for interfaces without logical link support.
2998 */
2999 static void
3000 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3001 bool chain, uint32_t cnt, uint32_t bytes)
3002 {
3003 bool flowadv_is_set = false;
3004 struct __kern_packet *pkt, *tail, *tpkt;
3005 flowadv_idx_t flow_adv_idx;
3006 bool flowadv_cap;
3007 flowadv_token_t flow_adv_token;
3008 int err;
3009
3010 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3011 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3012
3013 if (chain) {
3014 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3015 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3016 KPKTQ_INIT(&fe->fe_tx_pktq);
3017 if (pkt == NULL) {
3018 return;
3019 }
3020 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3021 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3022 flow_adv_token = pkt->pkt_flow_token;
3023
3024 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
3025
3026 /* set flow advisory if needed */
3027 if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
3028 flowadv_cap)) {
3029 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
3030 flow_adv_idx, flow_adv_token);
3031 }
3032 DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3033 bool, flowadv_is_set);
3034 } else {
3035 uint32_t c = 0, b = 0;
3036
3037 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3038 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3039
3040 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3041 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3042 flow_adv_token = pkt->pkt_flow_token;
3043
3044 c++;
3045 b += pkt->pkt_length;
3046 err = classq_enqueue_flow_single(fsw, pkt);
3047
3048 /* set flow advisory if needed */
3049 if (__improbable(!flowadv_is_set &&
3050 ((err == EQFULL || err == EQSUSPENDED) &&
3051 flowadv_cap))) {
3052 flowadv_is_set = na_flowadv_set(
3053 flow_get_na(fsw, fe), flow_adv_idx,
3054 flow_adv_token);
3055 }
3056 }
3057 ASSERT(c == cnt);
3058 ASSERT(b == bytes);
3059 DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
3060 bool, flowadv_is_set);
3061 }
3062
3063 /* notify flow advisory event */
3064 if (__improbable(flowadv_is_set)) {
3065 struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
3066 if (__probable(r)) {
3067 na_flowadv_event(r);
3068 SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
3069 "%s(%d) notified of flow update",
3070 sk_proc_name_address(current_proc()),
3071 sk_proc_pid(current_proc()));
3072 }
3073 }
3074 }
3075
3076 /*
3077 * Logical link code path
3078 */
3079 static void
3080 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
3081 bool chain, uint32_t cnt, uint32_t bytes)
3082 {
3083 #pragma unused(chain)
3084 struct __kern_packet *pkt, *tail;
3085 flowadv_idx_t flow_adv_idx;
3086 bool flowadv_is_set = false;
3087 bool flowadv_cap;
3088 flowadv_token_t flow_adv_token;
3089 uint32_t flowctl = 0, dropped = 0;
3090 int err;
3091
3092 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
3093 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
3094
3095 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
3096 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
3097 KPKTQ_INIT(&fe->fe_tx_pktq);
3098 if (pkt == NULL) {
3099 return;
3100 }
3101 flow_adv_idx = pkt->pkt_flowsrc_fidx;
3102 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
3103 flow_adv_token = pkt->pkt_flow_token;
3104
3105 err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
3106 &flowctl, &dropped);
3107
3108 if (__improbable(err != 0)) {
3109 /* set flow advisory if needed */
3110 if (flowctl > 0 && flowadv_cap) {
3111 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
3112 flow_adv_idx, flow_adv_token);
3113
3114 /* notify flow advisory event */
3115 if (flowadv_is_set) {
3116 struct __kern_channel_ring *r =
3117 fsw_flow_get_tx_ring(fsw, fe);
3118 if (__probable(r)) {
3119 na_flowadv_event(r);
3120 SK_DF(SK_VERB_FLOW_ADVISORY |
3121 SK_VERB_TX,
3122 "%s(%d) notified of flow update",
3123 sk_proc_name_address(current_proc()),
3124 sk_proc_pid(current_proc()));
3125 }
3126 }
3127 }
3128 if (dropped > 0) {
3129 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
3130 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
3131 dropped);
3132 }
3133 }
3134 }
3135
3136 static void
3137 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3138 {
3139 #pragma unused(fsw)
3140 /* finalize here; no more changes to buflets after classq */
3141 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3142 kern_packet_t ph = SK_PTR_ENCODE(pkt,
3143 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3144 int err = __packet_finalize(ph);
3145 VERIFY(err == 0);
3146 }
3147 }
3148
3149 static bool
3150 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3151 {
3152 struct flow_route *fr = fe->fe_route;
3153 int err;
3154
3155 ASSERT(fr != NULL);
3156
3157 if (__improbable(!dp_flow_route_process(fsw, fe))) {
3158 return false;
3159 }
3160 if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
3161 flow_qset_select_dynamic(fsw, fe, TRUE);
3162 }
3163
3164 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
3165 _fsw_error35_handler, 1, fr, NULL, NULL);
3166 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3167 _fsw_error36_handler, 1, fr, NULL);
3168
3169 /*
3170 * See if we need to resolve the flow route; note the test against
3171 * fr_flags here is done without any lock for performance. Thus
3172 * it's possible that we race against the thread performing route
3173 * event updates for a packet (which is OK). In any case we should
3174 * not have any assertion on fr_flags value(s) due to the lack of
3175 * serialization.
3176 */
3177 if (fr->fr_flags & FLOWRTF_RESOLVED) {
3178 goto frame;
3179 }
3180
3181 struct __kern_packet *pkt, *tpkt;
3182 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3183 err = fsw->fsw_resolve(fsw, fr, pkt);
3184 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3185 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3186 /*
3187 * If resolver returns EJUSTRETURN then we drop the pkt as the
3188 * resolver should have converted the pkt into mbuf (or
3189 * detached the attached mbuf from pkt) and added it to the
3190 * llinfo queue. If we do have a cached llinfo, then proceed
3191 * to using it even though it may be stale (very unlikely)
3192 * while the resolution is in progress.
3193 * Otherwise, any other error results in dropping pkt.
3194 */
3195 if (err == EJUSTRETURN) {
3196 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3197 pp_free_packet_single(pkt);
3198 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3199 continue;
3200 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3201 /* use existing llinfo */
3202 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3203 } else if (err != 0) {
3204 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3205 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
3206 DROPTAP_FLAG_L2_MISSING);
3207 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3208 continue;
3209 }
3210 }
3211
3212 frame:
3213 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3214 if (fsw->fsw_frame != NULL) {
3215 fsw->fsw_frame(fsw, fr, pkt);
3216 }
3217 }
3218
3219 return true;
3220 }
3221
3222 static void
3223 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3224 {
3225 #pragma unused(fsw)
3226 struct __kern_packet *pkt, *tpkt;
3227 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3228 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3229 /* listener is only allowed TCP RST */
3230 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3231 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3232 flow_track_abort_tcp(fe, NULL, pkt);
3233 } else {
3234 char *addr;
3235
3236 MD_BUFLET_ADDR_ABS(pkt, addr);
3237 SK_ERR("listener flow sends non-RST packet %s",
3238 sk_dump(sk_proc_name_address(current_proc()),
3239 addr, __packet_get_real_data_length(pkt), 128, NULL, 0));
3240 }
3241 pp_free_packet_single(pkt);
3242 }
3243 }
3244
3245 static void
3246 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3247 volatile uint64_t *rt_ts, ifnet_t ifp)
3248 {
3249 struct timespec now;
3250 uint64_t now_nsec = 0;
3251
3252 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3253 nanouptime(&now);
3254 net_timernsec(&now, &now_nsec);
3255 pkt->pkt_timestamp = now_nsec;
3256 }
3257 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3258
3259 /*
3260 * If the packet service class is not background,
3261 * update the timestamps on the interface, as well as
3262 * the ones in nexus-wide advisory to indicate recent
3263 * activity on a foreground flow.
3264 */
3265 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3266 ifp->if_fg_sendts = (uint32_t)_net_uptime;
3267 if (fg_ts != NULL) {
3268 *fg_ts = _net_uptime;
3269 }
3270 }
3271 if (pkt->pkt_pflags & PKT_F_REALTIME) {
3272 ifp->if_rt_sendts = (uint32_t)_net_uptime;
3273 if (rt_ts != NULL) {
3274 *rt_ts = _net_uptime;
3275 }
3276 }
3277 }
3278
3279 static bool
3280 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled)
3281 {
3282 return fsw_chain_enqueue != 0 &&
3283 fsw->fsw_ifp->if_output_netem == NULL &&
3284 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
3285 gso_enabled;
3286 }
3287
3288 void
3289 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3290 uint32_t flags)
3291 {
3292 struct pktq dropped_pkts;
3293 bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3294 uint32_t cnt = 0, bytes = 0;
3295 volatile struct sk_nexusadv *nxadv = NULL;
3296 volatile uint64_t *fg_ts = NULL;
3297 volatile uint64_t *rt_ts = NULL;
3298 uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3299 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3300 uint16_t line = 0;
3301
3302 KPKTQ_INIT(&dropped_pkts);
3303 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3304 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3305 dp_listener_flow_tx_process(fsw, fe);
3306 return;
3307 }
3308 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3309 SK_RDERR(5, "Tx route bad");
3310 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3311 KPKTQ_LEN(&fe->fe_tx_pktq));
3312 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3313 reason = DROP_REASON_FSW_FLOW_NONVIABLE;
3314 line = __LINE__;
3315 goto done;
3316 }
3317 chain = fsw_chain_enqueue_enabled(fsw, gso);
3318 if (chain) {
3319 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3320 if (nxadv != NULL) {
3321 fg_ts = &nxadv->nxadv_fg_sendts;
3322 rt_ts = &nxadv->nxadv_rt_sendts;
3323 }
3324 }
3325 struct __kern_packet *pkt, *tpkt;
3326 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3327 int err = 0;
3328
3329 err = flow_pkt_track(fe, pkt, false);
3330 if (__improbable(err != 0)) {
3331 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3332 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3333 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3334 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
3335 DROPTAP_FLAG_L2_MISSING);
3336 continue;
3337 }
3338 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3339 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3340
3341 /* set AQM related values for outgoing packet */
3342 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3343 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3344 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3345 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3346 } else {
3347 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3348 }
3349 _UUID_CLEAR(pkt->pkt_flow_id);
3350 pkt->pkt_flow_token = fe->fe_flowid;
3351 pkt->pkt_pflags |= PKT_F_FLOW_ID;
3352 pkt->pkt_qset_idx = qset_idx;
3353 pkt->pkt_policy_id = fe->fe_policy_id;
3354 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3355
3356 /*
3357 * The same code is exercised per packet for the non-chain case
3358 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3359 * re-walking the chain later.
3360 */
3361 if (chain) {
3362 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
3363 }
3364 /* mark packet tos/svc_class */
3365 fsw_qos_mark(fsw, fe, pkt);
3366
3367 tx_finalize_packet(fsw, pkt);
3368 bytes += pkt->pkt_length;
3369 cnt++;
3370 }
3371
3372 /* snoop after it's finalized */
3373 if (__improbable(pktap_total_tap_count != 0)) {
3374 fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
3375 }
3376 if (fe->fe_qset != NULL) {
3377 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3378 } else {
3379 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3380 }
3381 done:
3382 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
3383 }
3384
3385 static struct flow_entry *
3386 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3387 struct flow_entry *prev_fe, struct __kern_packet *pkt)
3388 {
3389 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3390
3391 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3392 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3393 SK_ERR("%s(%d) invalid zero fragment id",
3394 sk_proc_name_address(current_proc()),
3395 sk_proc_pid(current_proc()));
3396 return NULL;
3397 }
3398
3399 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
3400 "%s(%d) continuation frag, id %u",
3401 sk_proc_name_address(current_proc()),
3402 sk_proc_pid(current_proc()),
3403 pkt->pkt_flow_ip_frag_id);
3404 if (__improbable(prev_fe == NULL ||
3405 !prev_fe->fe_tx_is_cont_frag)) {
3406 SK_ERR("%s(%d) unexpected continuation frag",
3407 sk_proc_name_address(current_proc()),
3408 sk_proc_pid(current_proc()),
3409 pkt->pkt_flow_ip_frag_id);
3410 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3411 return NULL;
3412 }
3413 if (__improbable(pkt->pkt_flow_ip_frag_id !=
3414 prev_fe->fe_tx_frag_id)) {
3415 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3416 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
3417 sk_proc_name_address(current_proc()),
3418 sk_proc_pid(current_proc()),
3419 pkt->pkt_flow_ip_frag_id,
3420 prev_fe->fe_tx_frag_id);
3421 return NULL;
3422 }
3423
3424 return prev_fe;
3425 }
3426
3427 static struct flow_entry *
3428 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3429 struct flow_entry *prev_fe)
3430 {
3431 struct flow_entry *__single fe;
3432
3433 fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3434 if (__improbable(fe == NULL)) {
3435 goto done;
3436 }
3437
3438 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3439 SK_RDERR(5, "Tx flow torn down");
3440 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3441 flow_entry_release(&fe);
3442 goto done;
3443 }
3444
3445 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3446 null_func);
3447
3448 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3449 uuid_string_t flow_id_str, pkt_id_str;
3450 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3451 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3452 SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
3453 flow_entry_release(&fe);
3454 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3455 }
3456
3457 done:
3458 return fe;
3459 }
3460
3461 static inline void
3462 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3463 uint32_t flags)
3464 {
3465 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3466 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3467
3468 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3469 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3470
3471 /* flow related processing (default, agg, etc.) */
3472 fe->fe_tx_process(fsw, fe, flags);
3473
3474 KPKTQ_FINI(&fe->fe_tx_pktq);
3475 }
3476
3477 #if SK_LOG
3478 static void
3479 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3480 {
3481 char *pkt_buf;
3482 uint32_t pkt_len;
3483
3484 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3485 pkt_len = __packet_get_real_data_length(pkt);
3486 SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
3487 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
3488 128, NULL, 0));
3489 }
3490 #else /* !SK_LOG */
3491 #define dp_tx_log_pkt(...)
3492 #endif /* !SK_LOG */
3493
3494 static inline struct ifnet *
3495 fsw_datamov_begin(struct nx_flowswitch *fsw)
3496 {
3497 struct ifnet *ifp;
3498
3499 ifp = fsw->fsw_ifp;
3500 if (!ifnet_datamov_begin(ifp)) {
3501 DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3502 return NULL;
3503 }
3504 return ifp;
3505 }
3506
3507 static inline void
3508 fsw_datamov_end(struct nx_flowswitch *fsw)
3509 {
3510 ifnet_datamov_end(fsw->fsw_ifp);
3511 }
3512
3513 static void
3514 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3515 {
3516 struct __kern_packet *spkt, *pkt;
3517 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3518 struct flow_entry *__single fe, *__single prev_fe;
3519 struct pktq dropped_pkts, dpktq;
3520 struct nexus_adapter *dev_na;
3521 struct kern_pbufpool *dev_pp;
3522 struct ifnet *ifp = NULL;
3523 sa_family_t af;
3524 uint32_t n_pkts, n_flows = 0;
3525 boolean_t do_pacing = FALSE;
3526 drop_reason_t reason = DROP_REASON_UNSPECIFIED;
3527 uint16_t line = 0;
3528
3529 int err;
3530 KPKTQ_INIT(&dpktq);
3531 KPKTQ_INIT(&dropped_pkts);
3532 n_pkts = KPKTQ_LEN(spktq);
3533
3534 FSW_RLOCK(fsw);
3535 if (__improbable(FSW_QUIESCED(fsw))) {
3536 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3537 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3538 KPKTQ_CONCAT(&dropped_pkts, spktq);
3539 reason = DROP_REASON_FSW_QUIESCED;
3540 line = __LINE__;
3541 goto done;
3542 }
3543 dev_na = fsw->fsw_dev_ch->ch_na;
3544 if (__improbable(dev_na == NULL)) {
3545 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3546 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3547 KPKTQ_CONCAT(&dropped_pkts, spktq);
3548 reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
3549 line = __LINE__;
3550 goto done;
3551 }
3552 ifp = fsw_datamov_begin(fsw);
3553 if (ifp == NULL) {
3554 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3555 KPKTQ_CONCAT(&dropped_pkts, spktq);
3556 reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
3557 line = __LINE__;
3558 goto done;
3559 }
3560
3561 /* batch allocate enough packets */
3562 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3563
3564 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3565 NULL, SKMEM_NOSLEEP);
3566 #if DEVELOPMENT || DEBUG
3567 if (__probable(err != ENOMEM)) {
3568 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3569 }
3570 #endif /* DEVELOPMENT || DEBUG */
3571 if (__improbable(err == ENOMEM)) {
3572 ASSERT(KPKTQ_EMPTY(&dpktq));
3573 KPKTQ_CONCAT(&dropped_pkts, spktq);
3574 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3575 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3576 reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
3577 line = __LINE__;
3578 goto done;
3579 } else if (__improbable(err == EAGAIN)) {
3580 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3581 (n_pkts - KPKTQ_LEN(&dpktq)));
3582 FSW_STATS_ADD(FSW_STATS_DROP,
3583 (n_pkts - KPKTQ_LEN(&dpktq)));
3584 }
3585
3586 n_pkts = KPKTQ_LEN(&dpktq);
3587 prev_fe = NULL;
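/*
 * prev_fe acts as a one-entry flow lookup cache: consecutive packets
 * belonging to the same flow can presumably skip the full flow-table
 * lookup in lookup_flow_with_pkt(), and non-first IP fragments are
 * matched to the flow of their first fragment via fe_tx_frag_id below.
 */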
3588 KPKTQ_FOREACH(spkt, spktq) {
3589 if (n_pkts == 0) {
3590 break;
3591 }
3592 --n_pkts;
3593
3594 KPKTQ_DEQUEUE(&dpktq, pkt);
3595 ASSERT(pkt != NULL);
3596 err = dp_copy_to_dev(fsw, spkt, pkt);
3597 if (__improbable(err != 0)) {
3598 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
3599 DROPTAP_FLAG_L2_MISSING);
3600 continue;
3601 }
3602
3603 do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0);
3604 af = fsw_ip_demux(fsw, pkt);
3605 if (__improbable(af == AF_UNSPEC)) {
3606 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3607 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3608 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
3609 DROPTAP_FLAG_L2_MISSING);
3610 continue;
3611 }
3612
3613 err = flow_pkt_classify(pkt, ifp, af, false);
3614 if (__improbable(err != 0)) {
3615 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3616 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3617 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
3618 DROPTAP_FLAG_L2_MISSING);
3619 continue;
3620 }
3621
3622 if (__improbable(pkt->pkt_flow_ip_is_frag &&
3623 !pkt->pkt_flow_ip_is_first_frag)) {
3624 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3625 if (__probable(fe != NULL)) {
3626 flow_entry_retain(fe);
3627 goto flow_batch;
3628 } else {
3629 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3630 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
3631 DROPTAP_FLAG_L2_MISSING);
3632 continue;
3633 }
3634 }
3635
3636 fe = tx_lookup_flow(fsw, pkt, prev_fe);
3637 if (__improbable(fe == NULL)) {
3638 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3639 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
3640 DROPTAP_FLAG_L2_MISSING);
3641 prev_fe = NULL;
3642 continue;
3643 }
3644 flow_batch:
3645 tx_flow_batch_packet(&fes, fe, pkt);
3646 prev_fe = fe;
3647 }
3648
3649 struct flow_entry *tfe = NULL;
3650 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3651 tx_flow_process(fsw, fe, 0);
3652 TAILQ_REMOVE(&fes, fe, fe_tx_link);
3653 fe->fe_tx_is_cont_frag = false;
3654 fe->fe_tx_frag_id = 0;
3655 flow_entry_release(&fe);
3656 n_flows++;
3657 }
3658
3659 done:
3660 FSW_RUNLOCK(fsw);
3661 if (n_flows > 0) {
3662 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3663 }
3664 if (ifp != NULL) {
3665 fsw_datamov_end(fsw);
3666 }
3667 dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
3668 KPKTQ_FINI(&dropped_pkts);
3669 KPKTQ_FINI(&dpktq);
3670 }
3671
3672 static sa_family_t
3673 get_tso_af(struct __kern_packet *pkt)
3674 {
3675 packet_tso_flags_t tso_flags;
3676
3677 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3678 if (tso_flags == PACKET_TSO_IPV4) {
3679 return AF_INET;
3680 } else if (tso_flags == PACKET_TSO_IPV6) {
3681 return AF_INET6;
3682 } else {
3683 panic("invalid tso flags: 0x%x\n", tso_flags);
3684 /* NOTREACHED */
3685 __builtin_unreachable();
3686 }
3687 }
3688
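/*
 * Re-point the packet's flow metadata at the copied headers of this
 * GSO segment and record the per-segment TCP flags, sequence number
 * and payload length, so that later stages see values for this
 * segment rather than for the original large packet.
 */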
3689 static inline void
3690 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3691 {
3692 struct tcphdr *__single tcp = tcphdr;
3693
3694 DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3695 void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3696 pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3697 pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3698 pkt->pkt_flow_tcp_flags = tcp->th_flags;
3699 pkt->pkt_flow_tcp_seq = tcp->th_seq;
3700 pkt->pkt_flow_ulen = payload_sz;
3701 }
3702
3703 static int
3704 do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3705 struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3706 struct pktq *gso_pktq)
3707 {
3708 ifnet_t ifp = fsw->fsw_ifp;
3709 struct __kern_packet *pkt = first_pkt;
3710 uint8_t proto = pkt->pkt_flow_ip_proto;
3711 uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3712 uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3713 uint16_t total_hlen = ip_hlen + tcp_hlen;
3714 uint16_t mtu = (uint16_t)ifp->if_mtu;
3715 uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3716 uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3717 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3718 kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3719 uint8_t *orig_pkt_baddr;
3720 struct tcphdr *tcp;
3721 struct ip *ip;
3722 struct ip6_hdr *ip6;
3723 uint32_t tcp_seq;
3724 uint16_t ipid;
3725 uint32_t pseudo_hdr_csum, bufsz;
3726
3727 ASSERT(headroom <= UINT8_MAX);
3728 if (proto != IPPROTO_TCP) {
3729 SK_ERR("invalid proto: %d", proto);
3730 DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3731 fsw, ifnet_t, ifp, uint8_t, proto);
3732 return EINVAL;
3733 }
3734 if (mss == 0 || mss > (mtu - total_hlen)) {
3735 SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3736 mss, mtu, total_hlen);
3737 DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3738 fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3739 uint32_t, total_hlen);
3740 return EINVAL;
3741 }
3742 bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3743 if ((headroom + total_hlen + mss) > bufsz) {
3744 SK_ERR("invalid args: headroom %d, total_hlen %d, "
3745 "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3746 DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3747 fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3748 total_hlen, uint16_t, mss, uint32_t, bufsz);
3749 return EINVAL;
3750 }
3751 n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
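/*
 * Worked example (illustrative values): with total_len = 14652,
 * total_hlen = 52 and mss = 1460, the payload is 14600 bytes and
 * SK_ROUNDUP(14600, 1460) / 1460 = 10 segments.
 */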
3752
3753 ASSERT(pkt->pkt_headroom == headroom);
3754 ASSERT(pkt->pkt_length == total_len);
3755 ASSERT(pkt->pkt_l2_len == 0);
3756 ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3757 ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3758 pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3759 pkt->pkt_proto_seg_sz = 0;
3760 pkt->pkt_csum_flags = 0;
3761 MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3762 orig_pkt_baddr += orig_pkt->pkt_headroom;
3763
3764 if (af == AF_INET) {
3765 /*
3766 * XXX -fbounds-safety: can't avoid using forge unless we change
3767 * the flow metadata definition.
3768 */
3769 ip = __unsafe_forge_bidi_indexable(struct ip *,
3770 pkt->pkt_flow_ip_hdr, pkt->pkt_length);
3771 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3772 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3773 ipid = ip->ip_id;
3774 pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3775 pkt->pkt_flow_ipv4_dst.s_addr, 0);
3776 } else {
3777 ASSERT(af == AF_INET6);
3778 tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
3779 pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
3780 pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3781 &pkt->pkt_flow_ipv6_dst, 0);
3782 }
3783 tcp_seq = ntohl(tcp->th_seq);
3784
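/*
 * Segmentation loop: each iteration emits one segment of at most mss
 * payload bytes.  Headers are copied from the original packet (except
 * for the first segment, which already carries them), the payload
 * slice is copied with a running checksum by pkt_copypkt_sum(), and
 * the IP/TCP headers are then patched (ip_id, length, th_seq,
 * th_flags) before the TCP checksum is finalized.
 */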
3785 for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3786 off += payload_sz) {
3787 uint8_t *baddr, *baddr0;
3788 uint32_t partial;
3789
3790 if (pkt == NULL) {
3791 n++;
3792 KPKTQ_DEQUEUE(dev_pktq, pkt);
3793 ASSERT(pkt != NULL);
3794 }
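/*
 * The first segment reuses first_pkt, whose headers were already
 * copied by dp_copy_headers_to_dev(); later segments take a fresh
 * packet from dev_pktq and have the headers copied below (n != 1).
 */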
3795 MD_BUFLET_ADDR_ABS(pkt, baddr0);
3796 baddr = baddr0;
3797 baddr += headroom;
3798
3799 /* Copy headers from the original packet */
3800 if (n != 1) {
3801 ASSERT(pkt != first_pkt);
3802 pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3803 fsw_pkt_copy_metadata(first_pkt, pkt);
3804
3805 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3806 /* flow info still needs to be updated below */
3807 bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3808 sizeof(*pkt->pkt_flow));
3809 pkt->pkt_trace_id = 0;
3810 ASSERT(pkt->pkt_headroom == headroom);
3811 } else {
3812 METADATA_SET_LEN(pkt, 0, 0);
3813 }
3814 baddr += total_hlen;
3815
3816 /* Copy/checksum the payload from the original packet */
3817 if (off + payload_sz > total_len) {
3818 payload_sz = (uint16_t)(total_len - off);
3819 }
3820 pkt_copypkt_sum(orig_ph,
3821 (uint16_t)(orig_pkt->pkt_headroom + off),
3822 SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3823 &partial, TRUE);
3824
3825 DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3826 ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3827 uint16_t, mss, uint32_t, partial);
3828 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3829
3830 /*
3831 * Adjust header information and fill in the missing fields.
3832 */
3833 if (af == AF_INET) {
3834 ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3835 tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3836
3837 if (n != n_pkts) {
3838 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3839 }
3840 if (n != 1) {
3841 tcp->th_flags &= ~TH_CWR;
3842 tcp->th_seq = htonl(tcp_seq);
3843 }
3844 update_flow_info(pkt, ip, tcp, payload_sz);
3845
3846 ip->ip_id = htons((ipid)++);
3847 ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3848 ip->ip_sum = 0;
3849 ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
3850 tcp->th_sum = 0;
3851
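/*
 * "partial" holds the one's-complement sum of the payload computed by
 * pkt_copypkt_sum() above; fold in the TCP header, the pseudo-header
 * length/protocol word and the precomputed address pseudo-sum, then
 * carry-fold and invert to produce th_sum.
 */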
3852 partial = __packet_cksum(tcp, tcp_hlen, partial);
3853 partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3854 partial += pseudo_hdr_csum;
3855 ADDCARRY(partial);
3856 tcp->th_sum = ~(uint16_t)partial;
3857 } else {
3858 ASSERT(af == AF_INET6);
3859 ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
3860 tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3861
3862 if (n != n_pkts) {
3863 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3864 }
3865 if (n != 1) {
3866 tcp->th_flags &= ~TH_CWR;
3867 tcp->th_seq = htonl(tcp_seq);
3868 }
3869 update_flow_info(pkt, ip6, tcp, payload_sz);
3870
3871 ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3872 tcp->th_sum = 0;
3873 partial = __packet_cksum(tcp, tcp_hlen, partial);
3874 partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
3875 partial += pseudo_hdr_csum;
3876 ADDCARRY(partial);
3877 tcp->th_sum = ~(uint16_t)partial;
3878 }
3879 tcp_seq += payload_sz;
3880 METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3881 #if (DEVELOPMENT || DEBUG)
3882 struct __kern_buflet *bft;
3883 uint32_t blen;
3884 PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3885 blen = __buflet_get_data_length(bft);
3886 if (blen != total_hlen + payload_sz) {
3887 panic("blen (%d) != total_len + payload_sz (%d)\n",
3888 blen, total_hlen + payload_sz);
3889 }
3890 #endif /* DEVELOPMENT || DEBUG */
3891
3892 pkt->pkt_length = total_hlen + payload_sz;
3893 KPKTQ_ENQUEUE(gso_pktq, pkt);
3894 pkt = NULL;
3895
3896 /*
3897 * Note that at this point the packet is not yet finalized.
3898 * The finalization happens in dp_flow_tx_process() after
3899 * the framing is done.
3900 */
3901 }
3902 ASSERT(n == n_pkts);
3903 ASSERT(off == total_len);
3904 DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3905 uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3906 uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3907 return 0;
3908 }
3909
3910 static void
3911 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3912 struct pktq *gso_pktq)
3913 {
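/*
 * On the first GSO batch for this flow, link the entry onto the
 * caller's fes list and keep the reference taken by tx_lookup_flow().
 * On subsequent batches the entry is already linked, so the extra
 * reference is presumably dropped after appending the segments.
 */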
3914 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3915 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3916 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3917 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3918 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3919 KPKTQ_INIT(gso_pktq);
3920 } else {
3921 ASSERT(!TAILQ_EMPTY(fes));
3922 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3923 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3924 KPKTQ_INIT(gso_pktq);
3925 flow_entry_release(&fe);
3926 }
3927 }
3928
3929 static void
3930 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3931 uint32_t gso_pkts_estimate)
3932 {
3933 struct __kern_packet *spkt, *pkt;
3934 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3935 struct flow_entry *__single fe, *__single prev_fe;
3936 struct pktq dpktq;
3937 struct nexus_adapter *dev_na;
3938 struct kern_pbufpool *dev_pp;
3939 struct ifnet *ifp = NULL;
3940 sa_family_t af;
3941 uint32_t n_pkts, n_flows = 0;
3942 int err;
3943
3944 KPKTQ_INIT(&dpktq);
3945 n_pkts = KPKTQ_LEN(spktq);
3946
3947 FSW_RLOCK(fsw);
3948 if (__improbable(FSW_QUIESCED(fsw))) {
3949 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3950 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3951 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
3952 DROPTAP_FLAG_L2_MISSING);
3953 goto done;
3954 }
3955 dev_na = fsw->fsw_dev_ch->ch_na;
3956 if (__improbable(dev_na == NULL)) {
3957 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3958 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3959 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
3960 __LINE__, DROPTAP_FLAG_L2_MISSING);
3961 goto done;
3962 }
3963 ifp = fsw_datamov_begin(fsw);
3964 if (ifp == NULL) {
3965 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3966 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
3967 __LINE__, DROPTAP_FLAG_L2_MISSING);
3968 goto done;
3969 }
3970
3971 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3972
3973 /*
3974 * Batch allocate enough packets to perform GSO on all
3975 * packets in spktq.
3976 */
3977 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
3978 gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
3979 #if DEVELOPMENT || DEBUG
3980 if (__probable(err != ENOMEM)) {
3981 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3982 }
3983 #endif /* DEVELOPMENT || DEBUG */
3984 /*
3985 * We either get all packets or none. No partial allocations.
3986 */
3987 if (__improbable(err != 0)) {
3988 if (err == ENOMEM) {
3989 ASSERT(KPKTQ_EMPTY(&dpktq));
3990 } else {
3991 dp_free_pktq(fsw, &dpktq);
3992 }
3993 DTRACE_SKYWALK1(gso__no__mem, int, err);
3994 dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
3995 __LINE__, DROPTAP_FLAG_L2_MISSING);
3996 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3997 SK_ERR("failed to alloc %u pkts from device pool",
3998 gso_pkts_estimate);
3999 goto done;
4000 }
4001 prev_fe = NULL;
4002 KPKTQ_FOREACH(spkt, spktq) {
4003 KPKTQ_DEQUEUE(&dpktq, pkt);
4004 ASSERT(pkt != NULL);
4005 /*
4006 * Copy only headers to the first packet of the GSO chain.
4007 * The headers will be used for classification below.
4008 */
4009 err = dp_copy_headers_to_dev(fsw, spkt, pkt);
4010 if (__improbable(err != 0)) {
4011 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
4012 DROPTAP_FLAG_L2_MISSING);
4013 DTRACE_SKYWALK2(copy__headers__failed,
4014 struct nx_flowswitch *, fsw,
4015 struct __kern_packet *, spkt);
4016 continue;
4017 }
4018 af = get_tso_af(pkt);
4019 ASSERT(af == AF_INET || af == AF_INET6);
4020
4021 err = flow_pkt_classify(pkt, ifp, af, false);
4022 if (__improbable(err != 0)) {
4023 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
4024 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
4025 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
4026 DROPTAP_FLAG_L2_MISSING);
4027 DTRACE_SKYWALK4(classify__failed,
4028 struct nx_flowswitch *, fsw,
4029 struct __kern_packet *, spkt,
4030 struct __kern_packet *, pkt,
4031 int, err);
4032 continue;
4033 }
4034 /*
4035 * GSO cannot be done on a fragment and it's a bug in user
4036 * space to mark a fragment as needing GSO.
4037 */
4038 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
4039 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
4040 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4041 DROPTAP_FLAG_L2_MISSING);
4042 DTRACE_SKYWALK3(is__frag,
4043 struct nx_flowswitch *, fsw,
4044 struct __kern_packet *, spkt,
4045 struct __kern_packet *, pkt);
4046 continue;
4047 }
4048 fe = tx_lookup_flow(fsw, pkt, prev_fe);
4049 if (__improbable(fe == NULL)) {
4050 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
4051 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
4052 DROPTAP_FLAG_L2_MISSING);
4053 DTRACE_SKYWALK3(lookup__failed,
4054 struct nx_flowswitch *, fsw,
4055 struct __kern_packet *, spkt,
4056 struct __kern_packet *, pkt);
4057 prev_fe = NULL;
4058 continue;
4059 }
4060 /*
4061 * Perform GSO on spkt using the flow information
4062 * obtained above.
4063 */
4064 struct pktq gso_pktq;
4065 KPKTQ_INIT(&gso_pktq);
4066 err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
4067 if (__probable(err == 0)) {
4068 tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
4069 prev_fe = fe;
4070 } else {
4071 DTRACE_SKYWALK1(gso__error, int, err);
4072 /* TODO: increment error stat */
4073 dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
4074 DROPTAP_FLAG_L2_MISSING);
4075 flow_entry_release(&fe);
4076 prev_fe = NULL;
4077 }
4078 KPKTQ_FINI(&gso_pktq);
4079 }
4080 struct flow_entry *tfe = NULL;
4081 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
4082 /* Chain-enqueue can be used for GSO chains */
4083 tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
4084 TAILQ_REMOVE(&fes, fe, fe_tx_link);
4085 flow_entry_release(&fe);
4086 n_flows++;
4087 }
4088 done:
4089 FSW_RUNLOCK(fsw);
4090 if (n_flows > 0) {
4091 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
4092 }
4093 if (ifp != NULL) {
4094 fsw_datamov_end(fsw);
4095 }
4096
4097 /*
4098 * It's possible for packets to be left in dpktq because
4099 * gso_pkts_estimate is only an estimate. The actual number
4100 * of packets needed could be less.
4101 */
4102 uint32_t dpktq_len;
4103 if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
4104 DTRACE_SKYWALK2(leftover__dev__pkts,
4105 struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
4106 dp_free_pktq(fsw, &dpktq);
4107 }
4108 KPKTQ_FINI(&dpktq);
4109 }
4110
4111 static inline void
4112 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4113 struct proc *p)
4114 {
4115 #pragma unused(p)
4116 uint32_t total_pkts = 0, total_bytes = 0;
4117
4118 for (;;) {
4119 struct pktq pktq;
4120 KPKTQ_INIT(&pktq);
4121 uint32_t n_bytes;
4122 fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4123 if (n_bytes == 0) {
4124 break;
4125 }
4126 total_pkts += KPKTQ_LEN(&pktq);
4127 total_bytes += n_bytes;
4128
4129 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4130 fsw_receive(fsw, &pktq);
4131 } else {
4132 fsw_dev_input_netem_enqueue(fsw, &pktq);
4133 }
4134 KPKTQ_FINI(&pktq);
4135 }
4136
4137 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4138 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4139 uint32_t, total_bytes);
4140
4141 /* compute mitigation rate for delivered traffic */
4142 if (__probable(r->ckr_netif_mit_stats != NULL)) {
4143 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4144 }
4145 }
4146
4147 static inline void
4148 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4149 struct proc *p)
4150 {
4151 #pragma unused(p)
4152 static packet_trace_id_t trace_id = 0;
4153 uint32_t total_pkts = 0, total_bytes = 0;
4154
4155 for (;;) {
4156 struct pktq pktq;
4157 KPKTQ_INIT(&pktq);
4158 uint32_t n_bytes;
4159 uint32_t gso_pkts_estimate = 0;
4160
4161 fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4162 &gso_pkts_estimate);
4163 if (n_bytes == 0) {
4164 break;
4165 }
4166 total_pkts += KPKTQ_LEN(&pktq);
4167 total_bytes += n_bytes;
4168
4169 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
4170 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4171 KPKTQ_FIRST(&pktq)->pkt_trace_id);
4172
4173 if (gso_pkts_estimate > 0) {
4174 dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4175 } else {
4176 dp_tx_pktq(fsw, &pktq);
4177 }
4178 dp_free_pktq(fsw, &pktq);
4179 KPKTQ_FINI(&pktq);
4180 }
4181 kr_update_stats(r, total_pkts, total_bytes);
4182
4183 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4184 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4185 uint32_t, total_bytes);
4186 }
4187
4188 void
4189 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4190 struct proc *p)
4191 {
4192 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4193
4194 ASSERT(sk_is_sync_protected());
4195 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4196 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4197
4198 if (vpna->vpna_nx_port == FSW_VP_DEV) {
4199 fsw_dev_ring_flush(fsw, r, p);
4200 } else {
4201 fsw_user_ring_flush(fsw, r, p);
4202 }
4203 }
4204
4205 int
4206 fsw_dp_ctor(struct nx_flowswitch *fsw)
4207 {
4208 uint32_t fe_cnt = fsw_fe_table_size;
4209 uint32_t fob_cnt = fsw_flow_owner_buckets;
4210 uint32_t frb_cnt = fsw_flow_route_buckets;
4211 uint32_t frib_cnt = fsw_flow_route_id_buckets;
4212 struct kern_nexus *nx = fsw->fsw_nx;
4213 char name[64];
4214 const char *__null_terminated fsw_name = NULL;
4215 int error = 0;
4216
4217 /* just in case */
4218 if (fe_cnt == 0) {
4219 fe_cnt = NX_FSW_FE_TABLESZ;
4220 ASSERT(fe_cnt != 0);
4221 }
4222 if (fob_cnt == 0) {
4223 fob_cnt = NX_FSW_FOB_HASHSZ;
4224 ASSERT(fob_cnt != 0);
4225 }
4226 if (frb_cnt == 0) {
4227 frb_cnt = NX_FSW_FRB_HASHSZ;
4228 ASSERT(frb_cnt != 0);
4229 }
4230 if (frib_cnt == 0) {
4231 frib_cnt = NX_FSW_FRIB_HASHSZ;
4232 ASSERT(frib_cnt != 0);
4233 }
4234
4235 /* make sure fe_cnt is a power of two, else round up */
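/*
 * Illustrative: the decrement plus OR-folding below smears the highest
 * set bit into every lower bit position, so adding one yields the next
 * power of two (e.g. 5000 -> 4999 -> 0x1fff -> 8192); exact powers of
 * two are left untouched by the guard.
 */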
4236 if ((fe_cnt & (fe_cnt - 1)) != 0) {
4237 fe_cnt--;
4238 fe_cnt |= (fe_cnt >> 1);
4239 fe_cnt |= (fe_cnt >> 2);
4240 fe_cnt |= (fe_cnt >> 4);
4241 fe_cnt |= (fe_cnt >> 8);
4242 fe_cnt |= (fe_cnt >> 16);
4243 fe_cnt++;
4244 }
4245
4246 /* make sure frb_cnt is a power of two, else round up */
4247 if ((frb_cnt & (frb_cnt - 1)) != 0) {
4248 frb_cnt--;
4249 frb_cnt |= (frb_cnt >> 1);
4250 frb_cnt |= (frb_cnt >> 2);
4251 frb_cnt |= (frb_cnt >> 4);
4252 frb_cnt |= (frb_cnt >> 8);
4253 frb_cnt |= (frb_cnt >> 16);
4254 frb_cnt++;
4255 }
4256
4257 lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4258 &nexus_lock_attr);
4259 lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4260 lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4261 TAILQ_INIT(&fsw->fsw_linger_head);
4262
4263 fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4264 error = nx_advisory_alloc(nx, fsw_name,
4265 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4266 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4267 if (error != 0) {
4268 fsw_dp_dtor(fsw);
4269 return error;
4270 }
4271
4272 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4273 if (fsw->fsw_flow_mgr == NULL) {
4274 fsw_dp_dtor(fsw);
4275 return error;
4276 }
4277
4278 /* generic name; will be customized upon ifattach */
4279 (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4280 FSW_REAP_THREADNAME, name, "");
4281
4282 if (kernel_thread_start(fsw_reap_thread_func, fsw,
4283 &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4284 panic_plain("%s: can't create thread", __func__);
4285 /* NOTREACHED */
4286 __builtin_unreachable();
4287 }
4288 /* this must not fail */
4289 VERIFY(fsw->fsw_reap_thread != NULL);
4290
4291 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
4292
4293
4294 return error;
4295 }
4296
4297 void
4298 fsw_dp_dtor(struct nx_flowswitch *fsw)
4299 {
4300 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
4301 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
4302 uint32_t i = 0;
4303
4304 #if (DEVELOPMENT || DEBUG)
4305 if (fsw->fsw_rps_threads != NULL) {
4306 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4307 fsw_rps_thread_join(fsw, i);
4308 }
4309 kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
4310 fsw->fsw_rps_threads);
4311 }
4312 #endif /* !DEVELOPMENT && !DEBUG */
4313
4314 nx_advisory_free(fsw->fsw_nx);
4315
4316 if (fsw->fsw_reap_thread != THREAD_NULL) {
4317 /* signal thread to begin self-termination */
4318 lck_mtx_lock(&fsw->fsw_reap_lock);
4319 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4320
4321 /*
4322 * And wait for thread to terminate; use another
4323 * wait channel here other than fsw_reap_flags to
4324 * make it more explicit. In the event the reaper
4325 * thread misses a wakeup, we'll try again once
4326 * every second (except for the first time).
4327 */
4328 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4329 uint64_t t = 0;
4330
4331 nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4332 clock_absolutetime_interval_to_deadline(t, &t);
4333 ASSERT(t != 0);
4334
4335 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4336 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4337 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4338 }
4339 (void) assert_wait_deadline(&fsw->fsw_reap_thread,
4340 THREAD_UNINT, t);
4341 lck_mtx_unlock(&fsw->fsw_reap_lock);
4342 thread_block(THREAD_CONTINUE_NULL);
4343 lck_mtx_lock(&fsw->fsw_reap_lock);
4344 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4345 }
4346 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4347 lck_mtx_unlock(&fsw->fsw_reap_lock);
4348 fsw->fsw_reap_thread = THREAD_NULL;
4349 }
4350
4351 /* free any remaining flow entries in the linger list */
4352 fsw_linger_purge(fsw);
4353
4354 if (fsw->fsw_flow_mgr != NULL) {
4355 flow_mgr_destroy(fsw->fsw_flow_mgr);
4356 fsw->fsw_flow_mgr = NULL;
4357 }
4358
4359
4360 lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4361 lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4362 lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4363 }
4364
4365 void
4366 fsw_linger_insert(struct flow_entry *fe)
4367 {
4368 struct nx_flowswitch *fsw = fe->fe_fsw;
4369 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4370 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4371 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4372 fe->fe_flags, FLOWENTF_BITS);
4373
4374 net_update_uptime();
4375
4376 ASSERT(flow_entry_refcnt(fe) >= 1);
4377 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4378 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4379 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4380 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4381 ASSERT(fe->fe_linger_wait != 0);
4382 fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
4383 os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4384
4385 lck_mtx_lock_spin(&fsw->fsw_linger_lock);
4386 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4387 fsw->fsw_linger_cnt++;
4388 VERIFY(fsw->fsw_linger_cnt != 0);
4389 lck_mtx_unlock(&fsw->fsw_linger_lock);
4390
4391 fsw_reap_sched(fsw);
4392 }
4393
4394 static void
4395 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4396 struct flow_entry *fe)
4397 {
4398 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4399 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4400 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4401 fe->fe_flags, FLOWENTF_BITS);
4402
4403 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4404 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4405 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4406 os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4407
4408 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4409 flow_entry_release(&fe);
4410 }
4411
4412 static void
4413 fsw_linger_remove(struct flow_entry *fe)
4414 {
4415 struct nx_flowswitch *fsw = fe->fe_fsw;
4416
4417 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4418
4419 fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4420 VERIFY(fsw->fsw_linger_cnt != 0);
4421 fsw->fsw_linger_cnt--;
4422 }
4423
4424 void
4425 fsw_linger_purge(struct nx_flowswitch *fsw)
4426 {
4427 struct flow_entry *fe, *tfe;
4428
4429 lck_mtx_lock(&fsw->fsw_linger_lock);
4430 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4431 fsw_linger_remove(fe);
4432 }
4433 ASSERT(fsw->fsw_linger_cnt == 0);
4434 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4435 lck_mtx_unlock(&fsw->fsw_linger_lock);
4436 }
4437
4438 static void
4439 fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
4440 {
4441 struct kern_nexus *nx;
4442 uint64_t now = _net_uptime;
4443
4444 nx = fsw->fsw_nx;
4445
4446 /* Walk through all channels and check for Rx stall condition */
4447 /* uncrustify doesn't handle C blocks properly */
4448 /* BEGIN IGNORE CODESTYLE */
4449 nx_port_foreach(nx, ^(nexus_port_t nxport) {
4450 struct nexus_adapter *na = nx_port_get_na(nx, nxport);
4451 uint64_t elapsed, enqueue_ts, dequeue_ts;
4452 struct __kern_channel_ring *ring;
4453 struct kern_channel *ch;
4454 struct proc *p;
4455
4456 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4457 return;
4458 }
4459 ch = (struct kern_channel *)na->na_private;
4460 if (ch == NULL) {
4461 return;
4462 }
4463 ring = KR_SINGLE(na->na_rx_rings);
4464 enqueue_ts = ring->ckr_rx_enqueue_ts;
4465 dequeue_ts = ring->ckr_rx_dequeue_ts;
4466 /* Elapsed time since last Rx enqueue */
4467 elapsed = now - enqueue_ts;
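/*
 * A stall is flagged when packets were enqueued to the user channel
 * after the last dequeue (dequeue_ts < enqueue_ts) and have been
 * sitting there longer than fsw_rx_stall_thresh seconds.
 */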
4468 if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
4469 p = proc_find(ch->ch_pid);
4470 if (p == NULL) {
4471 return;
4472 }
4473 if (fsw_rx_stall_defunct) {
4474 kern_channel_defunct(p, ch);
4475 }
4476 proc_rele(p);
4477 DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
4478 struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
4479 FSW_STATS_INC(FSW_STATS_RX_STALL);
4480 SK_ERR("Rx stall detected in proc %s(%llu) (%s): "
4481 "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
4482 "defunct: %s",
4483 ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
4484 elapsed, now, enqueue_ts, dequeue_ts,
4485 fsw_rx_stall_defunct ? "yes" : "no");
4486 }
4487 });
4488 /* END IGNORE CODESTYLE */
4489 }
4490
4491 void
4492 fsw_reap_sched(struct nx_flowswitch *fsw)
4493 {
4494 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4495 lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4496 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4497 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4498 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4499 }
4500 lck_mtx_unlock(&fsw->fsw_reap_lock);
4501 }
4502
4503 __attribute__((noreturn))
4504 static void
4505 fsw_reap_thread_func(void *v, wait_result_t w)
4506 {
4507 #pragma unused(w)
4508 struct nx_flowswitch *__single fsw = v;
4509
4510 ASSERT(fsw->fsw_reap_thread == current_thread());
4511 /*
4512 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
4513 * checks to ensure source contains the null terminator, by doing a
4514 * linear scan of the string.
4515 */
4516 thread_set_thread_name(current_thread(),
4517 __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));
4518
4519 net_update_uptime();
4520
4521 lck_mtx_lock(&fsw->fsw_reap_lock);
4522 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4523 (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
4524 lck_mtx_unlock(&fsw->fsw_reap_lock);
4525 thread_block_parameter(fsw_reap_thread_cont, fsw);
4526 /* NOTREACHED */
4527 __builtin_unreachable();
4528 }
4529
4530 __attribute__((noreturn))
4531 static void
4532 fsw_reap_thread_cont(void *v, wait_result_t wres)
4533 {
4534 struct nx_flowswitch *__single fsw = v;
4535 boolean_t low;
4536 uint64_t t = 0;
4537
4538 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4539
4540 lck_mtx_lock(&fsw->fsw_reap_lock);
4541 if (__improbable(wres == THREAD_INTERRUPTED ||
4542 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4543 goto terminate;
4544 }
4545
4546 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4547 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4548 lck_mtx_unlock(&fsw->fsw_reap_lock);
4549
4550 net_update_uptime();
4551
4552 /* prevent detach from happening while we're here */
4553 if (!fsw_detach_barrier_add(fsw)) {
4554 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4555 t = 0;
4556 } else {
4557 uint32_t fe_nonviable, fe_freed, fe_aborted;
4558 uint32_t fr_freed, fr_resid = 0;
4559 struct ifnet *ifp = fsw->fsw_ifp;
4560 uint64_t i = FSW_REAP_IVAL;
4561 uint64_t now = _net_uptime;
4562 uint64_t last;
4563
4564 ASSERT(fsw->fsw_ifp != NULL);
4565
4566 /*
4567 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4568 */
4569 fe_nonviable = fsw_process_deferred(fsw);
4570
4571 /*
4572 * Pass 2: remove any expired lingering flows.
4573 */
4574 fe_freed = fsw_process_linger(fsw, &fe_aborted);
4575
4576 /*
4577 * Pass 3: prune idle flow routes.
4578 */
4579 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4580 ifp, &fr_resid);
4581
4582 /*
4583 * Pass 4: prune flow table
4584 *
4585 */
4586 cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4587
4588 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4589 "fe_aborted %u fr_freed %u/%u",
4590 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4591 (fe_nonviable + fsw->fsw_pending_nonviable),
4592 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
4593 (fr_freed + fr_resid));
4594
4595 /* see if VM memory level is critical */
4596 low = skmem_lowmem_check();
4597
4598 /*
4599 * If things appear to be idle, we can prune away cached
4600 * objects that have fallen out of the working sets (this
4601 * is different than purging). Every once in a while, we
4602 * also purge the caches. Note that this is done across
4603 * all flowswitch instances, and so we limit this to no
4604 * more than once every FSW_REAP_SK_THRES seconds.
4605 */
4606 last = os_atomic_load(&fsw_reap_last, relaxed);
4607 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4608 os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4609 fsw_purge_cache(fsw, low);
4610
4611 /* increase sleep interval if idle */
4612 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4613 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4614 i <<= 3;
4615 }
4616 } else if (last == 0) {
4617 os_atomic_store(&fsw_reap_last, now, release);
4618 }
4619
4620 /*
4621 * Additionally, run thru the list of channels and prune
4622 * or purge away cached objects on "idle" channels. This
4623 * check is rate limited to no more than once every
4624 * FSW_DRAIN_CH_THRES seconds.
4625 */
4626 last = fsw->fsw_drain_channel_chk_last;
4627 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4628 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4629 fsw->fsw_flow_mgr->fm_name);
4630
4631 fsw->fsw_drain_channel_chk_last = now;
4632 fsw_drain_channels(fsw, now, low);
4633 } else if (__improbable(last == 0)) {
4634 fsw->fsw_drain_channel_chk_last = now;
4635 }
4636
4637 /*
4638 * Finally, invoke the interface's reap callback to
4639 * tell it to prune or purge away cached objects if
4640 * it is idle. This check is rate limited to no more
4641 * than once every FSW_REAP_IF_THRES seconds.
4642 */
4643 last = fsw->fsw_drain_netif_chk_last;
4644 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4645 ASSERT(fsw->fsw_nifna != NULL);
4646
4647 if (ifp->if_na_ops != NULL &&
4648 ifp->if_na_ops->ni_reap != NULL) {
4649 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4650 fsw->fsw_flow_mgr->fm_name);
4651 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4652 FSW_REAP_IF_THRES, low);
4653 }
4654
4655 fsw->fsw_drain_netif_chk_last = now;
4656 } else if (__improbable(last == 0)) {
4657 fsw->fsw_drain_netif_chk_last = now;
4658 }
4659
4660 /* emit periodic interface stats ktrace */
4661 last = fsw->fsw_reap_last;
4662 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4663 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4664 ifp->if_data.ifi_ibytes * 8,
4665 ifp->if_data.ifi_opackets,
4666 ifp->if_data.ifi_obytes * 8);
4667
4668 fsw->fsw_reap_last = now;
4669 } else if (__improbable(last == 0)) {
4670 fsw->fsw_reap_last = now;
4671 }
4672
4673 /* Check for Rx stall condition every NX_FSW_RX_STALL_THRES seconds */
4674 last = fsw->fsw_rx_stall_chk_last;
4675 if (last != 0 && (now - last) >= NX_FSW_RX_STALL_THRES) {
4676 fsw_defunct_rx_stall_channel(fsw);
4677 fsw->fsw_rx_stall_chk_last = now;
4678 } else if (__improbable(last == 0)) {
4679 fsw->fsw_rx_stall_chk_last = now;
4680 }
4681
4682 nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4683 clock_absolutetime_interval_to_deadline(t, &t);
4684 ASSERT(t != 0);
4685
4686 /* allow any pending detach to proceed */
4687 fsw_detach_barrier_remove(fsw);
4688 }
4689
4690 lck_mtx_lock(&fsw->fsw_reap_lock);
4691 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4692 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4693 (void) assert_wait_deadline(&fsw->fsw_reap_flags,
4694 THREAD_UNINT, t);
4695 lck_mtx_unlock(&fsw->fsw_reap_lock);
4696 thread_block_parameter(fsw_reap_thread_cont, fsw);
4697 /* NOTREACHED */
4698 __builtin_unreachable();
4699 } else {
4700 terminate:
4701 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4702 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4703 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4704 /*
4705 * And signal any thread waiting for us to terminate;
4706 * we use a wait channel other than fsw_reap_flags here
4707 * to make it more explicit.
4708 */
4709 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4710 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4711 }
4712 lck_mtx_unlock(&fsw->fsw_reap_lock);
4713
4714 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4715
4716 /* for the extra refcnt from kernel_thread_start() */
4717 thread_deallocate(current_thread());
4718 /* this is the end */
4719 thread_terminate(current_thread());
4720 /* NOTREACHED */
4721 __builtin_unreachable();
4722 }
4723
4724 /* must never get here */
4725 VERIFY(0);
4726 /* NOTREACHED */
4727 __builtin_unreachable();
4728 }
4729
4730 static void
4731 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4732 {
4733 struct kern_nexus *nx = fsw->fsw_nx;
4734
4735 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4736 FSW_RLOCK(fsw);
4737
4738 /* uncrustify doesn't handle C blocks properly */
4739 /* BEGIN IGNORE CODESTYLE */
4740 nx_port_foreach(nx, ^(nexus_port_t p) {
4741 struct nexus_adapter *na = nx_port_get_na(nx, p);
4742 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4743 return;
4744 }
4745
4746 boolean_t purge;
4747
4748 /*
4749 * If some activity happened in the last FSW_DRAIN_CH_THRES
4750 * seconds on this channel, we reclaim memory if the channel
4751 * throughput is less than the reap threshold value.
4752 */
4753 if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4754 struct __kern_channel_ring *__single ring;
4755 channel_ring_stats *stats;
4756 uint64_t bps;
4757
4758 ring = KR_SINGLE(na->na_rx_rings);
4759 stats = &ring->ckr_stats;
4760 bps = stats->crs_bytes_per_second;
4761
4762 if (bps < fsw_channel_reap_thresh) {
4763 purge = FALSE;
4764 na_drain(na, purge);
4765 }
4766 return;
4767 }
4768
4769 /*
4770 * If NA has been inactive for some time (twice the drain
4771 * threshold), we clear the work timestamp to temporarily skip
4772 * this channel until it's active again. Purging cached objects
4773 * can be expensive since we'd need to allocate and construct
4774 * them again, so we do it only when necessary.
4775 */
4776 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4777 na->na_work_ts = 0;
4778 purge = TRUE;
4779 } else {
4780 purge = FALSE;
4781 }
4782
4783 na_drain(na, purge); /* purge/prune caches */
4784 });
4785 /* END IGNORE CODESTYLE */
4786
4787 FSW_RUNLOCK(fsw);
4788 }
4789
4790 static void
4791 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4792 {
4793 #pragma unused(fsw)
4794 uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4795 uint32_t p = fsw_flow_purge_thresh;
4796 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4797
4798 SK_DF(SK_VERB_FLOW, "%s: %s caches",
4799 fsw->fsw_flow_mgr->fm_name,
4800 (purge ? "purge" : "prune"));
4801
4802 skmem_cache_reap_now(sk_fo_cache, purge);
4803 skmem_cache_reap_now(sk_fe_cache, purge);
4804 skmem_cache_reap_now(sk_fab_cache, purge);
4805 skmem_cache_reap_now(flow_route_cache, purge);
4806 skmem_cache_reap_now(flow_stats_cache, purge);
4807 netns_reap_caches(purge);
4808 skmem_reap_caches(purge);
4809
4810 #if CONFIG_MBUF_MCACHE
4811 if (if_is_fsw_transport_netagent_enabled() && purge) {
4812 mbuf_drain(FALSE);
4813 }
4814 #endif /* CONFIG_MBUF_MCACHE */
4815 }
4816
4817 static void
4818 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4819 {
4820 /* When the interface is in low power mode, the flow is nonviable */
4821 if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4822 os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4823 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4824 }
4825 }
4826
4827 static uint32_t
4828 fsw_process_deferred(struct nx_flowswitch *fsw)
4829 {
4830 struct flow_entry_dead sfed __sk_aligned(8);
4831 struct flow_mgr *fm = fsw->fsw_flow_mgr;
4832 struct flow_entry_dead *fed, *tfed;
4833 LIST_HEAD(, flow_entry_dead) fed_head =
4834 LIST_HEAD_INITIALIZER(fed_head);
4835 uint32_t i, nonviable = 0;
4836 boolean_t lowpowermode = FALSE;
4837
4838 bzero(&sfed, sizeof(sfed));
4839
4840 /*
4841 * The flows become nonviable when the interface
4842 * is in low power mode (edge trigger)
4843 */
4844 if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4845 fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4846 lowpowermode = TRUE;
4847 fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4848 }
4849
4850 /*
4851 * Scan thru the flow entry tree, and commit any pending withdraw or
4852 * nonviable requests. We may need to push stats and/or unassign the
4853 * nexus from NECP, but we cannot do that while holding the locks;
4854 * build a temporary list for those entries.
4855 */
4856 for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
4857 struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
4858 struct flow_owner *fo;
4859
4860 /*
4861 * Grab the lock at all costs when handling low power mode
4862 */
4863 if (__probable(!lowpowermode)) {
4864 if (!FOB_TRY_LOCK(fob)) {
4865 continue;
4866 }
4867 } else {
4868 FOB_LOCK(fob);
4869 }
4870
4871 FOB_LOCK_ASSERT_HELD(fob);
4872 RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
4873 struct flow_entry *fe;
4874
4875 RB_FOREACH(fe, flow_entry_id_tree,
4876 &fo->fo_flow_entry_id_head) {
4877 /* try first as reader; skip if we can't */
4878 if (__improbable(lowpowermode)) {
4879 fsw_flow_handle_low_power(fsw, fe);
4880 }
4881 if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
4882 os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
4883 flow_namespace_half_close(&fe->fe_port_reservation);
4884 }
4885
4886 /* if not withdrawn/nonviable, skip */
4887 if (!fe->fe_want_withdraw &&
4888 !fe->fe_want_nonviable) {
4889 continue;
4890 }
4891 /*
4892 * Here we're holding the lock as writer;
4893 * don't spend too much time as we're
4894 * blocking the data path now.
4895 */
4896 ASSERT(!uuid_is_null(fe->fe_uuid));
4897 /* only need flow UUID and booleans */
4898 uuid_copy(sfed.fed_uuid, fe->fe_uuid);
4899 sfed.fed_want_clonotify =
4900 (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
4901 sfed.fed_want_nonviable = fe->fe_want_nonviable;
4902 flow_entry_teardown(fo, fe);
4903
4904 /* do this outside the flow bucket lock */
4905 fed = flow_entry_dead_alloc(Z_WAITOK);
4906 ASSERT(fed != NULL);
4907 *fed = sfed;
4908 LIST_INSERT_HEAD(&fed_head, fed, fed_link);
4909 }
4910 }
4911 FOB_UNLOCK(fob);
4912 }
4913
4914 /*
4915 * These nonviable flows are no longer useful since we've lost
4916 * the source IP address; in the event the client monitors the
4917 * viability of the flow, explicitly mark it as nonviable so
4918 * that a new flow can be created.
4919 */
4920 LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
4921 LIST_REMOVE(fed, fed_link);
4922 ASSERT(fsw->fsw_agent_session != NULL);
4923
4924 /* if flow is closed early */
4925 if (fed->fed_want_clonotify) {
4926 necp_client_early_close(fed->fed_uuid);
4927 }
4928
4929 /* if nonviable, unassign nexus attributes */
4930 if (fed->fed_want_nonviable) {
4931 (void) netagent_assign_nexus(fsw->fsw_agent_session,
4932 fed->fed_uuid, NULL, 0);
4933 }
4934
4935 flow_entry_dead_free(fed);
4936 ++nonviable;
4937 }
4938 ASSERT(LIST_EMPTY(&fed_head));
4939
4940 return nonviable;
4941 }
4942
4943 static uint32_t
4944 fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
4945 {
4946 struct flow_entry_linger_head linger_head =
4947 TAILQ_HEAD_INITIALIZER(linger_head);
4948 struct flow_entry *fe, *tfe;
4949 uint64_t now = _net_uptime;
4950 uint32_t i = 0, cnt = 0, freed = 0;
4951
4952 ASSERT(fsw->fsw_ifp != NULL);
4953 ASSERT(abort != NULL);
4954 *abort = 0;
4955
4956 /*
4957 * We don't want to contend with the datapath, so move
4958 * everything that's in the linger list into a local list.
4959 * This allows us to generate RSTs or free the flow entry
4960 * outside the lock. Any remaining flow entry in the local
4961 * list will get re-added back to the head of the linger
4962 * list, in front of any new ones added since then.
4963 */
4964 lck_mtx_lock(&fsw->fsw_linger_lock);
4965 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4966 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4967 cnt = fsw->fsw_linger_cnt;
4968 fsw->fsw_linger_cnt = 0;
4969 lck_mtx_unlock(&fsw->fsw_linger_lock);
4970
4971 TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
4972 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4973 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4974 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4975
4976 /*
4977 * See if this is a TCP flow that needs to generate
4978 * a RST to the remote peer (if not already).
4979 */
4980 if (flow_track_tcp_want_abort(fe)) {
4981 VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
4982 ASSERT(!uuid_is_null(fe->fe_uuid));
4983 flow_track_abort_tcp(fe, NULL, NULL);
4984 (*abort)++;
4985 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4986 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
4987 "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
4988 sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
4989 FLOWENTF_BITS);
4990 }
4991
4992 /*
4993 * If flow has expired, remove from list and free;
4994 * otherwise leave it around in the linger list.
4995 */
4996 if (fe->fe_linger_expire <= now) {
4997 freed++;
4998 fsw_linger_remove_internal(&linger_head, fe);
4999 fe = NULL;
5000 }
5001 ++i;
5002 }
5003 VERIFY(i == cnt && cnt >= freed);
5004
5005 /*
5006 * Add any remaining ones back into the linger list.
5007 */
5008 lck_mtx_lock(&fsw->fsw_linger_lock);
5009 if (!TAILQ_EMPTY(&linger_head)) {
5010 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
5011 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
5012 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
5013 TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
5014 fsw->fsw_linger_cnt += (cnt - freed);
5015 }
5016 ASSERT(TAILQ_EMPTY(&linger_head));
5017 lck_mtx_unlock(&fsw->fsw_linger_lock);
5018
5019 return freed;
5020 }
5021
5022 __attribute__((always_inline))
5023 static inline void
5024 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
5025 {
5026 switch (__packet_get_traffic_class(ph)) {
5027 case PKT_TC_BE:
5028 ifp->if_tc.ifi_ibepackets++;
5029 ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5030 break;
5031 case PKT_TC_BK:
5032 ifp->if_tc.ifi_ibkpackets++;
5033 ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5034 break;
5035 case PKT_TC_VI:
5036 ifp->if_tc.ifi_ivipackets++;
5037 ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5038 break;
5039 case PKT_TC_VO:
5040 ifp->if_tc.ifi_ivopackets++;
5041 ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5042 break;
5043 default:
5044 break;
5045 }
5046 }
5047
5048 __attribute__((always_inline))
5049 static inline void
5050 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5051 uint32_t cnt, uint32_t len)
5052 {
5053 switch (svc) {
5054 case PKT_TC_BE:
5055 ifp->if_tc.ifi_obepackets += cnt;
5056 ifp->if_tc.ifi_obebytes += len;
5057 break;
5058 case PKT_TC_BK:
5059 ifp->if_tc.ifi_obkpackets += cnt;
5060 ifp->if_tc.ifi_obkbytes += len;
5061 break;
5062 case PKT_TC_VI:
5063 ifp->if_tc.ifi_ovipackets += cnt;
5064 ifp->if_tc.ifi_ovibytes += len;
5065 break;
5066 case PKT_TC_VO:
5067 ifp->if_tc.ifi_ovopackets += cnt;
5068 ifp->if_tc.ifi_ovobytes += len;
5069 break;
5070 default:
5071 break;
5072 }
5073 }
5074