xref: /xnu-11215.1.10/bsd/skywalk/nexus/flowswitch/fsw_dp.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 /*
55  *  BSD LICENSE
56  *
57  * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58  *  All rights reserved.
59  *
60  * Redistribution and use in source and binary forms, with or without
61  *  modification, are permitted provided that the following conditions
62  *  are met:
63  *
64  *    * Redistributions of source code must retain the above copyright
65  *      notice, this list of conditions and the following disclaimer.
66  *    * Redistributions in binary form must reproduce the above copyright
67  *      notice, this list of conditions and the following disclaimer in
68  *      the documentation and/or other materials provided with the
69  *      distribution.
70  *    * Neither the name of NEC Europe Ltd. nor the names of
71  *      its contributors may be used to endorse or promote products derived
72  *      from this software without specific prior written permission.
73  *
74  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85  */
86 
87 #include <skywalk/os_skywalk_private.h>
88 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89 #include <skywalk/nexus/flowswitch/fsw_var.h>
90 #include <skywalk/nexus/netif/nx_netif.h>
91 #include <skywalk/nexus/netif/nx_netif_compat.h>
92 #include <kern/sched_prim.h>
93 #include <sys/kdebug.h>
94 #include <sys/sdt.h>
95 #include <net/bpf.h>
96 #include <net/if_ports_used.h>
97 #include <net/pktap.h>
98 #include <net/droptap.h>
99 #include <net/pktsched/pktsched_netem.h>
100 #include <netinet/tcp.h>
101 #include <netinet/udp.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/in_var.h>
105 
106 extern kern_return_t thread_terminate(thread_t);
107 
108 #define FSW_ZONE_MAX                  256
109 #define FSW_ZONE_NAME                 "skywalk.nx.fsw"
110 
111 static uint64_t fsw_reap_last __sk_aligned(8);
112 static uint64_t fsw_want_purge __sk_aligned(8);
113 
114 #define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
115 static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
116 
117 #define NX_FSW_FOB_HASHSZ       31      /* some mersenne prime */
118 static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
119 
120 #define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
121 static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
122 
123 #define NX_FSW_FRIB_HASHSZ      13      /* some mersenne prime */
124 static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
125 
126 #define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
127 static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
128 
129 #define NX_FSW_RX_STALL_THRES   10       /* seconds */
130 static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES;
131 
132 #define NX_FSW_RX_STALL_DEFUNCT 1       /* defunct Rx-stalled channel (0 = disable) */
133 static uint32_t fsw_rx_stall_defunct = NX_FSW_RX_STALL_DEFUNCT;
134 
135 #define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
136 static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
137 
138 #define FSW_REAP_IVAL            (MAX(1, fsw_flow_reap_interval))
139 #define FSW_REAP_SK_THRES        (FSW_REAP_IVAL << 5)
140 #define FSW_REAP_IF_THRES        (FSW_REAP_IVAL << 5)
141 #define FSW_DRAIN_CH_THRES       (FSW_REAP_IVAL << 5)
142 #define FSW_IFSTATS_THRES        1
143 
144 #define NX_FSW_CHANNEL_REAP_THRES 1000  /* threshold (bytes/sec) for reaping*/
145 uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
146 
147 #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
148 
149 uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
150 uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
151 uint32_t fsw_gso_batch = 8;
152 #if (DEVELOPMENT || DEBUG)
153 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
154     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
155     "flowswitch Rx batch size");
156 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
157     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
158     "flowswitch Tx batch size");
159 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
160     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
161     "flowswitch GSO batch size");
162 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
163     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
164     "flowswitch channel reap threshold throughput (bytes/sec)");
165 #endif /* !DEVELOPMENT && !DEBUG */
166 
167 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
168     CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
169     "flowswitch RX aggregation for tcp flows (enable/disable)");
170 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
171     CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
172     "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
173 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
174     CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
175     "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
176 
177 /*
178  * IP reassembly
179  * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
180  * enable/disable the reassembly routine regardless of whether the
181  * transport netagent is enabled or not.
182  *
183  * 'fsw_ip_reass' is a tri-state:
184  *    0 means force IP reassembly off
185  *    1 means force IP reassembly on
186  *    2 means don't force the value, use what's appropriate for this flowswitch
187  */
188 #define FSW_IP_REASS_FORCE_OFF          0
189 #define FSW_IP_REASS_FORCE_ON           1
190 #define FSW_IP_REASS_NO_FORCE           2
191 
192 uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
193 
194 static int
195 fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
196 {
197 #pragma unused(oidp, arg1, arg2)
198 	unsigned int new_value;
199 	int changed;
200 	int error;
201 
202 	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
203 	    &new_value, &changed);
204 	if (error == 0 && changed != 0) {
205 		if (new_value > FSW_IP_REASS_NO_FORCE) {
206 			return EINVAL;
207 		}
208 		fsw_ip_reass = new_value;
209 	}
210 	return error;
211 }
212 
213 SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
214     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
215     0, 0, fsw_ip_reass_sysctl, "IU",
216     "adjust flowswitch IP reassembly");
217 
218 #if (DEVELOPMENT || DEBUG)
219 static uint64_t _fsw_inject_error = 0;
220 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
221 	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
222 	&FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
223 
224 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
225 	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
226 	        SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
227 	        if ((_f) != NULL)                                       \
228 	                (_f)(__VA_ARGS__);                              \
229 	}                                                               \
230 } while (0)
231 
232 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
233     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
234 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
235     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
236 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
237     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
238 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
239     flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
240     &fsw_flow_route_id_buckets, 0, "");
241 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
242     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
243 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_thresh,
244     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_thresh, 0, "");
245 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_stall_defunct,
246     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_stall_defunct, 0, "");
247 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
248     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
249 SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
250     CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
251 #else
252 #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
253 #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
254 #endif /* !DEVELOPMENT && !DEBUG */
255 
256 static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
257     struct flow_entry *);
258 static void fsw_reap_thread_func(void *, wait_result_t);
259 static void fsw_reap_thread_cont(void *, wait_result_t);
260 static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
261 static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
262 static uint32_t fsw_process_deferred(struct nx_flowswitch *);
263 static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
264 
265 static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
266     struct __kern_packet *);
267 
268 static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
269 static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
270     uint32_t, uint32_t);
271 
272 static int __fsw_dp_inited = 0;
273 
int
fsw_dp_init(void)
{
	/*
	 * One-time initialization of the flowswitch datapath globals.
	 *
	 * Compile-time sanity checks: the fixed dev (0) and host (1)
	 * nexus ports must fall below both the first user port and the
	 * flowswitch client port base.
	 */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	/* must not be initialized twice */
	ASSERT(!__fsw_dp_inited);

	/* bring up the flow manager and the flow subsystem */
	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}
291 
292 void
fsw_dp_uninit(void)293 fsw_dp_uninit(void)
294 {
295 	if (__fsw_dp_inited) {
296 		flow_fini();
297 		flow_mgr_fini();
298 
299 		__fsw_dp_inited = 0;
300 	}
301 }
302 
/*
 * Free every packet on 'pktq' back to its packet pool.  The fsw
 * argument is unused; it is kept so the helper matches the call
 * shape used by the drop macros below.
 */
static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}
308 
/*
 * dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags)
 *
 * Drop and free an entire packet queue.  Accounts the drop in
 * FSW_STATS_DROP and fires a DTrace probe.  When no droptap tap is
 * active the queue is simply freed; otherwise each packet is first
 * reported to droptap ('outgoing' selects the output vs. input
 * report function).
 *
 * NOTE: this macro expands in the caller's scope and executes a bare
 * "return" when the queue is empty -- only usable in void functions.
 */
#define dp_drop_pktq(fsw, pktq, outgoing, _reason, line, _flags) do {         \
	uint32_t _len = KPKTQ_LEN(pktq);                                      \
	if (KPKTQ_EMPTY(pktq)) {                                              \
	        ASSERT(_len == 0);                                            \
	        return;                                                       \
	}                                                                     \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len);        \
	FSW_STATS_ADD(FSW_STATS_DROP, _len);                                  \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len);                            \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        dp_free_pktq(fsw, pktq);                                      \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *kpkt = KPKTQ_FIRST(pktq);                       \
	struct __kern_packet *next_pkt;                                       \
	for (; kpkt != NULL; kpkt = next_pkt) {                               \
	        next_pkt = kpkt->pkt_nextpkt;                                 \
	        dropfunc(SK_PKT2PH(kpkt), _reason, __func__, line, _flags,    \
	            fsw->fsw_ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL,      \
	            0, 0);                                                    \
	}                                                                     \
	dp_free_pktq(fsw, pktq);                                              \
} while (0)
334 
/*
 * dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags)
 *
 * Drop and free a single packet.  Accounts the drop in
 * FSW_STATS_DROP and, when a droptap tap is active, reports the
 * packet to droptap (output vs. input selected by 'outgoing')
 * before freeing it.
 */
#define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do {          \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet");                \
	FSW_STATS_ADD(FSW_STATS_DROP, 1);                                     \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        pp_free_packet_single(pkt);                                   \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags,         \
	    fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0);      \
	pp_free_packet_single(pkt);                                           \
} while (0)
348 
/*
 * dp_drop_pkt_chain(pkt, outgoing, _reason, _flags)
 *
 * Drop and free a chain of packets linked via pkt_nextpkt.  When a
 * droptap tap is active each packet is reported first (with a NULL
 * ifp, since no flowswitch argument is taken).
 *
 * NOTE(review): unlike dp_drop_pktq()/dp_drop_pkt_single(), this
 * variant does not update FSW_STATS_DROP -- confirm that callers
 * account these drops separately.
 */
#define dp_drop_pkt_chain(pkt, outgoing, _reason, _flags) do {                \
	if (__probable(droptap_total_tap_count == 0)) {                       \
	        pp_free_packet_chain(pkt, NULL);                              \
	        break;                                                        \
	}                                                                     \
	drop_func_t dropfunc;                                                 \
	dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \
	struct __kern_packet *next_pkt;                                       \
	for (; pkt != NULL; pkt = next_pkt) {                                 \
	        next_pkt = pkt->pkt_nextpkt;                                  \
	        dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \
	            NULL, pkt->pkt_qum.qum_pid, NULL, -1, NULL,               \
	            0, 0);                                                    \
	}                                                                     \
	pp_free_packet_chain(pkt, NULL);                                      \
} while (0)
365 
366 
367 SK_NO_INLINE_ATTRIBUTE
368 void
fsw_snoop(struct nx_flowswitch * fsw,struct flow_entry * fe,struct pktq * pktq,bool input)369 fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *pktq,
370     bool input)
371 {
372 	pid_t pid;
373 	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
374 	const char *__null_terminated proc_name = NULL;
375 	pid_t epid;
376 	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
377 	const char *__null_terminated eproc_name = NULL;
378 	sa_family_t af;
379 	bool tap_early = false;
380 	struct __kern_packet *pkt;
381 
382 	ASSERT(fe != NULL);
383 	ASSERT(fsw->fsw_ifp != NULL);
384 
385 	if (fe->fe_nx_port == FSW_VP_HOST) {
386 		/* allow packets to be tapped before aggregation happens */
387 		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
388 		if (!tap_early) {
389 			/* all other traffic will be tapped in the dlil input path */
390 			return;
391 		}
392 	}
393 	if (fe->fe_key.fk_ipver == IPVERSION) {
394 		af = AF_INET;
395 	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
396 		af = AF_INET6;
397 	} else {
398 		return;
399 	}
400 
401 	pid = fe->fe_pid;
402 	if (fe->fe_proc_name[0] != '\0') {
403 		proc_name = strbufcpy(proc_name_buf, sizeof(proc_name_buf),
404 		    fe->fe_proc_name, sizeof(fe->fe_proc_name));
405 	}
406 	epid = fe->fe_epid;
407 	if (fe->fe_eproc_name[0] != '\0') {
408 		eproc_name = strbufcpy(eproc_name_buf, sizeof(eproc_name_buf),
409 		    fe->fe_eproc_name, sizeof(fe->fe_eproc_name));
410 	}
411 	if (input) {
412 		KPKTQ_FOREACH(pkt, pktq) {
413 			pktap_input_packet(fsw->fsw_ifp, af,
414 			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
415 			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
416 			    IPPROTO_TCP, fe->fe_flowid,
417 			    tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
418 		}
419 	} else {
420 		KPKTQ_FOREACH(pkt, pktq) {
421 			pktap_output_packet(fsw->fsw_ifp, af,
422 			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
423 			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
424 			    0, 0, PTH_FLAG_NEXUS_CHAN);
425 		}
426 	}
427 }
428 
429 #if (DEVELOPMENT || DEBUG)
/*
 * Error-injection helper for scenario 35 (DEVELOPMENT/DEBUG only).
 * Step 1 artificially clears FLOWRTF_RESOLVED on a route that is
 * fully resolved (has link-layer info), forcing the resolver path to
 * run again; step 2 then frees any attached mbuf, restores the flag
 * and makes the caller bail out with EJUSTRETURN.
 */
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	/* remembers whether step 1 actually cleared the flag */
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		/* free the attached mbuf, if any, before erroring out */
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
464 
465 static void
_fsw_error36_handler(int step,struct flow_route * fr,int * ret)466 _fsw_error36_handler(int step, struct flow_route *fr, int *ret)
467 {
468 	static boolean_t _err36_flag_modified = FALSE;
469 
470 	switch (step) {
471 	case 1:
472 		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
473 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
474 			fr->fr_flags &= ~FLOWRTF_RESOLVED;
475 			_err36_flag_modified = TRUE;
476 		}
477 		break;
478 
479 	case 2:
480 		if (!_err36_flag_modified) {
481 			return;
482 		}
483 		*ret = ENETUNREACH;
484 		fr->fr_flags |= FLOWRTF_RESOLVED;
485 		_err36_flag_modified = FALSE;
486 		break;
487 
488 	default:
489 		VERIFY(0);
490 		/* not reached */
491 	}
492 }
493 #else /* !DEVELOPMENT && !DEBUG */
494 #define _fsw_error35_handler(...)
495 #define _fsw_error36_handler(...)
496 #endif /* DEVELOPMENT || DEBUG */
497 
498 /*
499  * Check if the source packet content can fit into the destination
500  * ring's packet. Returns TRUE if the source packet can fit.
501  * Note: Failures could be caused by misconfigured packet pool sizes,
502  * missing packet size check against MTU or if the source packet is from
503  * a compat netif and the attached mbuf is larger than MTU due to LRO.
504  */
505 static inline boolean_t
validate_pkt_len(struct __kern_packet * spkt,kern_packet_t dph,uint32_t skip_l2hlen,uint32_t l2hlen,uint16_t headroom,uint32_t * copy_len)506 validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
507     uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
508     uint32_t *copy_len)
509 {
510 	uint32_t tlen = 0;
511 	uint32_t splen = spkt->pkt_length - skip_l2hlen;
512 
513 	if (l2hlen != 0) {
514 		VERIFY(skip_l2hlen == 0);
515 		tlen += l2hlen;
516 	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
517 		splen -= ETHER_CRC_LEN;
518 	}
519 
520 	tlen += splen;
521 	*copy_len = splen;
522 
523 	return tlen <= ((__packet_get_buflet_count(dph) *
524 	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
525 	       headroom);
526 }
527 
528 #if SK_LOG
529 /* Hoisted out of line to reduce kernel stack footprint */
/*
 * Debug logging for copy_packet_from_dev(): records source and
 * destination packet lengths and dumps the start of the destination
 * buffer.  Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	/* select the mbuf-copy vs. packet-copy verbosity flag */
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	pkt_len = __packet_get_real_data_length(dpkt);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	/* dump at most 128 bytes of the copied contents */
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
}
550 #else
551 #define copy_packet_from_dev_log(...)
552 #endif /* SK_LOG */
553 
554 
static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * Source and destination nexus don't share the packet pool.
	 * The sync operation here is to:
	 * - alloc packet for the rx(dst) ring
	 * - copy data/metadata from src packet to dst packet
	 * - attach alloc'd packet to rx(dst) ring
	 * Returns 0 on success, EINVAL if the source contents cannot
	 * fit into the destination packet's buflets.
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;  /* L2 header is stripped */
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* destination payload starts at the buffer head, no L2 header */
	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		/* optionally append debug trailer bytes to the copy */
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	/*
	 * The source mbuf, if any, is consumed here: account its
	 * traffic class, free it, and clear the metadata reference;
	 * otherwise account using the copied destination packet.
	 */
	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}
655 
656 SK_NO_INLINE_ATTRIBUTE
657 static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch * fsw,struct __kern_packet * pkt)658 rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
659 {
660 	char *pkt_buf;
661 	void *l3_hdr;
662 	uint16_t nfrags, tlen;
663 	int err = 0;
664 
665 	switch (fsw_ip_reass) {
666 	case FSW_IP_REASS_FORCE_OFF:
667 		return pkt;
668 	case FSW_IP_REASS_FORCE_ON:
669 		break;
670 	default:
671 		if (!FSW_NETAGENT_ENABLED(fsw) ||
672 		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
673 			return pkt;
674 		}
675 		break;
676 	}
677 
678 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
679 	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
680 
681 	ASSERT(fsw->fsw_ipfm != NULL);
682 	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
683 
684 	if (pkt->pkt_flow_ip_ver == IPVERSION) {
685 		struct ip *ip = l3_hdr;
686 		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt, ip, &nfrags, &tlen);
687 	} else {
688 		struct ip6_hdr *ip6_hdr = l3_hdr;
689 		struct ip6_frag *__single ip6_frag =
690 		    (struct ip6_frag *)((uint8_t *)l3_hdr + sizeof(struct ip6_hdr));
691 
692 		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
693 		/* we only handle frag header immediately after v6 header */
694 		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt, ip6_hdr, ip6_frag,
695 		    &nfrags, &tlen);
696 	}
697 	if (__improbable(err != 0)) {
698 		/* if we get a bad fragment, free it */
699 		pp_free_packet_single(pkt);
700 		pkt = NULL;
701 	} else {
702 		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
703 	}
704 
705 	return pkt;
706 }
707 
/*
 * Prepare a compat-netif packet for classification.  Such a packet
 * carries only metadata plus an attached mbuf; copy just enough of
 * the mbuf into the packet buffer for the flowswitch classifier to
 * parse the protocol headers.
 */
SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph =  SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. Compat netif packet pool is configured
	 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification.
	 * Ethernet(14) + IPv6 header(40) +
	 * + IPv6 fragment header(8) +
	 * TCP header with options(60).
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	/* the packet keeps its mbuf attached; finalize accordingly */
	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}
744 
745 static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)746 rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
747 {
748 	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
749 
750 	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
751 		rx_prepare_packet_mbuf(fsw, pkt);
752 	}
753 
754 	return pkt;
755 }
756 
/*
 * Map a classified packet to its flow entry.  'prev_fe' (may be NULL)
 * is the entry matched for the previous packet of the same batch and
 * acts as a one-entry cache.  Returns a retained flow entry (caller
 * must release it) or NULL when no flow matches.
 */
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *__single fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	/* fast path: reuse prev_fe when its 5-tuple matches this packet */
	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			/* restore the original mask for the full lookup */
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	/* resolve parent/child relationships for demuxed flows */
	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				/* prefer a matching child flow, if one exists */
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				/*
				 * NOTE(review): on a demux mismatch the lookup
				 * is retried from 'top'; this relies on the
				 * retry not returning the same non-matching
				 * entry again -- confirm it cannot spin.
				 */
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					/* the packet belongs to one of the children */
					struct flow_entry *__single parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}
827 
/*
 * Decide whether an Rx packet matched by a 2-tuple listener flow is really
 * destined to the local host: true if the destination address is a special
 * address (loopback, link-local, multicast, broadcast, ...) or is configured
 * on a local interface.  Returns false for addresses that match no local
 * interface (e.g. traffic that may be forwarded to a client device).
 */
SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		/* special IPv4 destinations are always for the listener */
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's a exact match with another
				 * interface
				 */
			}
		}

		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				/* address is local; release the lock before returning */
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface.  Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				/* directed broadcast or network broadcast match */
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		struct in6_ifaddrhashhead *ia6_hash_head;

		/* special IPv6 destinations are always for the listener */
		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		/* XXX -fbounds-safety: external dependency on ip6_input.c */
		ia6_hash_head = __unsafe_forge_bidi_indexable(struct in6_ifaddrhashhead *,
		    in6_ifaddrhashtbl, in6addr_nhash * sizeof(*in6_ifaddrhashtbl));
		ia6_hash_head = &ia6_hash_head[in6addr_hashval(&flow->flow_ipv6_dst)];

		TAILQ_FOREACH(ia6, ia6_hash_head, ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst,
			    ia6->ia_ifp->if_index, ifp->if_index)) {
				/* skip tentative/duplicated and CLAT46 addresses */
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's a exact match with another
				 * interface
				 */
			}
		}
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when interface address check is strict and forwarding
				 * is disabled
				 */
			} else {
				/* address is local; release the lock before returning */
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address
	 * of the packet does not match any interface
	 * address, it maybe destined to the client device
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}
979 
/*
 * Rx-side flow lookup with listener and teardown filtering.
 * Returns a retained flow entry (caller must release) or NULL, with the
 * appropriate drop statistic bumped on failure.
 */
static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *__single fe;

	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		return NULL;
	}

	/*
	 * A 2-tuple listener flow only accepts packets whose destination
	 * is a local/special address (see pkt_is_for_listener()).
	 */
	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
	    fe->fe_flags & FLOWENTF_LISTENER) &&
	    !pkt_is_for_listener(fe, pkt)) {
		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
		flow_entry_release(&fe);
		return NULL;
	}

	/* flows already torn down must not receive further packets */
	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down");
		flow_entry_release(&fe);
		fe = NULL;
	}

	return fe;
}
1011 
/*
 * Enqueue an Rx packet (chain) onto its flow entry's Rx queue, and, if this
 * thread becomes the flow's worker, add the entry to the caller's batch list
 * `fes'.  The caller's retained reference on `fe' is either transferred to
 * the batch list or released here.
 *
 * @param tid  calling thread's ID, used to elect/recognize the Rx worker.
 */
static inline void
rx_flow_batch_packets(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt, uint64_t tid)
{
	/*
	 * Among threads working on the same fe, the first thread that reaches here
	 * will be responsible for processing all the packets until a point when
	 * it does not see new packets in fe_rx_pktq. Other threads only
	 * enqueue their packets but do not add the flow entry to their flow entry list.
	 */
	lck_mtx_lock(&fe->fe_rx_pktq_lock);

	/* elect this thread as the Rx worker if none is set */
	if (fe->fe_rx_worker_tid == 0) {
		fe->fe_rx_worker_tid = tid;
	} else if (__improbable(fe->fe_rx_worker_tid != tid)) {
		STATS_INC(&fe->fe_fsw->fsw_stats, FSW_STATS_RX_FLOW_IN_USE);
	}

	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq) && tid == fe->fe_rx_worker_tid) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		/* worker with an empty queue: take ownership of the fe ref */
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
	} else {
		/* queue not empty or not the worker: enqueue only, drop our ref */
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		flow_entry_release(&fe);
	}
}
1047 
/*
 * Enqueue a Tx packet onto its flow entry's Tx queue, adding the entry to
 * the caller's batch list `fes' the first time.  The caller's retained
 * reference on `fe' is either transferred to the batch list (first packet)
 * or released here (subsequent packets).
 */
static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		/* non-frag packet terminates any fragment continuation */
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		/* first packet: the fe ref is handed over to the batch list */
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		/* fe is already on some batch list; just enqueue and drop our ref */
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}
1072 
/*
 * Dequeue up to n_pkts_max packets from the Rx ring's [khead, rhead) slot
 * range into `pktq', detaching each packet from its slot.  Packets marked
 * QUM_F_DROPPED or with zero length are freed and counted as drops.
 * On return, *n_bytes holds the total byte count of the dequeued packets,
 * and the ring's khead/ktail are advanced past the consumed slots.
 */
static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		/* free dropped or empty packets instead of forwarding them */
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	/* publish the new head; ktail trails one slot behind the head */
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}
1108 
1109 /*
1110  * This is only for estimating how many packets each GSO packet will need.
1111  * The number does not need to be exact because any leftover packets allocated
1112  * will be freed.
1113  */
1114 static uint32_t
estimate_gso_pkts(struct __kern_packet * pkt)1115 estimate_gso_pkts(struct __kern_packet *pkt)
1116 {
1117 	packet_tso_flags_t tso_flags;
1118 	uint16_t mss;
1119 	uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1120 
1121 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1122 	mss = pkt->pkt_proto_seg_sz;
1123 
1124 	if (tso_flags == PACKET_TSO_IPV4) {
1125 		total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1126 	} else if (tso_flags == PACKET_TSO_IPV6) {
1127 		total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1128 	}
1129 	if (total_hlen != 0 && mss != 0) {
1130 		total_len = pkt->pkt_length;
1131 		n_pkts = (uint32_t)
1132 		    (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1133 	}
1134 	DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1135 	    uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1136 	    uint32_t, n_pkts);
1137 	return n_pkts;
1138 }
1139 
1140 /*
1141  * This function retrieves a chain of packets of the same type only
1142  * (GSO or non-GSO).
1143  */
1144 static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,uint32_t n_pkts_max,struct pktq * pktq,uint32_t * n_bytes,uint32_t * gso_pkts_estimate)1145 fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1146     struct __kern_channel_ring *r, uint32_t n_pkts_max,
1147     struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1148 {
1149 	uint32_t n_pkts = 0;
1150 	slot_idx_t idx, idx_end;
1151 	idx = r->ckr_khead;
1152 	idx_end = r->ckr_rhead;
1153 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1154 	boolean_t gso_enabled, gso_required;
1155 	uint32_t gso_pkts;
1156 
1157 	gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1158 	ASSERT(KPKTQ_EMPTY(pktq));
1159 	*n_bytes = 0;
1160 	for (; n_pkts < n_pkts_max &&
1161 	    (!gso_enabled || fsw_gso_batch == 0 ||
1162 	    *gso_pkts_estimate < fsw_gso_batch) &&
1163 	    idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1164 		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1165 		struct __kern_packet *pkt = ksd->sd_pkt;
1166 
1167 		ASSERT(pkt->pkt_nextpkt == NULL);
1168 
1169 		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1170 		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1171 		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1172 		    || (pkt->pkt_length == 0)) {
1173 			KR_SLOT_DETACH_METADATA(r, ksd);
1174 			FSW_STATS_INC(FSW_STATS_DROP);
1175 			pp_free_packet_single(pkt);
1176 			continue;
1177 		}
1178 		if (gso_enabled) {
1179 			gso_pkts = estimate_gso_pkts(pkt);
1180 
1181 			/*
1182 			 * We use the first packet to determine what
1183 			 * type the subsequent ones need to be (GSO or
1184 			 * non-GSO).
1185 			 */
1186 			if (n_pkts == 0) {
1187 				gso_required = (gso_pkts != 0);
1188 			} else {
1189 				if (gso_required != (gso_pkts != 0)) {
1190 					break;
1191 				}
1192 			}
1193 			*gso_pkts_estimate += gso_pkts;
1194 		}
1195 		KR_SLOT_DETACH_METADATA(r, ksd);
1196 		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1197 			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
1198 			    vpna->vpna_nx_port, vpna->vpna_gencnt);
1199 		}
1200 		n_pkts++;
1201 		*n_bytes += pkt->pkt_length;
1202 		KPKTQ_ENQUEUE(pktq, pkt);
1203 	}
1204 	r->ckr_khead = idx;
1205 	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1206 	DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1207 	    ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1208 	    uint32_t, *gso_pkts_estimate);
1209 }
1210 
/*
 * Attach as many packets from `pktq' as fit into free slots of the Rx ring
 * `r', advance the ring tail, and notify the channel.  Packets that do not
 * fit remain in `pktq' for the caller to handle (e.g. tail drop).
 */
static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	/* only enqueue what fits in the ring */
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		/* trace handoff from flowswitch stage to channel stage */
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	/* wake up the channel consumer */
	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}
1263 
1264 static void
pkts_to_pktq(struct __kern_packet ** __counted_by (n_pkts)pkts,uint32_t n_pkts,struct pktq * pktq)1265 pkts_to_pktq(struct __kern_packet **__counted_by(n_pkts)pkts, uint32_t n_pkts, struct pktq *pktq)
1266 {
1267 	ASSERT(KPKTQ_EMPTY(pktq));
1268 
1269 	for (uint32_t i = 0; i < n_pkts; i++) {
1270 		struct __kern_packet *__single pkt = pkts[i];
1271 		ASSERT(pkt->pkt_nextpkt == NULL);
1272 		KPKTQ_ENQUEUE(pktq, pkt);
1273 	}
1274 }
1275 
1276 /*
1277  * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1278  */
1279 SK_NO_INLINE_ATTRIBUTE
1280 static void
convert_native_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_headp,struct mbuf ** m_tailp,uint32_t * cnt,uint32_t * bytes)1281 convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1282     struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1283 {
1284 	uint32_t tot_cnt;
1285 	unsigned int num_segs = 1;
1286 	struct mbuf *__single mhead, *__single head = NULL;
1287 	struct mbuf *__single tail = NULL, **__single tailp = &head;
1288 	uint32_t mhead_cnt, mhead_bufsize;
1289 	uint32_t mhead_waste = 0;
1290 	uint32_t mcnt = 0, mbytes = 0;
1291 	uint32_t largest, max_pkt_len;
1292 	struct __kern_packet *__single pkt;
1293 	struct kern_pbufpool *pp;
1294 
1295 	tot_cnt = KPKTQ_LEN(pktq);
1296 	ASSERT(tot_cnt > 0);
1297 	mhead_cnt = tot_cnt;
1298 
1299 	/*
1300 	 * Opportunistically batch-allocate the mbufs based on the largest
1301 	 * packet size we've seen in the recent past.  Note that we reset
1302 	 * fe_rx_largest_size below if we notice that we're under-utilizing the
1303 	 * allocated buffers (thus disabling this batch allocation).
1304 	 */
1305 	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1306 	if (__probable(largest != 0)) {
1307 		if (largest <= MCLBYTES) {
1308 			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1309 			    &num_segs, M_NOWAIT, 1, 0);
1310 			mhead_bufsize = MCLBYTES;
1311 		} else if (largest <= MBIGCLBYTES) {
1312 			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1313 			    &num_segs, M_NOWAIT, 1, 0);
1314 			mhead_bufsize = MBIGCLBYTES;
1315 		} else if (largest <= M16KCLBYTES) {
1316 			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1317 			    &num_segs, M_NOWAIT, 1, 0);
1318 			mhead_bufsize = M16KCLBYTES;
1319 		} else if (largest <= M16KCLBYTES * 2) {
1320 			num_segs = 2;
1321 			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1322 			    &num_segs, M_NOWAIT, 1, 0);
1323 			mhead_bufsize = M16KCLBYTES * 2;
1324 		} else {
1325 			mhead = NULL;
1326 			mhead_bufsize = mhead_cnt = 0;
1327 		}
1328 	} else {
1329 		mhead = NULL;
1330 		mhead_bufsize = mhead_cnt = 0;
1331 	}
1332 	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1333 	    uint32_t, mhead_cnt, uint32_t, tot_cnt);
1334 
1335 	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1336 	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1337 
1338 	KPKTQ_FOREACH(pkt, pktq) {
1339 		uint32_t tot_len, len;
1340 		uint16_t pad, llhlen, iphlen;
1341 		boolean_t do_cksum_rx;
1342 		struct mbuf *__single m;
1343 		int error;
1344 
1345 		llhlen = pkt->pkt_l2_len;
1346 		len = pkt->pkt_length;
1347 		if (__improbable(len > max_pkt_len || len == 0 || llhlen > len)) {
1348 			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1349 			    struct __kern_packet *, pkt);
1350 			FSW_STATS_INC(FSW_STATS_DROP);
1351 			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1352 			continue;
1353 		}
1354 		/* begin payload on 32-bit boundary; figure out the padding */
1355 		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1356 		tot_len = pad + len;
1357 
1358 		/* remember largest packet size */
1359 		if (__improbable(largest < tot_len)) {
1360 			largest = MAX(tot_len, MCLBYTES);
1361 		}
1362 
1363 		/*
1364 		 * If the above batch allocation returned partial
1365 		 * success, we try a blocking allocation here again.
1366 		 */
1367 		m = mhead;
1368 		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1369 			ASSERT(mhead != NULL || mhead_cnt == 0);
1370 			num_segs = 1;
1371 			if (tot_len > M16KCLBYTES) {
1372 				num_segs = 0;
1373 			}
1374 			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1375 			    &num_segs, &m)) != 0) {
1376 				DTRACE_SKYWALK2(bad__len,
1377 				    struct nx_flowswitch *, fsw,
1378 				    struct __kern_packet *, pkt);
1379 				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1380 				FSW_STATS_INC(FSW_STATS_DROP);
1381 				continue;
1382 			}
1383 		} else {
1384 			mhead = m->m_nextpkt;
1385 			m->m_nextpkt = NULL;
1386 			ASSERT(mhead_cnt != 0);
1387 			--mhead_cnt;
1388 
1389 			/* check if we're underutilizing large buffers */
1390 			if (__improbable(mhead_bufsize > MCLBYTES &&
1391 			    tot_len < (mhead_bufsize >> 1))) {
1392 				++mhead_waste;
1393 			}
1394 			/*
1395 			 * Clean up unused mbuf.
1396 			 * Ony need to do this when we pre-alloc 2x16K mbufs
1397 			 */
1398 			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1399 				ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1400 				struct mbuf *m_extra = m->m_next;
1401 				ASSERT(m_extra != NULL);
1402 				ASSERT(m_extra->m_len == 0);
1403 				ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1404 				m->m_next = NULL;
1405 				m_freem(m_extra);
1406 				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1407 			}
1408 		}
1409 		m->m_data += pad;
1410 		/*
1411 		 * XXX -fbounds-safety: external dependency
1412 		 * mtod does not work because m_len is 0
1413 		 */
1414 		m->m_pkthdr.pkt_hdr = m_mtod_current(m);
1415 
1416 		/* don't include IP header from partial sum */
1417 		if (__probable((pkt->pkt_qum_qflags &
1418 		    QUM_F_FLOW_CLASSIFIED) != 0)) {
1419 			iphlen = pkt->pkt_flow_ip_hlen;
1420 			do_cksum_rx = sk_cksum_rx;
1421 		} else {
1422 			iphlen = 0;
1423 			do_cksum_rx = FALSE;
1424 		}
1425 
1426 		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1427 		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1428 		    llhlen + iphlen);
1429 
1430 		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1431 		if (do_cksum_rx) {
1432 			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1433 		}
1434 #if DEBUG || DEVELOPMENT
1435 		if (__improbable(pkt_trailers > 0)) {
1436 			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1437 		}
1438 #endif /* DEBUG || DEVELOPMENT */
1439 		m_adj(m, llhlen);
1440 
1441 		m->m_pkthdr.rcvif = fsw->fsw_ifp;
1442 		if (__improbable((pkt->pkt_link_flags &
1443 		    PKT_LINKF_ETHFCS) != 0)) {
1444 			m->m_flags |= M_HASFCS;
1445 		}
1446 		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1447 			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1448 		}
1449 		ASSERT(m->m_nextpkt == NULL);
1450 		tail = m;
1451 		*tailp = m;
1452 		tailp = &m->m_nextpkt;
1453 		mcnt++;
1454 		mbytes += m_pktlen(m);
1455 	}
1456 	/* free any leftovers */
1457 	if (__improbable(mhead != NULL)) {
1458 		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1459 		ASSERT(mhead_cnt != 0);
1460 		(void) m_freem_list(mhead);
1461 		mhead = NULL;
1462 		mhead_cnt = 0;
1463 	}
1464 
1465 	/* reset if most packets (>50%) are smaller than our batch buffers */
1466 	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1467 		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1468 		    struct flow_entry *, NULL, uint32_t, mhead_waste,
1469 		    uint32_t, tot_cnt);
1470 		largest = 0;
1471 	}
1472 
1473 	if (largest != fsw->fsw_rx_largest_size) {
1474 		os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1475 	}
1476 
1477 	pp_free_pktq(pktq);
1478 	*m_headp = head;
1479 	*m_tailp = tail;
1480 	*cnt = mcnt;
1481 	*bytes = mbytes;
1482 }
1483 
1484 /*
1485  * This function only extracts the mbuf from the packet. The caller frees
1486  * the packet.
1487  */
1488 static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * pkt)1489 convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1490 {
1491 	struct mbuf *m;
1492 	struct pkthdr *mhdr;
1493 	uint16_t llhlen;
1494 
1495 	m = pkt->pkt_mbuf;
1496 	ASSERT(m != NULL);
1497 
1498 	llhlen = pkt->pkt_l2_len;
1499 	if (llhlen > pkt->pkt_length) {
1500 		m_freem(m);
1501 		KPKT_CLEAR_MBUF_DATA(pkt);
1502 		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1503 		    struct __kern_packet *, pkt);
1504 		FSW_STATS_INC(FSW_STATS_DROP);
1505 		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1506 		return NULL;
1507 	}
1508 	mhdr = &m->m_pkthdr;
1509 	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1510 	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1511 		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1512 		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1513 		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1514 		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1515 	}
1516 #if DEBUG || DEVELOPMENT
1517 	uint32_t extra = 0;
1518 	if (__improbable(pkt_trailers > 0)) {
1519 		extra = pkt_add_trailers_mbuf(m, llhlen);
1520 	}
1521 #endif /* DEBUG || DEVELOPMENT */
1522 	m_adj(m, llhlen);
1523 	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1524 	KPKT_CLEAR_MBUF_DATA(pkt);
1525 	return m;
1526 }
1527 
1528 SK_NO_INLINE_ATTRIBUTE
1529 static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch * fsw,struct pktq * pktq,struct mbuf ** m_head,struct mbuf ** m_tail,uint32_t * cnt,uint32_t * bytes)1530 convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1531     struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1532 {
1533 	struct __kern_packet *pkt;
1534 	struct mbuf *__single m, *__single head = NULL;
1535 	struct mbuf *__single tail = NULL, **__single tailp = &head;
1536 	uint32_t c = 0, b = 0;
1537 
1538 	KPKTQ_FOREACH(pkt, pktq) {
1539 		m = convert_compat_pkt_to_mbuf(fsw, pkt);
1540 		if (__improbable(m == NULL)) {
1541 			continue;
1542 		}
1543 		tail = m;
1544 		*tailp = m;
1545 		tailp = &m->m_nextpkt;
1546 		c++;
1547 		b += m_pktlen(m);
1548 	}
1549 	pp_free_pktq(pktq);
1550 	*m_head = head;
1551 	*m_tail = tail;
1552 	*cnt = c;
1553 	*bytes = b;
1554 }
1555 
1556 void
fsw_host_sendup(ifnet_t ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes)1557 fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
1558     uint32_t cnt, uint32_t bytes)
1559 {
1560 	struct ifnet_stat_increment_param s;
1561 
1562 	bzero(&s, sizeof(s));
1563 	s.packets_in = cnt;
1564 	s.bytes_in = bytes;
1565 	dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1566 }
1567 
/*
 * Deliver a queue of Rx packets to the host stack.  If the interface has a
 * flowswitch Rx callback registered, packets are first offered to it; any
 * leftovers (and the non-callback case) are converted to mbufs and sent up
 * through DLIL.  All packets in `pktq' are consumed.
 */
void
fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
	uint32_t cnt = 0, bytes = 0;
	ifnet_fsw_rx_cb_t __single cb;
	void *__single cb_arg;
	boolean_t compat;

	ASSERT(!KPKTQ_EMPTY(pktq));
	/* offer the packets to the registered Rx callback, if any */
	if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
		ASSERT(cb != NULL);
		ASSERT(cb_arg != NULL);
		(*cb)(cb_arg, pktq);
		ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
		if (KPKTQ_EMPTY(pktq)) {
			/* callback consumed everything */
			return;
		} else {
			DTRACE_SKYWALK2(leftover__pkts, struct nx_flowswitch *, fsw,
			    struct pktq *, pktq);
		}
	}

	/* All packets in the pktq must have the same type */
	compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
	if (compat) {
		convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
		    &bytes);
	} else {
		convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
		    &bytes);
	}
	if (__improbable(m_head == NULL)) {
		/* every packet was dropped during conversion */
		DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
		return;
	}
	fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
}
1606 
1607 void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct pktq * pktq)1608 fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
1609     struct __kern_channel_ring *r, struct pktq *pktq)
1610 {
1611 	fsw_ring_enqueue_pktq(fsw, r, pktq);
1612 	/*
1613 	 * Rx stall detection: don't update enqueue ts if dequeue ts < enqueue ts.
1614 	 * This is to ensure we use the timestamp of the earliest enqueue without
1615 	 * a dequeue.
1616 	 */
1617 	if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) {
1618 		r->ckr_rx_enqueue_ts = _net_uptime;
1619 	}
1620 	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1621 	dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__,
1622 	    DROPTAP_FLAG_L2_MISSING);
1623 }
1624 
1625 static struct nexus_adapter *
flow_get_na(struct nx_flowswitch * fsw,struct flow_entry * fe)1626 flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1627 {
1628 	struct kern_nexus *nx = fsw->fsw_nx;
1629 	struct nexus_adapter *na = NULL;
1630 	nexus_port_t port = fe->fe_nx_port;
1631 
1632 	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1633 		SK_ERR("dev or host ports have no NA");
1634 		return NULL;
1635 	}
1636 
1637 	if (__improbable(!nx_port_is_valid(nx, port))) {
1638 		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1639 		    if_name(fsw->fsw_ifp), port);
1640 		return NULL;
1641 	}
1642 
1643 	na = nx_port_get_na(nx, port);
1644 	if (__improbable(na == NULL)) {
1645 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1646 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1647 		    if_name(fsw->fsw_ifp), port);
1648 		return NULL;
1649 	}
1650 
1651 	if (__improbable(!NA_IS_ACTIVE(na))) {
1652 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1653 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1654 		    if_name(fsw->fsw_ifp), port);
1655 		return NULL;
1656 	}
1657 
1658 	if (__improbable(nx_port_is_defunct(nx, port))) {
1659 		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1660 		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1661 		    if_name(fsw->fsw_ifp), port);
1662 		return NULL;
1663 	}
1664 
1665 	return na;
1666 }
1667 
1668 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1669 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1670 {
1671 	struct nexus_vp_adapter *na = NULL;
1672 	struct __kern_channel_ring *__single r = NULL;
1673 
1674 	na = VPNA(flow_get_na(fsw, fe));
1675 	if (__improbable(na == NULL)) {
1676 		return NULL;
1677 	}
1678 
1679 	switch (txrx) {
1680 	case NR_RX:
1681 		r = KR_SINGLE(&na->vpna_up.na_rx_rings[0]);
1682 		break;
1683 	case NR_TX:
1684 		r = KR_SINGLE(&na->vpna_up.na_tx_rings[0]);
1685 		break;
1686 	default:
1687 		__builtin_unreachable();
1688 		VERIFY(0);
1689 	}
1690 
1691 	if (__improbable(KR_DROP(r))) {
1692 		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1693 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %0xllx %s drop mode",
1694 		    r->ckr_name, SK_KVA(r));
1695 		return NULL;
1696 	}
1697 
1698 	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1699 
1700 #if (DEVELOPMENT || DEBUG)
1701 	if (r != NULL) {
1702 		_FSW_INJECT_ERROR(4, r, NULL, null_func);
1703 	}
1704 #endif /* DEVELOPMENT || DEBUG */
1705 
1706 	return r;
1707 }
1708 
/*
 * Return the Rx ring of the flow's destination nexus port, or NULL if the
 * port/adapter/ring is unusable (see flow_get_ring()).
 */
struct __kern_channel_ring *
fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	return flow_get_ring(fsw, fe, NR_RX);
}
1714 
1715 static inline struct __kern_channel_ring *
fsw_flow_get_tx_ring(struct nx_flowswitch * fsw,struct flow_entry * fe)1716 fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1717 {
1718 	return flow_get_ring(fsw, fe, NR_TX);
1719 }
1720 
/*
 * Validate the flow's local-address/route state before the datapath
 * uses it.  Returns true if the flow is viable; false if it is (or is
 * about to be marked) nonviable and its packets should be dropped.
 */
static bool
dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	struct ifnet *ifp = fsw->fsw_ifp;

	/*
	 * Re-validate the flow key's source address only when the
	 * interface's IP address generation count has moved since our
	 * last check, and only for flows that match on source and are
	 * not already (pending) nonviable.
	 */
	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
		/*
		 * The source address is no longer around; we want this
		 * flow to be nonviable, but that requires holding the lock
		 * as writer (which isn't the case now.)  Indicate that
		 * we need to finalize the nonviable later down below.
		 *
		 * We also request that the flow route be re-configured,
		 * if this is a connected mode flow.
		 *
		 */
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
				fsw_reap_sched(fsw);
			} else {
				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
			}
		}
		if (fr != NULL) {
			/* ask for flow route (re)configuration */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
	}

	/* if flow was (or is going to be) marked as nonviable, drop it */
	if (__improbable(fe->fe_want_nonviable ||
	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
		    SK_KVA(fe));
		return false;
	}
	return true;
}
1770 
1771 bool
dp_flow_rx_route_process(struct nx_flowswitch * fsw,struct flow_entry * fe)1772 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1773 {
1774 	bool okay;
1775 	okay = dp_flow_route_process(fsw, fe);
1776 #if (DEVELOPMENT || DEBUG)
1777 	if (okay) {
1778 		_FSW_INJECT_ERROR(5, okay, false, null_func);
1779 	}
1780 #endif /* DEVELOPMENT || DEBUG */
1781 
1782 	return okay;
1783 }
1784 
/*
 * Default Rx processing for a user flow: validate the route, snoop,
 * run flow tracking, copy packets that arrived in a foreign pool into
 * packets allocated from the destination ring's pool (attaching extra
 * buflets for multi-buflet packets), stamp flow metadata, and enqueue
 * the result on the flow's channel ring.  Packets destined to the
 * host port are diverted to fsw_host_rx() instead.
 *
 * @rx_pkts   queue of inbound packets for this flow (consumed).
 * @rx_bytes  total user-payload bytes in @rx_pkts; used to estimate
 *            how many extra buflets to batch-allocate.
 * @flags     unused here.
 */
void
dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags)
{
#pragma unused(flags)
	struct pktq dpkts;              /* dst pool alloc'ed packets */
	struct pktq disposed_pkts;         /* done src packets */
	struct pktq dropped_pkts;         /* dropped src packets */
	struct pktq transferred_pkts;         /* dst packet ready for ring */
	struct __kern_packet *pkt, *tpkt;
	struct kern_pbufpool *dpp;
	uint32_t n_pkts = KPKTQ_LEN(rx_pkts);
	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
	uint16_t buf_array_iter = 0;
	uint32_t cnt, buf_cnt = 0;
	int err;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	KPKTQ_INIT(&dpkts);
	KPKTQ_INIT(&dropped_pkts);
	KPKTQ_INIT(&disposed_pkts);
	KPKTQ_INIT(&transferred_pkts);

	/* nonviable flow: snoop (for pktap) then drop everything */
	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
		line = __LINE__;
		goto done;
	}

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/*
		 * The host ring does not exist anymore so we can't take
		 * the enqueue path below. This path should only be hit
		 * for the rare tcp fragmentation case.
		 */
		fsw_host_rx(fsw, rx_pkts);
		return;
	}

	/* find the ring */
	struct __kern_channel_ring *r;
	r = fsw_flow_get_rx_ring(fsw, fe);
	if (__improbable(r == NULL)) {
		fsw_snoop_and_dequeue(fe, &dropped_pkts, rx_pkts, true);
		reason = DROP_REASON_FSW_RX_RING_NOT_FOUND;
		line = __LINE__;
		goto done;
	}

	/* snoop before L2 is stripped */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, rx_pkts, true);
	}

	dpp = r->ckr_pp;
	/* batch allocate enough packets */
	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
	    SKMEM_NOSLEEP);
	if (__improbable(err == ENOMEM)) {
		ASSERT(KPKTQ_EMPTY(&dpkts));
		KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
		    r->ckr_name, SK_KVA(r));
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	}

	/*
	 * estimate total number of buflets for the packet chain.
	 */
	cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
	if (cnt > n_pkts) {
		/* more buflets than packets: pre-allocate the extras */
		ASSERT(dpp->pp_max_frags > 1);
		cnt -= n_pkts;
		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, false);
		if (__improbable(buf_cnt == 0)) {
			KPKTQ_CONCAT(&dropped_pkts, rx_pkts);
			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
			reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
			line = __LINE__;
			goto done;
		}
		err = 0;
	}

	/* extra processing for user flow */
	KPKTQ_FOREACH_SAFE(pkt, rx_pkts, tpkt) {
		err = 0;
		KPKTQ_REMOVE(rx_pkts, pkt);
		/* keep rx_bytes tracking the payload still left in rx_pkts */
		if (rx_bytes > pkt->pkt_flow_ulen) {
			rx_bytes -= pkt->pkt_flow_ulen;
		} else {
			rx_bytes = 0;
		}
		err = flow_pkt_track(fe, pkt, true);
		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
		if (__improbable(err != 0)) {
			SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
			/* if need to trigger RST */
			if (err == ENETRESET) {
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			dp_drop_pkt_single(fsw, pkt, 0, DROP_REASON_FSW_FLOW_TRACK_ERR,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/* transfer to dpkt */
		if (pkt->pkt_qum.qum_pp != dpp) {
			struct __kern_buflet *bprev, *bnew;
			struct __kern_packet *dpkt = NULL;
			uint32_t n_bufs, i;

			KPKTQ_DEQUEUE(&dpkts, dpkt);
			/* XXX Why would dpkt be NULL at this point? */
			if (__improbable(dpkt == NULL)) {
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PP_ALLOC_FAILED, DROPTAP_FLAG_L2_MISSING);
				continue;
			}
			/* dpkt already owns one buflet; attach the rest */
			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
			n_bufs--;
			for (i = 0; i < n_bufs; i++) {
				if (__improbable(buf_cnt == 0)) {
					/*
					 * Batch ran dry: re-estimate demand
					 * from the remaining payload and
					 * refill the buflet array.
					 */
					ASSERT(dpp->pp_max_frags > 1);
					buf_array_iter = 0;
					cnt = howmany(rx_bytes, PP_BUF_SIZE_DEF(dpp));
					n_pkts = KPKTQ_LEN(rx_pkts);
					if (cnt >= n_pkts) {
						cnt -= n_pkts;
					} else {
						cnt = 0;
					}
					cnt += (n_bufs - i);
					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
					    cnt);
					cnt = buf_cnt;
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, false);
					if (__improbable(buf_cnt == 0)) {
						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
						dp_drop_pkt_single(fsw, pkt, 0,
						    DROP_REASON_FSW_PP_ALLOC_FAILED,
						    DROPTAP_FLAG_L2_MISSING);
						pkt = NULL;
						pp_free_packet_single(dpkt);
						dpkt = NULL;
						SK_ERR("failed to alloc %d "
						    "buflets (err %d) for "
						    "kr %s, 0x%llu", cnt, err,
						    r->ckr_name, SK_KVA(r));
						break;
					}
					err = 0;
				}
				ASSERT(buf_cnt != 0);
				if (i == 0) {
					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
				}
				/*
				 * XXX -fbounds-safety: can't avoid using forge
				 * unless we change the signature of
				 * pp_alloc_buflet_batch().
				 */
				bnew = __unsafe_forge_single(kern_buflet_t,
				    buf_array[buf_array_iter]);
				buf_array[buf_array_iter] = 0;
				buf_array_iter++;
				buf_cnt--;
				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
				    bprev, bnew) == 0);
				bprev = bnew;
			}
			/* err != 0 here means the refill above failed */
			if (__improbable(err != 0)) {
				continue;
			}
			err = copy_packet_from_dev(fsw, pkt, dpkt);
			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
			if (__improbable(err != 0)) {
				SK_ERR("copy packet failed (err %d)", err);
				dp_drop_pkt_single(fsw, pkt, 0,
				    DROP_REASON_FSW_PKT_COPY_FAILED,
				    DROPTAP_FLAG_L2_MISSING);
				pp_free_packet_single(dpkt);
				dpkt = NULL;
				continue;
			}
			/* source packet is spent; continue with the copy */
			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
			pkt = dpkt;
		}
		/* stamp flow/policy metadata for the user channel */
		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
		if (pkt->pkt_bufs_cnt > 1) {
			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
			pkt->pkt_seg_cnt = 1;
		}
		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
	}
	KPKTQ_FINI(rx_pkts);

	if (KPKTQ_LEN(&transferred_pkts) > 0) {
		fsw_ring_enqueue_tail_drop(fsw, r, &transferred_pkts);
	}
	KPKTQ_FINI(&transferred_pkts);

done:
	/* Free unused buflets */
	while (buf_cnt > 0) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the signature of pp_alloc_buflet_batch().
		 */
		pp_free_buflet(dpp, __unsafe_forge_single(kern_buflet_t,
		    (kern_buflet_t)(buf_array[buf_array_iter])));
		buf_array[buf_array_iter] = 0;
		buf_array_iter++;
		buf_cnt--;
	}
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, DROPTAP_FLAG_L2_MISSING);
}
2023 
/*
 * Drain and process all packets queued on @fe's Rx queue.  Loops until
 * the queue is observed empty under fe_rx_pktq_lock; at that point the
 * worker tid is cleared and the entry is unlinked from @fes.
 */
static inline void
rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct flow_entry_list *fes)
{
	struct pktq rx_pkts;
	uint32_t rx_bytes;
	uint32_t rx_proc_flags;

	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);

	KPKTQ_INIT(&rx_pkts);
	for (;;) {
		lck_mtx_lock(&fe->fe_rx_pktq_lock);
		if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
			/* no more work: detach from the batch list */
			fe->fe_rx_worker_tid = 0;
			TAILQ_REMOVE(fes, fe, fe_rx_link);
			lck_mtx_unlock(&fe->fe_rx_pktq_lock);
			break;
		}
		/* move the whole queue local, then process unlocked */
		KPKTQ_CONCAT(&rx_pkts, &fe->fe_rx_pktq);
		KPKTQ_DISPOSE(&fe->fe_rx_pktq);
		rx_bytes = fe->fe_rx_pktq_bytes;
		rx_proc_flags = fe->fe_rx_frag_count ? FLOW_PROC_FLAG_FRAGMENTS : 0;
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		lck_mtx_unlock(&fe->fe_rx_pktq_lock);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
		    KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port);
		/* flow related processing (default, agg, fpd, etc.) */
		fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, rx_proc_flags);
	}
	ASSERT(KPKTQ_EMPTY(&rx_pkts));

	if (__improbable(fe->fe_want_withdraw)) {
		fsw_reap_sched(fsw);
	}
}
2062 
/*
 * On DEBUG/DEVELOPMENT kernels, tag @pkt as a wake packet if the
 * interface was armed to mark the next inbound packet; then report any
 * wake packet to the ports-used subsystem.
 */
static inline void
dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	/*
	 * We only care about wake packets of flows that belong the flow switch
	 * as wake packets for the host stack are handled by the host input
	 * function
	 */
#if (DEBUG || DEVELOPMENT)
	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
		/*
		 * This is a one shot command
		 */
		fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;

		pkt->pkt_pflags |= PKT_F_WAKE_PKT;
	}
#endif /* (DEBUG || DEVELOPMENT) */
	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
	}
}
2085 
/*
 * Core Rx path: demux, classify and batch a queue of inbound packets
 * to their flow entries, then process each batched flow.  Packets that
 * cannot be demuxed/classified/matched are punted to the host stack.
 * Called with the flowswitch lock held as reader.
 */
static void
_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct __kern_packet *__single pkt, *__single tpkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	sa_family_t af;
	struct pktq host_pkts, dropped_pkts;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;
	int err;
	uint64_t thread_id;

	KPKTQ_INIT(&host_pkts);
	KPKTQ_INIT(&dropped_pkts);

	/* flowswitch is quiescing: drop everything */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		reason = DROP_REASON_FSW_QUIESCED;
		line = __LINE__;
		goto done;
	}
	if (__improbable(fsw->fsw_demux == NULL)) {
		KPKTQ_CONCAT(&dropped_pkts, pktq);
		reason = DROP_REASON_FSW_DEMUX_FAILED;
		line = __LINE__;
		goto done;
	}

	thread_id = thread_tid(current_thread());
	prev_fe = NULL;
	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
		if (__probable(tpkt)) {
			/* warm up the next packet's buffer and flow structs */
			void *baddr;
			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
			SK_PREFETCH(baddr, 0);
			/* prefetch L3 and L4 flow structs */
			SK_PREFETCHW(tpkt->pkt_flow, 0);
			SK_PREFETCHW(tpkt->pkt_flow, 128);
		}

		KPKTQ_REMOVE(pktq, pkt);

		pkt = rx_prepare_packet(fsw, pkt);

		/* AF_UNSPEC means "not for us": punt to the host stack */
		af = fsw->fsw_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
		if (__improbable(err != 0)) {
			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
			KPKTQ_ENQUEUE(&host_pkts, pkt);
			continue;
		}

		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
			/* may consume the packet (queued for reassembly) */
			pkt = rx_process_ip_frag(fsw, pkt);
			if (pkt == NULL) {
				continue;
			}
		}

		/* prev_fe seeds the lookup for back-to-back same-flow hits */
		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
			continue;
		}

		dp_rx_process_wake_packet(fsw, pkt);

		rx_flow_batch_packets(&fes, fe, pkt, thread_id);
		/* NOTE(review): redundant -- prev_fe was already set above */
		prev_fe = fe;
	}

	/* process each flow that accumulated packets in this batch */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
		rx_flow_process(fsw, fe, &fes);
		flow_entry_release(&fe);
	}

	if (!KPKTQ_EMPTY(&host_pkts)) {
		fsw_host_rx(fsw, &host_pkts);
	}

done:
	dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0);
}
2178 
2179 #if (DEVELOPMENT || DEBUG)
2180 static void
fsw_rps_rx(struct nx_flowswitch * fsw,uint32_t id,struct __kern_packet * pkt)2181 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2182     struct __kern_packet *pkt)
2183 {
2184 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2185 
2186 	lck_mtx_lock_spin(&frt->frt_lock);
2187 	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2188 	lck_mtx_unlock(&frt->frt_lock);
2189 }
2190 
2191 static void
fsw_rps_thread_schedule(struct nx_flowswitch * fsw,uint32_t id)2192 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2193 {
2194 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2195 
2196 	ASSERT(frt->frt_thread != THREAD_NULL);
2197 	lck_mtx_lock_spin(&frt->frt_lock);
2198 	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2199 
2200 	frt->frt_requests++;
2201 	if (!(frt->frt_flags & FRT_RUNNING)) {
2202 		thread_wakeup((caddr_t)frt);
2203 	}
2204 	lck_mtx_unlock(&frt->frt_lock);
2205 }
2206 
2207 __attribute__((noreturn))
2208 static void
fsw_rps_thread_cont(void * v,wait_result_t w)2209 fsw_rps_thread_cont(void *v, wait_result_t w)
2210 {
2211 	struct fsw_rps_thread *__single frt = v;
2212 	struct nx_flowswitch *fsw = frt->frt_fsw;
2213 
2214 	lck_mtx_lock(&frt->frt_lock);
2215 	if (__improbable(w == THREAD_INTERRUPTIBLE ||
2216 	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
2217 		goto terminate;
2218 	}
2219 	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2220 		goto done;
2221 	}
2222 	frt->frt_flags |= FRT_RUNNING;
2223 
2224 	for (;;) {
2225 		uint32_t requests = frt->frt_requests;
2226 		struct pktq pkts;
2227 
2228 		KPKTQ_INIT(&pkts);
2229 		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2230 		lck_mtx_unlock(&frt->frt_lock);
2231 
2232 		sk_protect_t protect;
2233 		protect = sk_sync_protect();
2234 		FSW_RLOCK(fsw);
2235 		_fsw_receive_locked(fsw, &pkts);
2236 		FSW_RUNLOCK(fsw);
2237 		sk_sync_unprotect(protect);
2238 
2239 		lck_mtx_lock(&frt->frt_lock);
2240 		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2241 		    requests == frt->frt_requests) {
2242 			frt->frt_requests = 0;
2243 			break;
2244 		}
2245 	}
2246 
2247 done:
2248 	lck_mtx_unlock(&frt->frt_lock);
2249 	if (!(frt->frt_flags & FRT_TERMINATING)) {
2250 		frt->frt_flags &= ~FRT_RUNNING;
2251 		assert_wait(frt, THREAD_UNINT);
2252 		thread_block_parameter(fsw_rps_thread_cont, frt);
2253 		__builtin_unreachable();
2254 	} else {
2255 terminate:
2256 		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2257 		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2258 		frt->frt_flags |= FRT_TERMINATED;
2259 
2260 		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2261 			thread_wakeup((caddr_t)&frt);
2262 		}
2263 		lck_mtx_unlock(&frt->frt_lock);
2264 
2265 		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2266 		    frt->frt_idx);
2267 
2268 		/* for the extra refcnt from kernel_thread_start() */
2269 		thread_deallocate(current_thread());
2270 		/* this is the end */
2271 		thread_terminate(current_thread());
2272 		/* NOTREACHED */
2273 		__builtin_unreachable();
2274 	}
2275 
2276 	/* must never get here */
2277 	VERIFY(0);
2278 	/* NOTREACHED */
2279 	__builtin_unreachable();
2280 }
2281 
2282 __attribute__((noreturn))
2283 static void
fsw_rps_thread_func(void * v,wait_result_t w)2284 fsw_rps_thread_func(void *v, wait_result_t w)
2285 {
2286 #pragma unused(w)
2287 	struct fsw_rps_thread *__single frt = v;
2288 	struct nx_flowswitch *fsw = frt->frt_fsw;
2289 	const char *__null_terminated tname = NULL;
2290 
2291 	char thread_name[MAXTHREADNAMESIZE];
2292 	bzero(thread_name, sizeof(thread_name));
2293 	tname = tsnprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2294 	    if_name(fsw->fsw_ifp), frt->frt_idx);
2295 
2296 	thread_set_thread_name(frt->frt_thread, tname);
2297 	SK_D("%s spawned", tname);
2298 
2299 	net_thread_marks_push(NET_THREAD_SYNC_RX);
2300 	assert_wait(frt, THREAD_UNINT);
2301 	(void) thread_block_parameter(fsw_rps_thread_cont, frt);
2302 
2303 	__builtin_unreachable();
2304 }
2305 
/*
 * Ask RPS worker thread @i to terminate and wait until it reports
 * FRT_TERMINATED.  Waits are bounded: ~1ms for the first round, then a
 * long deadline, re-checking the flag after each wakeup/timeout.
 */
static void
fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
{
	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
	uint64_t f = (1 * NSEC_PER_MSEC);       /* first-round wait */
	uint64_t s = (1000 * NSEC_PER_SEC);     /* subsequent rounds */
	uint32_t c = 0;

	lck_mtx_lock(&frt->frt_lock);
	frt->frt_flags |= FRT_TERMINATING;

	while (!(frt->frt_flags & FRT_TERMINATED)) {
		uint64_t t = 0;
		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
		ASSERT(t != 0);

		frt->frt_flags |= FRT_TERMINATEBLOCK;
		if (!(frt->frt_flags & FRT_RUNNING)) {
			thread_wakeup_one((caddr_t)frt);
		}
		/*
		 * NOTE(review): we wait on channel &frt->frt_thread here;
		 * verify the terminating thread wakes this same channel,
		 * otherwise this loop advances only via the deadline.
		 */
		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
		lck_mtx_unlock(&frt->frt_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&frt->frt_lock);
		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
	}
	ASSERT(frt->frt_flags & FRT_TERMINATED);
	lck_mtx_unlock(&frt->frt_lock);
	frt->frt_thread = THREAD_NULL;
}
2337 
2338 static void
fsw_rps_thread_spawn(struct nx_flowswitch * fsw,uint32_t i)2339 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2340 {
2341 	kern_return_t error;
2342 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2343 
2344 	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2345 	frt->frt_idx = i;
2346 	frt->frt_fsw = fsw;
2347 	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2348 	ASSERT(!error);
2349 	KPKTQ_INIT(&frt->frt_pktq);
2350 }
2351 
/*
 * Resize the pool of RPS (receive packet steering) worker threads to
 * @n, joining excess threads or spawning new ones as needed.  Returns
 * 0 on success, EINVAL if @n exceeds FSW_RPS_MAX_NTHREADS.
 */
int
fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
{
	if (n > FSW_RPS_MAX_NTHREADS) {
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
		return EINVAL;
	}

	FSW_WLOCK(fsw);
	if (n < fsw->fsw_rps_nthreads) {
		/* shrinking: join the tail threads, then shrink the array */
		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
	} else if (n > fsw->fsw_rps_nthreads) {
		/* growing: grow the array first, then spawn the new threads */
		uint32_t nthreads_old = fsw->fsw_rps_nthreads;

		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, Z_WAITOK | Z_ZERO | Z_NOFAIL);
		fsw->fsw_rps_nthreads = n;
		for (uint32_t i = nthreads_old; i < n; i++) {
			fsw_rps_thread_spawn(fsw, i);
		}
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2381 
2382 static uint32_t
get_rps_id(struct nx_flowswitch * fsw,struct __kern_packet * pkt)2383 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2384 {
2385 	sa_family_t af = fsw->fsw_demux(fsw, pkt);
2386 	if (__improbable(af == AF_UNSPEC)) {
2387 		return 0;
2388 	}
2389 
2390 	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2391 
2392 	if (__improbable((pkt->pkt_qum_qflags &
2393 	    QUM_F_FLOW_CLASSIFIED) == 0)) {
2394 		return 0;
2395 	}
2396 
2397 	struct flow_key key;
2398 	flow_pkt2key(pkt, true, &key);
2399 	key.fk_mask = FKMASK_5TUPLE;
2400 
2401 	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2402 
2403 	return id;
2404 }
2405 
#endif /* DEVELOPMENT || DEBUG */
2407 
/*
 * Inbound entry point into the flowswitch.  On DEVELOPMENT/DEBUG
 * kernels with RPS enabled, packets are hashed and fanned out to
 * per-flow worker threads; otherwise they are processed inline under
 * the reader lock.
 */
void
fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	FSW_RLOCK(fsw);
#if (DEVELOPMENT || DEBUG)
	if (fsw->fsw_rps_nthreads != 0) {
		struct __kern_packet *pkt, *tpkt;
		bitmap_t map = 0;

		_CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
			/* steer each packet to a worker by 5-tuple hash */
			uint32_t id = get_rps_id(fsw, pkt);
			KPKTQ_REMOVE(pktq, pkt);
			fsw_rps_rx(fsw, id, pkt);
			bitmap_set(&map, id);
		}
		/* wake only the workers that actually received packets */
		for (int i = bitmap_first(&map, 64); i >= 0;
		    i = bitmap_next(&map, i)) {
			fsw_rps_thread_schedule(fsw, i);
		}
	} else
#endif /* DEVELOPMENT || DEBUG */
	{
		_fsw_receive_locked(fsw, pktq);
	}
	FSW_RUNLOCK(fsw);
}
2435 
2436 int
fsw_dev_input_netem_dequeue(void * handle,pktsched_pkt_t * __counted_by (n_pkts)pkts,uint32_t n_pkts)2437 fsw_dev_input_netem_dequeue(void *handle,
2438     pktsched_pkt_t *__counted_by(n_pkts)pkts, uint32_t n_pkts)
2439 {
2440 #pragma unused(handle)
2441 	struct nx_flowswitch *__single fsw = handle;
2442 	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2443 	struct pktq pktq;
2444 	sk_protect_t protect;
2445 	uint32_t i;
2446 
2447 	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2448 
2449 	for (i = 0; i < n_pkts; i++) {
2450 		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2451 		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2452 		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2453 	}
2454 
2455 	protect = sk_sync_protect();
2456 	KPKTQ_INIT(&pktq);
2457 	pkts_to_pktq(kpkts, n_pkts, &pktq);
2458 
2459 	fsw_receive(fsw, &pktq);
2460 	KPKTQ_FINI(&pktq);
2461 	sk_sync_unprotect(protect);
2462 
2463 	return 0;
2464 }
2465 
2466 static void
fsw_dev_input_netem_enqueue(struct nx_flowswitch * fsw,struct pktq * q)2467 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2468 {
2469 	classq_pkt_t p;
2470 	struct netem *__single ne;
2471 	struct __kern_packet *pkt, *tpkt;
2472 
2473 	ASSERT(fsw->fsw_ifp != NULL);
2474 	ne = fsw->fsw_ifp->if_input_netem;
2475 	ASSERT(ne != NULL);
2476 	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2477 		bool pdrop;
2478 		KPKTQ_REMOVE(q, pkt);
2479 		CLASSQ_PKT_INIT_PACKET(&p, pkt);
2480 		netem_enqueue(ne, &p, &pdrop);
2481 	}
2482 }
2483 
2484 void
fsw_devna_rx(struct nexus_adapter * devna,struct __kern_packet * pkt_head,struct nexus_pkt_stats * out_stats)2485 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2486     struct nexus_pkt_stats *out_stats)
2487 {
2488 	struct __kern_packet *pkt = pkt_head, *next;
2489 	struct nx_flowswitch *fsw;
2490 	uint32_t n_bytes = 0, n_pkts = 0;
2491 	uint64_t total_pkts = 0, total_bytes = 0;
2492 	struct pktq q;
2493 
2494 	KPKTQ_INIT(&q);
2495 	if (__improbable(devna->na_ifp == NULL ||
2496 	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2497 		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2498 		dp_drop_pkt_chain(pkt_head, 0, DROP_REASON_FSW_QUIESCED, DROPTAP_FLAG_L2_MISSING);
2499 		return;
2500 	}
2501 	while (pkt != NULL) {
2502 		if (__improbable(pkt->pkt_trace_id != 0)) {
2503 			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2504 			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2505 		}
2506 		next = pkt->pkt_nextpkt;
2507 		pkt->pkt_nextpkt = NULL;
2508 
2509 		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2510 			KPKTQ_ENQUEUE(&q, pkt);
2511 			n_bytes += pkt->pkt_length;
2512 		} else {
2513 			DTRACE_SKYWALK1(non__finalized__drop,
2514 			    struct __kern_packet *, pkt);
2515 			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2516 			dp_drop_pkt_single(fsw, pkt, 0,
2517 			    DROP_REASON_FSW_RX_PKT_NOT_FINALIZED,
2518 			    DROPTAP_FLAG_L2_MISSING);
2519 			pkt = NULL;
2520 		}
2521 		n_pkts = KPKTQ_LEN(&q);
2522 		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2523 			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2524 				fsw_dev_input_netem_enqueue(fsw, &q);
2525 			} else {
2526 				fsw_receive(fsw, &q);
2527 			}
2528 			total_pkts += n_pkts;
2529 			total_bytes += n_bytes;
2530 			n_pkts = 0;
2531 			n_bytes = 0;
2532 			KPKTQ_FINI(&q);
2533 		}
2534 		pkt = next;
2535 	}
2536 	ASSERT(KPKTQ_LEN(&q) == 0);
2537 	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2538 	if (out_stats != NULL) {
2539 		out_stats->nps_pkts += total_pkts;
2540 		out_stats->nps_bytes += total_bytes;
2541 	}
2542 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2543 }
2544 
2545 static int
dp_copy_to_dev_mbuf(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2546 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2547     struct __kern_packet *dpkt)
2548 {
2549 	struct mbuf *__single m = NULL;
2550 	uint32_t bdlen, bdlim, bdoff;
2551 	uint8_t *bdaddr;
2552 	unsigned int one = 1;
2553 	int err = 0;
2554 
2555 	err = mbuf_allocpacket(MBUF_DONTWAIT,
2556 	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2557 #if (DEVELOPMENT || DEBUG)
2558 	if (m != NULL) {
2559 		_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2560 	}
2561 #endif /* DEVELOPMENT || DEBUG */
2562 	if (__improbable(m == NULL)) {
2563 		FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2564 		err = ENOBUFS;
2565 		goto done;
2566 	}
2567 
2568 	MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2569 	if (fsw->fsw_frame_headroom > bdlim) {
2570 		SK_ERR("not enough space in buffer for headroom");
2571 		err = EINVAL;
2572 		goto done;
2573 	}
2574 
2575 	dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2576 	dpkt->pkt_mbuf = m;
2577 	dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2578 
2579 	/* packet copy into mbuf */
2580 	fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2581 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2582 	    fsw->fsw_frame_headroom, spkt->pkt_length,
2583 	    PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2584 	    spkt->pkt_csum_tx_start_off);
2585 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2586 
2587 	/* header copy into dpkt buffer for classification */
2588 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2589 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2590 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2591 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2592 	uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2593 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2594 	    sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2595 	if (copy_len < spkt->pkt_length) {
2596 		dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2597 	}
2598 
2599 	/*
2600 	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as
2601 	 * buflet baddr m_data always points to the beginning of packet and
2602 	 * should represents the same as baddr + headroom
2603 	 */
2604 	ASSERT((uintptr_t)m->m_data ==
2605 	    ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2606 
2607 done:
2608 	return err;
2609 }
2610 
2611 static int
dp_copy_to_dev_pkt(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2612 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2613     struct __kern_packet *dpkt)
2614 {
2615 	struct ifnet *ifp = fsw->fsw_ifp;
2616 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2617 
2618 	if (headroom > UINT8_MAX) {
2619 		SK_ERR("headroom too large %d", headroom);
2620 		return ERANGE;
2621 	}
2622 	dpkt->pkt_headroom = (uint8_t)headroom;
2623 	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2624 	dpkt->pkt_l2_len = 0;
2625 	dpkt->pkt_link_flags = spkt->pkt_link_flags;
2626 
2627 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2628 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2629 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2630 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2631 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2632 	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2633 	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2634 	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2635 	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2636 	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2637 
2638 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2639 
2640 	return 0;
2641 }
2642 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
    struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
{
	struct proc *p = current_proc();
	struct ifnet *ifp = fsw->fsw_ifp;
	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);

	if (error == ERANGE) {
		/* packet (plus headroom) exceeded the device pool's max size */
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
	} else if (error == ENOBUFS) {
		SK_DF(logflags, "%s(%d) packet allocation failure",
		    sk_proc_name_address(p), sk_proc_pid(p));
	} else if (error == 0) {
		ASSERT(dpkt != NULL);
		char *daddr;
		uint32_t pkt_len;

		MD_BUFLET_ADDR_ABS(dpkt, daddr);
		pkt_len = __packet_get_real_data_length(dpkt);
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
		    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
		    (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom);
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, pkt_len, 128, NULL, 0));
	} else {
		/*
		 * Fix: the "%s(%d) error %d" format consumes three
		 * arguments, but only `error' was being supplied, so the
		 * conversion read garbage variadic arguments (undefined
		 * behavior) on SK_LOG builds.  Supply proc name and pid
		 * to match the other branches.
		 */
		SK_DF(logflags, "%s(%d) error %d",
		    sk_proc_name_address(p), sk_proc_pid(p), error);
	}
}
#else
#define dp_copy_to_dev_log(...)
#endif /* SK_LOG */
2683 
/*
 * Copy packet metadata (quantum, packet and TX-port fields) plus the
 * AQM/flow-advisory related fields from spkt to dpkt.  Payload is NOT
 * copied here; callers follow up with a payload copy appropriate to the
 * destination (mbuf or packet).
 */
static void
fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
{
	/* source must not have an attached mbuf or companion packet */
	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));

	/* warm up the destination buffer before the copies below */
	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	/* Copy AQM metadata */
	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
	_CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
	dpkt->pkt_policy_id = spkt->pkt_policy_id;
	dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
}
2708 
/*
 * Copy a user/source packet into a device-pool packet, dispatching on the
 * classq enqueue type of the underlying interface: mbuf copy for compat
 * interfaces, packet-to-packet copy for native interfaces.
 *
 * Returns 0 on success; ERANGE if the packet (plus headroom) exceeds the
 * device pool's maximum buffer capacity, or whatever the per-type copy
 * routine returns.  The outcome is logged via dp_copy_to_dev_log().
 */
static int
dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
	struct ifnet *ifp = fsw->fsw_ifp;
	uint32_t dev_pkt_len;
	int err = 0;

	fsw_pkt_copy_metadata(spkt, dpkt);
	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF:
		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
		break;

	case QP_PACKET:
		/* reject packets that can't fit in the device pool buffers */
		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
		    spkt->pkt_length;
		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
			err = ERANGE;
			goto done;
		}
		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
		break;

	default:
		VERIFY(0);
		__builtin_unreachable();
	}
done:
	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
	return err;
}
2743 
2744 static int
dp_copy_headers_to_dev(struct nx_flowswitch * fsw,struct __kern_packet * spkt,struct __kern_packet * dpkt)2745 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2746     struct __kern_packet *dpkt)
2747 {
2748 	uint8_t *sbaddr, *dbaddr;
2749 	uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2750 	uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2751 
2752 	fsw_pkt_copy_metadata(spkt, dpkt);
2753 
2754 	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2755 	ASSERT(sbaddr != NULL);
2756 	sbaddr += spkt->pkt_headroom;
2757 
2758 	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2759 	ASSERT(dbaddr != NULL);
2760 	dpkt->pkt_headroom = (uint8_t)headroom;
2761 	dbaddr += headroom;
2762 
2763 	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2764 	METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2765 
2766 	/* packet length is set to the full length */
2767 	dpkt->pkt_length = spkt->pkt_length;
2768 	dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2769 	return 0;
2770 }
2771 
/*
 * Detach and return the mbuf backing a compat-interface packet, migrating
 * flow/AQM metadata from the packet into the mbuf pkthdr.  The packet is
 * freed before returning; ownership of the mbuf passes to the caller.
 */
static struct mbuf *
convert_pkt_to_mbuf(struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	ASSERT(pkt->pkt_mbuf != NULL);
	struct mbuf *m = pkt->pkt_mbuf;

	/* pass additional metadata generated from flow parse/lookup */
	_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
	    sizeof(pkt->pkt_flow_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
	    sizeof(pkt->pkt_flowsrc_token));
	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
	    sizeof(pkt->pkt_flowsrc_fidx));
	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;

	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
	}

	/* The packet should have a timestamp by the time we get here. */
	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
	/* timestamp is carried in pkt_timestamp; the VALID flag is per-layer */
	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;

	/* replace the common flag bits with the packet's */
	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
	/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
	m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;

	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
	}
	/* sever the pkt -> mbuf linkage before freeing the packet */
	KPKT_CLEAR_MBUF_DATA(pkt);

	/* mbuf has been consumed, release packet as well */
	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
	pp_free_packet_single(pkt);
	return m;
}
2817 
2818 static void
convert_pkt_to_mbuf_list(struct __kern_packet * pkt_list,struct mbuf ** head,struct mbuf ** tail,uint32_t * cnt,uint32_t * bytes)2819 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2820     struct mbuf **head, struct mbuf **tail,
2821     uint32_t *cnt, uint32_t *bytes)
2822 {
2823 	struct __kern_packet *pkt = pkt_list, *next;
2824 	struct mbuf *__single m_head = NULL, **__single m_tailp = &m_head;
2825 	struct mbuf *__single m = NULL;
2826 	uint32_t c = 0, b = 0;
2827 
2828 	while (pkt != NULL) {
2829 		next = pkt->pkt_nextpkt;
2830 		pkt->pkt_nextpkt = NULL;
2831 		m = convert_pkt_to_mbuf(pkt);
2832 		ASSERT(m != NULL);
2833 
2834 		*m_tailp = m;
2835 		m_tailp = &m->m_nextpkt;
2836 		c++;
2837 		b += m_pktlen(m);
2838 		pkt = next;
2839 	}
2840 	if (head != NULL) {
2841 		*head = m_head;
2842 	}
2843 	if (tail != NULL) {
2844 		*tail = m;
2845 	}
2846 	if (cnt != NULL) {
2847 		*cnt = c;
2848 	}
2849 	if (bytes != NULL) {
2850 		*bytes = b;
2851 	}
2852 }
2853 
/*
 * Enqueue a single packet into the interface classq (AQM), converting it
 * to an mbuf first for compat interfaces.  The packet is always consumed,
 * either by the enqueue or by being dropped (accounted in stats).
 * Returns the ifnet_enqueue_* error code.
 */
SK_NO_INLINE_ATTRIBUTE
static int
classq_enqueue_flow_single(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt->pkt_flow_token != 0);
	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
	    1, pkt->pkt_length);

	/* close out the flowswitch TX trace span, open the AQM one */
	if (__improbable(pkt->pkt_trace_id != 0)) {
		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
	}

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *m;

		m = convert_pkt_to_mbuf(pkt);
		ASSERT(m != NULL);
		pkt = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
		m = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
		pkt = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2920 
/*
 * Enqueue a whole chain of packets belonging to one flow into the
 * interface classq in a single call, converting to an mbuf chain first
 * for compat interfaces.  cnt/bytes must describe the chain exactly.
 * The chain is always consumed.  Returns the ifnet_enqueue_* error code.
 */
static int
classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
    struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	boolean_t pkt_drop = FALSE;
	uint32_t svc;
	int err;

	FSW_LOCK_ASSERT_HELD(fsw);
	ASSERT(fsw->fsw_classq_enabled);
	ASSERT(pkt_head->pkt_flow_token != 0);

	/*
	 * All packets in the flow should have the same svc.
	 */
	svc = pkt_head->pkt_svc_class;
	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);

	switch (fsw->fsw_classq_enq_ptype) {
	case QP_MBUF: {                         /* compat interface */
		struct mbuf *__single m_head = NULL, *__single m_tail = NULL;
		uint32_t c = 0, b = 0;

		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
		ASSERT(m_head != NULL && m_tail != NULL);
		/* conversion must not change the caller's accounting */
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		pkt_head = NULL;

		/* ifnet_enqueue consumes mbuf */
		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		m_head = NULL;
		m_tail = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			/* the whole chain is dropped as a unit */
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	case QP_PACKET: {                       /* native interface */
		/* ifnet_enqueue consumes packet */
		err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
		    bytes, FALSE, &pkt_drop);
		pkt_head = NULL;
#if (DEVELOPMENT || DEBUG)
		if (__improbable(!pkt_drop)) {
			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
		}
#endif /* DEVELOPMENT || DEBUG */
		if (pkt_drop) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    cnt);
		}
		break;
	}
	default:
		err = EINVAL;
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return err;
}
2995 
/*
 * This code path needs to be kept for interfaces without logical link support.
 *
 * Drain fe's TX queue into the interface classq, either as one chain
 * (cnt/bytes must match the queue) or packet-by-packet.  If AQM reports
 * flow control (EQFULL/EQSUSPENDED) and the flow participates in flow
 * advisory, set the advisory entry and notify the channel's TX ring.
 */
static void
classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
    bool chain, uint32_t cnt, uint32_t bytes)
{
	bool flowadv_is_set = false;
	struct __kern_packet *pkt, *tail, *tpkt;
	flowadv_idx_t flow_adv_idx;
	bool flowadv_cap;
	flowadv_token_t flow_adv_token;
	int err;

	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));

	if (chain) {
		pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
		tail = KPKTQ_LAST(&fe->fe_tx_pktq);
		KPKTQ_INIT(&fe->fe_tx_pktq);
		if (pkt == NULL) {
			return;
		}
		/*
		 * Snapshot advisory state from the head packet before the
		 * enqueue consumes the chain.
		 */
		flow_adv_idx = pkt->pkt_flowsrc_fidx;
		flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
		flow_adv_token = pkt->pkt_flow_token;

		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);

		/* set flow advisory if needed */
		if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
		    flowadv_cap)) {
			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
			    flow_adv_idx, flow_adv_token);
		}
		DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes,
		    bool, flowadv_is_set);
	} else {
		uint32_t c = 0, b = 0;

		KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);

			/* snapshot before the enqueue consumes the packet */
			flow_adv_idx = pkt->pkt_flowsrc_fidx;
			flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
			flow_adv_token = pkt->pkt_flow_token;

			c++;
			b += pkt->pkt_length;
			err = classq_enqueue_flow_single(fsw, pkt);

			/* set flow advisory if needed (at most once) */
			if (__improbable(!flowadv_is_set &&
			    ((err == EQFULL || err == EQSUSPENDED) &&
			    flowadv_cap))) {
				flowadv_is_set = na_flowadv_set(
					flow_get_na(fsw, fe), flow_adv_idx,
					flow_adv_token);
			}
		}
		ASSERT(c == cnt);
		ASSERT(b == bytes);
		DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
		    bool, flowadv_is_set);
	}

	/* notify flow advisory event */
	if (__improbable(flowadv_is_set)) {
		struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
		if (__probable(r)) {
			na_flowadv_event(r);
			SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
			    "%s(%d) notified of flow update",
			    sk_proc_name_address(current_proc()),
			    sk_proc_pid(current_proc()));
		}
	}
}
3075 
/*
 * Logical link code path
 *
 * Drain fe's TX queue into the flow's netif queue set.  Always enqueues
 * as one chain (the `chain' parameter is unused).  On enqueue failure,
 * raise the flow advisory (and notify the TX ring) if the qset reported
 * flow control, and account any dropped packets.
 */
static void
classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
    bool chain, uint32_t cnt, uint32_t bytes)
{
#pragma unused(chain)
	struct __kern_packet *pkt, *tail;
	flowadv_idx_t flow_adv_idx;
	bool flowadv_is_set = false;
	bool flowadv_cap;
	flowadv_token_t flow_adv_token;
	uint32_t flowctl = 0, dropped = 0;
	int err;

	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));

	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
	KPKTQ_INIT(&fe->fe_tx_pktq);
	if (pkt == NULL) {
		return;
	}
	/* snapshot advisory state before the enqueue consumes the chain */
	flow_adv_idx = pkt->pkt_flowsrc_fidx;
	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
	flow_adv_token = pkt->pkt_flow_token;

	err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
	    &flowctl, &dropped);

	if (__improbable(err != 0)) {
		/* set flow advisory if needed */
		if (flowctl > 0 && flowadv_cap) {
			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
			    flow_adv_idx, flow_adv_token);

			/* notify flow advisory event */
			if (flowadv_is_set) {
				struct __kern_channel_ring *r =
				    fsw_flow_get_tx_ring(fsw, fe);
				if (__probable(r)) {
					na_flowadv_event(r);
					SK_DF(SK_VERB_FLOW_ADVISORY |
					    SK_VERB_TX,
					    "%s(%d) notified of flow update",
					    sk_proc_name_address(current_proc()),
					    sk_proc_pid(current_proc()));
				}
			}
		}
		if (dropped > 0) {
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
			    dropped);
		}
	}
}
3135 
3136 static void
tx_finalize_packet(struct nx_flowswitch * fsw,struct __kern_packet * pkt)3137 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
3138 {
3139 #pragma unused(fsw)
3140 	/* finalize here; no more changes to buflets after classq */
3141 	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
3142 		kern_packet_t ph = SK_PTR_ENCODE(pkt,
3143 		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
3144 		int err = __packet_finalize(ph);
3145 		VERIFY(err == 0);
3146 	}
3147 }
3148 
/*
 * Validate the flow's route and resolve/frame the packets queued on
 * fe->fe_tx_pktq.  Packets that fail resolution are removed from the
 * queue and dropped (or handed to the resolver's llinfo queue).
 * Returns false if the route itself is not usable, true otherwise.
 */
static bool
dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct flow_route *fr = fe->fe_route;
	int err;

	ASSERT(fr != NULL);

	if (__improbable(!dp_flow_route_process(fsw, fe))) {
		return false;
	}
	/* re-evaluate the qset choice if the flow uses dynamic selection */
	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
		flow_qset_select_dynamic(fsw, fe, TRUE);
	}

	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
	    _fsw_error35_handler, 1, fr, NULL, NULL);
	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
	    _fsw_error36_handler, 1, fr, NULL);

	/*
	 * See if we need to resolve the flow route; note the test against
	 * fr_flags here is done without any lock for performance.  Thus
	 * it's possible that we race against the thread performing route
	 * event updates for a packet (which is OK).  In any case we should
	 * not have any assertion on fr_flags value(s) due to the lack of
	 * serialization.
	 */
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		goto frame;
	}

	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		err = fsw->fsw_resolve(fsw, fr, pkt);
		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
		/*
		 * If resolver returns EJUSTRETURN then we drop the pkt as the
		 * resolver should have converted the pkt into mbuf (or
		 * detached the attached mbuf from pkt) and added it to the
		 * llinfo queue. If we do have a cached llinfo, then proceed
		 * to using it even though it may be stale (very unlikely)
		 * while the resolution is in progress.
		 * Otherwise, any other error results in dropping pkt.
		 */
		if (err == EJUSTRETURN) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			pp_free_packet_single(pkt);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
			continue;
		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
			/* use existing llinfo */
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
		} else if (err != 0) {
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_RESOLV_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
			continue;
		}
	}

frame:
	/* apply L2 framing to whatever survived resolution */
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		if (fsw->fsw_frame != NULL) {
			fsw->fsw_frame(fsw, fr, pkt);
		}
	}

	return true;
}
3221 
/*
 * TX path for listener flows: a listener may only transmit TCP RST.
 * RSTs are fed to flow tracking (abort accounting); anything else is
 * logged and discarded.  All packets are freed here.
 */
static void
dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
#pragma unused(fsw)
	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
		/* listener is only allowed TCP RST */
		if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
		    (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
			flow_track_abort_tcp(fe, NULL, pkt);
		} else {
			char *addr;

			MD_BUFLET_ADDR_ABS(pkt, addr);
			SK_ERR("listener flow sends non-RST packet %s",
			    sk_dump(sk_proc_name_address(current_proc()),
			    addr, __packet_get_real_data_length(pkt), 128, NULL, 0));
		}
		pp_free_packet_single(pkt);
	}
}
3244 
/*
 * Stamp an outgoing packet with the current uptime (if it doesn't already
 * carry a valid timestamp) and record recent foreground/realtime send
 * activity on both the interface and, when provided, the nexus-wide
 * advisory (fg_ts/rt_ts point into shared advisory memory).
 */
static void
fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
    volatile uint64_t *rt_ts, ifnet_t ifp)
{
	struct timespec now;
	uint64_t now_nsec = 0;

	if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		pkt->pkt_timestamp = now_nsec;
	}
	/* the VALID flag is per-layer; consume it here */
	pkt->pkt_pflags &= ~PKT_F_TS_VALID;

	/*
	 * If the packet service class is not background,
	 * update the timestamps on the interface, as well as
	 * the ones in nexus-wide advisory to indicate recent
	 * activity on a foreground flow.
	 */
	if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
		ifp->if_fg_sendts = (uint32_t)_net_uptime;
		if (fg_ts != NULL) {
			*fg_ts = _net_uptime;
		}
	}
	if (pkt->pkt_pflags & PKT_F_REALTIME) {
		ifp->if_rt_sendts = (uint32_t)_net_uptime;
		if (rt_ts != NULL) {
			*rt_ts = _net_uptime;
		}
	}
}
3278 
3279 static bool
fsw_chain_enqueue_enabled(struct nx_flowswitch * fsw,bool gso_enabled)3280 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled)
3281 {
3282 	return fsw_chain_enqueue != 0 &&
3283 	       fsw->fsw_ifp->if_output_netem == NULL &&
3284 	       (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
3285 	       gso_enabled;
3286 }
3287 
/*
 * Default per-flow TX processing: validate the route, run flow tracking,
 * stamp each packet with flow/AQM/policy metadata, finalize it, then hand
 * the queue to either the logical-link qset path or the classic classq
 * path.  Packets that fail along the way are dropped with an appropriate
 * drop reason.  fe->fe_tx_pktq must be non-empty on entry.
 */
void
dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	struct pktq dropped_pkts;
	bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
	uint32_t cnt = 0, bytes = 0;
	volatile struct sk_nexusadv *nxadv = NULL;
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	KPKTQ_INIT(&dropped_pkts);
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	/* listener flows take a restricted path (TCP RST only) */
	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
		dp_listener_flow_tx_process(fsw, fe);
		return;
	}
	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
		SK_RDERR(5, "Tx route bad");
		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
		    KPKTQ_LEN(&fe->fe_tx_pktq));
		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
		reason = DROP_REASON_FSW_FLOW_NONVIABLE;
		line = __LINE__;
		goto done;
	}
	chain = fsw_chain_enqueue_enabled(fsw, gso);
	if (chain) {
		/* advisory timestamps are only maintained on the chain path */
		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
		if (nxadv != NULL) {
			fg_ts = &nxadv->nxadv_fg_sendts;
			rt_ts = &nxadv->nxadv_rt_sendts;
		}
	}
	struct __kern_packet *pkt, *tpkt;
	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
		int err = 0;

		err = flow_pkt_track(fe, pkt, false);
		if (__improbable(err != 0)) {
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_FLOW_TRACK_ERR,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}
		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
		pkt->pkt_transport_protocol = fe->fe_transport_protocol;

		/* set AQM related values for outgoing packet */
		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
		} else {
			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
		}
		/* identify the flow by token rather than UUID from here on */
		_UUID_CLEAR(pkt->pkt_flow_id);
		pkt->pkt_flow_token = fe->fe_flowid;
		pkt->pkt_pflags |= PKT_F_FLOW_ID;
		pkt->pkt_qset_idx = qset_idx;
		pkt->pkt_policy_id = fe->fe_policy_id;
		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;

		/*
		 * The same code is exercised per packet for the non-chain case
		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
		 * re-walking the chain later.
		 */
		if (chain) {
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
		}
		/* mark packet tos/svc_class */
		fsw_qos_mark(fsw, fe, pkt);

		tx_finalize_packet(fsw, pkt);
		bytes += pkt->pkt_length;
		cnt++;
	}

	/* snoop after it's finalized */
	if (__improbable(pktap_total_tap_count != 0)) {
		fsw_snoop(fsw, fe, &fe->fe_tx_pktq, false);
	}
	if (fe->fe_qset != NULL) {
		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
	} else {
		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
	}
done:
	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, 0);
}
3384 
3385 static struct flow_entry *
tx_process_continuous_ip_frag(struct nx_flowswitch * fsw,struct flow_entry * prev_fe,struct __kern_packet * pkt)3386 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3387     struct flow_entry *prev_fe, struct __kern_packet *pkt)
3388 {
3389 	ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3390 
3391 	if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3392 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3393 		SK_ERR("%s(%d) invalid zero fragment id",
3394 		    sk_proc_name_address(current_proc()),
3395 		    sk_proc_pid(current_proc()));
3396 		return NULL;
3397 	}
3398 
3399 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
3400 	    "%s(%d) continuation frag, id %u",
3401 	    sk_proc_name_address(current_proc()),
3402 	    sk_proc_pid(current_proc()),
3403 	    pkt->pkt_flow_ip_frag_id);
3404 	if (__improbable(prev_fe == NULL ||
3405 	    !prev_fe->fe_tx_is_cont_frag)) {
3406 		SK_ERR("%s(%d) unexpected continuation frag",
3407 		    sk_proc_name_address(current_proc()),
3408 		    sk_proc_pid(current_proc()),
3409 		    pkt->pkt_flow_ip_frag_id);
3410 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3411 		return NULL;
3412 	}
3413 	if (__improbable(pkt->pkt_flow_ip_frag_id !=
3414 	    prev_fe->fe_tx_frag_id)) {
3415 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3416 		SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
3417 		    sk_proc_name_address(current_proc()),
3418 		    sk_proc_pid(current_proc()),
3419 		    pkt->pkt_flow_ip_frag_id,
3420 		    prev_fe->fe_tx_frag_id);
3421 		return NULL;
3422 	}
3423 
3424 	return prev_fe;
3425 }
3426 
/*
 * Look up the flow entry for an outgoing packet, reusing prev_fe as a
 * hint.  Returns a referenced flow entry, or NULL if no usable entry is
 * found (torn down, or packet/entry UUID mismatch); the release paths
 * below NULL out `fe' via flow_entry_release() before return.
 */
static struct flow_entry *
tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *__single fe;

	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
	if (__improbable(fe == NULL)) {
		goto done;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		SK_RDERR(5, "Tx flow torn down");
		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
		flow_entry_release(&fe);
		goto done;
	}

	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
	    null_func);

	/* the packet's flow UUID must match the entry it hashed to */
	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
		uuid_string_t flow_id_str, pkt_id_str;
		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
		SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
		flow_entry_release(&fe);
		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
	}

done:
	return fe;
}
3460 
/*
 * Run a flow's TX processing callback (default, aggregate, etc.) on its
 * queued packets, then tear down the (now-consumed) queue.
 */
static inline void
tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
    uint32_t flags)
{
	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);

	/* flow related processing (default, agg, etc.) */
	fe->fe_tx_process(fsw, fe, flags);

	KPKTQ_FINI(&fe->fe_tx_pktq);
}
3476 
#if SK_LOG
/*
 * Debug helper: hex-dump a packet's buffer with a caller-supplied
 * description under the given log verbosity.  Compiled out entirely
 * on non-SK_LOG builds.
 */
static void
dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
{
	char *pkt_buf;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	pkt_len = __packet_get_real_data_length(pkt);
	SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len,
	    128, NULL, 0));
}
#else /* !SK_LOG */
#define dp_tx_log_pkt(...)
#endif /* !SK_LOG */
3493 
3494 static inline struct ifnet *
fsw_datamov_begin(struct nx_flowswitch * fsw)3495 fsw_datamov_begin(struct nx_flowswitch *fsw)
3496 {
3497 	struct ifnet *ifp;
3498 
3499 	ifp = fsw->fsw_ifp;
3500 	if (!ifnet_datamov_begin(ifp)) {
3501 		DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3502 		return NULL;
3503 	}
3504 	return ifp;
3505 }
3506 
3507 static inline void
fsw_datamov_end(struct nx_flowswitch * fsw)3508 fsw_datamov_end(struct nx_flowswitch *fsw)
3509 {
3510 	ifnet_datamov_end(fsw->fsw_ifp);
3511 }
3512 
/*
 * TX datapath for non-GSO packets coming from a user channel ring.
 *
 * For each source packet in spktq: allocate a packet from the device
 * pool, copy the payload into it, demux/classify it to a flow entry,
 * and batch it on that flow's TX queue.  Batched flows are then
 * processed and handed to the driver via netif_transmit().
 *
 * Error-path packets are accumulated on dropped_pkts (for whole-queue
 * failures) or dropped individually; on the whole-queue paths the
 * contents of spktq are moved (KPKTQ_CONCAT) and consumed here.
 */
static void
dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
{
	struct __kern_packet *spkt, *pkt;
	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
	struct flow_entry *__single fe, *__single prev_fe;
	struct pktq dropped_pkts, dpktq;
	struct nexus_adapter *dev_na;
	struct kern_pbufpool *dev_pp;
	struct ifnet *ifp = NULL;
	sa_family_t af;
	uint32_t n_pkts, n_flows = 0;
	boolean_t do_pacing = FALSE;
	/* reason/line describe the first whole-queue drop cause, if any */
	drop_reason_t reason = DROP_REASON_UNSPECIFIED;
	uint16_t line = 0;

	int err;
	KPKTQ_INIT(&dpktq);
	KPKTQ_INIT(&dropped_pkts);
	n_pkts = KPKTQ_LEN(spktq);

	FSW_RLOCK(fsw);
	/* Drop everything if the flowswitch is quiescing/detached. */
	if (__improbable(FSW_QUIESCED(fsw))) {
		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_QUIESCED;
		line = __LINE__;
		goto done;
	}
	dev_na = fsw->fsw_dev_ch->ch_na;
	if (__improbable(dev_na == NULL)) {
		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}
	/* Hold a datamov reference on the ifnet for the duration. */
	ifp = fsw_datamov_begin(fsw);
	if (ifp == NULL) {
		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		reason = DROP_REASON_FSW_IFNET_NOT_ATTACHED;
		line = __LINE__;
		goto done;
	}

	/* batch allocate enough packets */
	dev_pp = na_kr_get_pp(dev_na, NR_TX);

	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
	    NULL, SKMEM_NOSLEEP);
#if DEVELOPMENT || DEBUG
	if (__probable(err != ENOMEM)) {
		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
	}
#endif /* DEVELOPMENT || DEBUG */
	if (__improbable(err == ENOMEM)) {
		/* total allocation failure: drop the whole source queue */
		ASSERT(KPKTQ_EMPTY(&dpktq));
		KPKTQ_CONCAT(&dropped_pkts, spktq);
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
		reason = DROP_REASON_FSW_PP_ALLOC_FAILED;
		line = __LINE__;
		goto done;
	} else if (__improbable(err == EAGAIN)) {
		/*
		 * Partial allocation: account the shortfall as drops; the
		 * loop below only consumes as many source packets as we
		 * have device packets for.
		 */
		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
		FSW_STATS_ADD(FSW_STATS_DROP,
		    (n_pkts - KPKTQ_LEN(&dpktq)));
	}

	n_pkts = KPKTQ_LEN(&dpktq);
	/* prev_fe caches the last flow hit to speed up lookups/frag runs */
	prev_fe = NULL;
	KPKTQ_FOREACH(spkt, spktq) {
		if (n_pkts == 0) {
			break;
		}
		--n_pkts;

		KPKTQ_DEQUEUE(&dpktq, pkt);
		ASSERT(pkt != NULL);
		/* copy source payload into the device-pool packet */
		err = dp_copy_to_dev(fsw, spkt, pkt);
		if (__improbable(err != 0)) {
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/* remember if any packet requested TX pacing */
		do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0);
		af = fsw_ip_demux(fsw, pkt);
		if (__improbable(af == AF_UNSPEC)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_DEMUX_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		err = flow_pkt_classify(pkt, ifp, af, false);
		if (__improbable(err != 0)) {
			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
			    DROPTAP_FLAG_L2_MISSING);
			continue;
		}

		/*
		 * Non-first IP fragments carry no L4 header, so they must
		 * be matched against the flow of the preceding fragment.
		 */
		if (__improbable(pkt->pkt_flow_ip_is_frag &&
		    !pkt->pkt_flow_ip_is_first_frag)) {
			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
			if (__probable(fe != NULL)) {
				flow_entry_retain(fe);
				goto flow_batch;
			} else {
				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
				dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
				    DROPTAP_FLAG_L2_MISSING);
				continue;
			}
		}

		fe = tx_lookup_flow(fsw, pkt, prev_fe);
		if (__improbable(fe == NULL)) {
			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_NOT_FOUND,
			    DROPTAP_FLAG_L2_MISSING);
			prev_fe = NULL;
			continue;
		}
flow_batch:
		/* queue the packet on its flow; fe reference consumed here */
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	/* process each batched flow once, then release its list reference */
	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
		n_flows++;
	}

done:
	FSW_RUNLOCK(fsw);
	/* n_flows > 0 implies ifp was successfully acquired above */
	if (n_flows > 0) {
		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
	}
	if (ifp != NULL) {
		fsw_datamov_end(fsw);
	}
	dp_drop_pktq(fsw, &dropped_pkts, 1, reason, line, DROPTAP_FLAG_L2_MISSING);
	KPKTQ_FINI(&dropped_pkts);
	KPKTQ_FINI(&dpktq);
}
3671 
3672 static sa_family_t
get_tso_af(struct __kern_packet * pkt)3673 get_tso_af(struct __kern_packet *pkt)
3674 {
3675 	packet_tso_flags_t tso_flags;
3676 
3677 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3678 	if (tso_flags == PACKET_TSO_IPV4) {
3679 		return AF_INET;
3680 	} else if (tso_flags == PACKET_TSO_IPV6) {
3681 		return AF_INET6;
3682 	} else {
3683 		panic("invalid tso flags: 0x%x\n", tso_flags);
3684 		/* NOTREACHED */
3685 		__builtin_unreachable();
3686 	}
3687 }
3688 
3689 static inline void
update_flow_info(struct __kern_packet * pkt,void * iphdr,void * tcphdr,uint16_t payload_sz)3690 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, uint16_t payload_sz)
3691 {
3692 	struct tcphdr *__single tcp = tcphdr;
3693 
3694 	DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3695 	    void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3696 	pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3697 	pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3698 	pkt->pkt_flow_tcp_flags = tcp->th_flags;
3699 	pkt->pkt_flow_tcp_seq = tcp->th_seq;
3700 	pkt->pkt_flow_ulen = payload_sz;
3701 }
3702 
/*
 * Software GSO: segment the TCP payload of orig_pkt into MSS-sized
 * packets.  first_pkt (which already holds a copy of the IP/TCP
 * headers) becomes the first segment; subsequent segments are drawn
 * from dev_pktq.  Each segment receives copied payload, adjusted
 * IP/TCP header fields and a freshly computed TCP checksum, and is
 * appended to gso_pktq.
 *
 * Returns 0 on success, or EINVAL for an unsupported protocol or an
 * mss/mtu/buffer-size combination that cannot be segmented.
 */
static int
do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
    struct __kern_packet *first_pkt, struct pktq *dev_pktq,
    struct pktq *gso_pktq)
{
	ifnet_t ifp = fsw->fsw_ifp;
	struct __kern_packet *pkt = first_pkt;
	uint8_t proto = pkt->pkt_flow_ip_proto;
	uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
	uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
	uint16_t total_hlen = ip_hlen + tcp_hlen;
	uint16_t mtu = (uint16_t)ifp->if_mtu;
	uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
	uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
	kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
	uint8_t *orig_pkt_baddr;
	struct tcphdr *tcp;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t tcp_seq;
	uint16_t ipid;
	uint32_t pseudo_hdr_csum, bufsz;

	/* Validate that segmentation is possible with these parameters. */
	ASSERT(headroom <= UINT8_MAX);
	if (proto != IPPROTO_TCP) {
		SK_ERR("invalid proto: %d", proto);
		DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint8_t, proto);
		return EINVAL;
	}
	if (mss == 0 || mss > (mtu - total_hlen)) {
		SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
		    mss, mtu, total_hlen);
		DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
		    uint32_t, total_hlen);
		return EINVAL;
	}
	/* each segment (headroom + headers + mss) must fit in one buflet */
	bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
	if ((headroom + total_hlen + mss) > bufsz) {
		SK_ERR("invalid args: headroom %d, total_hlen %d, "
		    "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
		DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
		    fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
		    total_hlen, uint16_t, mss, uint32_t, bufsz);
		return EINVAL;
	}
	/* number of segments = ceil(payload / mss) */
	n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);

	ASSERT(pkt->pkt_headroom == headroom);
	ASSERT(pkt->pkt_length == total_len);
	ASSERT(pkt->pkt_l2_len == 0);
	ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
	ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
	/* segments are regular packets: clear TSO-related state */
	pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
	pkt->pkt_proto_seg_sz = 0;
	pkt->pkt_csum_flags = 0;
	MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
	orig_pkt_baddr += orig_pkt->pkt_headroom;

	/*
	 * Precompute per-flow constants: the starting IP id (IPv4 only)
	 * and the pseudo-header checksum used for every segment's TCP
	 * checksum.
	 */
	if (af == AF_INET) {
		/*
		 * XXX -fbounds-safety: can't avoid using forge unless we change
		 * the flow metadata definition.
		 */
		ip = __unsafe_forge_bidi_indexable(struct ip *,
		    pkt->pkt_flow_ip_hdr, pkt->pkt_length);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		ipid = ip->ip_id;
		pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
		    pkt->pkt_flow_ipv4_dst.s_addr, 0);
	} else {
		ASSERT(af == AF_INET6);
		tcp = __unsafe_forge_bidi_indexable(struct tcphdr *,
		    pkt->pkt_flow_tcp_hdr, pkt->pkt_length - ip_hlen);
		pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
		    &pkt->pkt_flow_ipv6_dst, 0);
	}
	tcp_seq = ntohl(tcp->th_seq);

	/*
	 * Per-segment loop: n counts segments (1-based), off walks the
	 * payload of the original packet in mss-sized strides.
	 */
	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
	    off += payload_sz) {
		uint8_t *baddr, *baddr0;
		uint32_t partial;

		/* first iteration uses first_pkt; later ones dequeue */
		if (pkt == NULL) {
			n++;
			KPKTQ_DEQUEUE(dev_pktq, pkt);
			ASSERT(pkt != NULL);
		}
		MD_BUFLET_ADDR_ABS(pkt, baddr0);
		baddr = baddr0;
		baddr += headroom;

		/* Copy headers from the original packet */
		if (n != 1) {
			ASSERT(pkt != first_pkt);
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
			pkt->pkt_trace_id = 0;
			ASSERT(pkt->pkt_headroom == headroom);
		} else {
			METADATA_SET_LEN(pkt, 0, 0);
		}
		baddr += total_hlen;

		/* Copy/checksum the payload from the original packet */
		if (off + payload_sz > total_len) {
			payload_sz = (uint16_t)(total_len - off);
		}
		pkt_copypkt_sum(orig_ph,
		    (uint16_t)(orig_pkt->pkt_headroom + off),
		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
		    &partial, TRUE);

		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
		    uint16_t, mss, uint32_t, partial);
		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);

		/*
		 * Adjust header information and fill in the missing fields.
		 */
		if (af == AF_INET) {
			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);

			/* FIN/PUSH belong on the last segment only */
			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			/* CWR belongs on the first segment only */
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip, tcp, payload_sz);

			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;

			/* fold TCP header + pseudo header into payload sum */
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		} else {
			ASSERT(af == AF_INET6);
			ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);

			if (n != n_pkts) {
				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
			}
			if (n != 1) {
				tcp->th_flags &= ~TH_CWR;
				tcp->th_seq = htonl(tcp_seq);
			}
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
			/* note: 32-bit htonl per the v6 pseudo-header length */
			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
			partial += pseudo_hdr_csum;
			ADDCARRY(partial);
			tcp->th_sum = ~(uint16_t)partial;
		}
		tcp_seq += payload_sz;
		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
#if (DEVELOPMENT || DEBUG)
		struct __kern_buflet *bft;
		uint32_t blen;
		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
		blen = __buflet_get_data_length(bft);
		if (blen != total_hlen + payload_sz) {
			panic("blen (%d) != total_len + payload_sz (%d)\n",
			    blen, total_hlen + payload_sz);
		}
#endif /* DEVELOPMENT || DEBUG */

		pkt->pkt_length = total_hlen + payload_sz;
		KPKTQ_ENQUEUE(gso_pktq, pkt);
		pkt = NULL;

		/*
		 * Note that at this point the packet is not yet finalized.
		 * The finalization happens in dp_flow_tx_process() after
		 * the framing is done.
		 */
	}
	ASSERT(n == n_pkts);
	ASSERT(off == total_len);
	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
	return 0;
}
3909 
3910 static void
tx_flow_enqueue_gso_pktq(struct flow_entry_list * fes,struct flow_entry * fe,struct pktq * gso_pktq)3911 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3912     struct pktq *gso_pktq)
3913 {
3914 	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3915 		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3916 		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3917 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3918 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3919 		KPKTQ_INIT(gso_pktq);
3920 	} else {
3921 		ASSERT(!TAILQ_EMPTY(fes));
3922 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3923 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3924 		KPKTQ_INIT(gso_pktq);
3925 		flow_entry_release(&fe);
3926 	}
3927 }
3928 
3929 static void
dp_gso_pktq(struct nx_flowswitch * fsw,struct pktq * spktq,uint32_t gso_pkts_estimate)3930 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3931     uint32_t gso_pkts_estimate)
3932 {
3933 	struct __kern_packet *spkt, *pkt;
3934 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3935 	struct flow_entry *__single fe, *__single prev_fe;
3936 	struct pktq dpktq;
3937 	struct nexus_adapter *dev_na;
3938 	struct kern_pbufpool *dev_pp;
3939 	struct ifnet *ifp = NULL;
3940 	sa_family_t af;
3941 	uint32_t n_pkts, n_flows = 0;
3942 	int err;
3943 
3944 	KPKTQ_INIT(&dpktq);
3945 	n_pkts = KPKTQ_LEN(spktq);
3946 
3947 	FSW_RLOCK(fsw);
3948 	if (__improbable(FSW_QUIESCED(fsw))) {
3949 		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3950 		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3951 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_QUIESCED, __LINE__,
3952 		    DROPTAP_FLAG_L2_MISSING);
3953 		goto done;
3954 	}
3955 	dev_na = fsw->fsw_dev_ch->ch_na;
3956 	if (__improbable(dev_na == NULL)) {
3957 		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3958 		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3959 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_TX_DEVPORT_NOT_ATTACHED,
3960 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
3961 		goto done;
3962 	}
3963 	ifp = fsw_datamov_begin(fsw);
3964 	if (ifp == NULL) {
3965 		SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3966 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_IFNET_NOT_ATTACHED,
3967 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
3968 		goto done;
3969 	}
3970 
3971 	dev_pp = na_kr_get_pp(dev_na, NR_TX);
3972 
3973 	/*
3974 	 * Batch allocate enough packets to perform GSO on all
3975 	 * packets in spktq.
3976 	 */
3977 	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
3978 	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
3979 #if DEVELOPMENT || DEBUG
3980 	if (__probable(err != ENOMEM)) {
3981 		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3982 	}
3983 #endif /* DEVELOPMENT || DEBUG */
3984 	/*
3985 	 * We either get all packets or none. No partial allocations.
3986 	 */
3987 	if (__improbable(err != 0)) {
3988 		if (err == ENOMEM) {
3989 			ASSERT(KPKTQ_EMPTY(&dpktq));
3990 		} else {
3991 			dp_free_pktq(fsw, &dpktq);
3992 		}
3993 		DTRACE_SKYWALK1(gso__no__mem, int, err);
3994 		dp_drop_pktq(fsw, spktq, 1, DROP_REASON_FSW_PP_ALLOC_FAILED,
3995 		    __LINE__, DROPTAP_FLAG_L2_MISSING);
3996 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3997 		SK_ERR("failed to alloc %u pkts from device pool",
3998 		    gso_pkts_estimate);
3999 		goto done;
4000 	}
4001 	prev_fe = NULL;
4002 	KPKTQ_FOREACH(spkt, spktq) {
4003 		KPKTQ_DEQUEUE(&dpktq, pkt);
4004 		ASSERT(pkt != NULL);
4005 		/*
4006 		 * Copy only headers to the first packet of the GSO chain.
4007 		 * The headers will be used for classification below.
4008 		 */
4009 		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
4010 		if (__improbable(err != 0)) {
4011 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED,
4012 			    DROPTAP_FLAG_L2_MISSING);
4013 			DTRACE_SKYWALK2(copy__headers__failed,
4014 			    struct nx_flowswitch *, fsw,
4015 			    struct __kern_packet *, spkt);
4016 			continue;
4017 		}
4018 		af = get_tso_af(pkt);
4019 		ASSERT(af == AF_INET || af == AF_INET6);
4020 
4021 		err = flow_pkt_classify(pkt, ifp, af, false);
4022 		if (__improbable(err != 0)) {
4023 			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
4024 			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
4025 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FLOW_EXTRACT_FAILED,
4026 			    DROPTAP_FLAG_L2_MISSING);
4027 			DTRACE_SKYWALK4(classify__failed,
4028 			    struct nx_flowswitch *, fsw,
4029 			    struct __kern_packet *, spkt,
4030 			    struct __kern_packet *, pkt,
4031 			    int, err);
4032 			continue;
4033 		}
4034 		/*
4035 		 * GSO cannot be done on a fragment and it's a bug in user
4036 		 * space to mark a fragment as needing GSO.
4037 		 */
4038 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
4039 			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
4040 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4041 			    DROPTAP_FLAG_L2_MISSING);
4042 			DTRACE_SKYWALK3(is__frag,
4043 			    struct nx_flowswitch *, fsw,
4044 			    struct __kern_packet *, spkt,
4045 			    struct __kern_packet *, pkt);
4046 			continue;
4047 		}
4048 		fe = tx_lookup_flow(fsw, pkt, prev_fe);
4049 		if (__improbable(fe == NULL)) {
4050 			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
4051 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_TX_FRAG_BAD_CONT,
4052 			    DROPTAP_FLAG_L2_MISSING);
4053 			DTRACE_SKYWALK3(lookup__failed,
4054 			    struct nx_flowswitch *, fsw,
4055 			    struct __kern_packet *, spkt,
4056 			    struct __kern_packet *, pkt);
4057 			prev_fe = NULL;
4058 			continue;
4059 		}
4060 		/*
4061 		 * Perform GSO on spkt using the flow information
4062 		 * obtained above.
4063 		 */
4064 		struct pktq gso_pktq;
4065 		KPKTQ_INIT(&gso_pktq);
4066 		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
4067 		if (__probable(err == 0)) {
4068 			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
4069 			prev_fe = fe;
4070 		} else {
4071 			DTRACE_SKYWALK1(gso__error, int, err);
4072 			/* TODO: increment error stat */
4073 			dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_GSO_FAILED,
4074 			    DROPTAP_FLAG_L2_MISSING);
4075 			flow_entry_release(&fe);
4076 			prev_fe = NULL;
4077 		}
4078 		KPKTQ_FINI(&gso_pktq);
4079 	}
4080 	struct flow_entry *tfe = NULL;
4081 	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
4082 		/* Chain-enqueue can be used for GSO chains */
4083 		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
4084 		TAILQ_REMOVE(&fes, fe, fe_tx_link);
4085 		flow_entry_release(&fe);
4086 		n_flows++;
4087 	}
4088 done:
4089 	FSW_RUNLOCK(fsw);
4090 	if (n_flows > 0) {
4091 		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
4092 	}
4093 	if (ifp != NULL) {
4094 		fsw_datamov_end(fsw);
4095 	}
4096 
4097 	/*
4098 	 * It's possible for packets to be left in dpktq because
4099 	 * gso_pkts_estimate is only an estimate. The actual number
4100 	 * of packets needed could be less.
4101 	 */
4102 	uint32_t dpktq_len;
4103 	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
4104 		DTRACE_SKYWALK2(leftover__dev__pkts,
4105 		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
4106 		dp_free_pktq(fsw, &dpktq);
4107 	}
4108 	KPKTQ_FINI(&dpktq);
4109 }
4110 
4111 static inline void
fsw_dev_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4112 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4113     struct proc *p)
4114 {
4115 #pragma unused(p)
4116 	uint32_t total_pkts = 0, total_bytes = 0;
4117 
4118 	for (;;) {
4119 		struct pktq pktq;
4120 		KPKTQ_INIT(&pktq);
4121 		uint32_t n_bytes;
4122 		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
4123 		if (n_bytes == 0) {
4124 			break;
4125 		}
4126 		total_pkts += KPKTQ_LEN(&pktq);
4127 		total_bytes += n_bytes;
4128 
4129 		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
4130 			fsw_receive(fsw, &pktq);
4131 		} else {
4132 			fsw_dev_input_netem_enqueue(fsw, &pktq);
4133 		}
4134 		KPKTQ_FINI(&pktq);
4135 	}
4136 
4137 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4138 	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
4139 	    uint32_t, total_bytes);
4140 
4141 	/* compute mitigation rate for delivered traffic */
4142 	if (__probable(r->ckr_netif_mit_stats != NULL)) {
4143 		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
4144 	}
4145 }
4146 
4147 static inline void
fsw_user_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4148 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4149     struct proc *p)
4150 {
4151 #pragma unused(p)
4152 	static packet_trace_id_t trace_id = 0;
4153 	uint32_t total_pkts = 0, total_bytes = 0;
4154 
4155 	for (;;) {
4156 		struct pktq pktq;
4157 		KPKTQ_INIT(&pktq);
4158 		uint32_t n_bytes;
4159 		uint32_t gso_pkts_estimate = 0;
4160 
4161 		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
4162 		    &gso_pkts_estimate);
4163 		if (n_bytes == 0) {
4164 			break;
4165 		}
4166 		total_pkts += KPKTQ_LEN(&pktq);
4167 		total_bytes += n_bytes;
4168 
4169 		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
4170 		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
4171 		    KPKTQ_FIRST(&pktq)->pkt_trace_id);
4172 
4173 		if (gso_pkts_estimate > 0) {
4174 			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
4175 		} else {
4176 			dp_tx_pktq(fsw, &pktq);
4177 		}
4178 		dp_free_pktq(fsw, &pktq);
4179 		KPKTQ_FINI(&pktq);
4180 	}
4181 	kr_update_stats(r, total_pkts, total_bytes);
4182 
4183 	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
4184 	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
4185 	    uint32_t, total_bytes);
4186 }
4187 
4188 void
fsw_ring_flush(struct nx_flowswitch * fsw,struct __kern_channel_ring * r,struct proc * p)4189 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
4190     struct proc *p)
4191 {
4192 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
4193 
4194 	ASSERT(sk_is_sync_protected());
4195 	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
4196 	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
4197 
4198 	if (vpna->vpna_nx_port == FSW_VP_DEV) {
4199 		fsw_dev_ring_flush(fsw, r, p);
4200 	} else {
4201 		fsw_user_ring_flush(fsw, r, p);
4202 	}
4203 }
4204 
4205 int
fsw_dp_ctor(struct nx_flowswitch * fsw)4206 fsw_dp_ctor(struct nx_flowswitch *fsw)
4207 {
4208 	uint32_t fe_cnt = fsw_fe_table_size;
4209 	uint32_t fob_cnt = fsw_flow_owner_buckets;
4210 	uint32_t frb_cnt = fsw_flow_route_buckets;
4211 	uint32_t frib_cnt = fsw_flow_route_id_buckets;
4212 	struct kern_nexus *nx = fsw->fsw_nx;
4213 	char name[64];
4214 	const char *__null_terminated fsw_name = NULL;
4215 	int error = 0;
4216 
4217 	/* just in case */
4218 	if (fe_cnt == 0) {
4219 		fe_cnt = NX_FSW_FE_TABLESZ;
4220 		ASSERT(fe_cnt != 0);
4221 	}
4222 	if (fob_cnt == 0) {
4223 		fob_cnt = NX_FSW_FOB_HASHSZ;
4224 		ASSERT(fob_cnt != 0);
4225 	}
4226 	if (frb_cnt == 0) {
4227 		frb_cnt = NX_FSW_FRB_HASHSZ;
4228 		ASSERT(frb_cnt != 0);
4229 	}
4230 	if (frib_cnt == 0) {
4231 		frib_cnt = NX_FSW_FRIB_HASHSZ;
4232 		ASSERT(frib_cnt != 0);
4233 	}
4234 
4235 	/* make sure fe_cnt is a power of two, else round up */
4236 	if ((fe_cnt & (fe_cnt - 1)) != 0) {
4237 		fe_cnt--;
4238 		fe_cnt |= (fe_cnt >> 1);
4239 		fe_cnt |= (fe_cnt >> 2);
4240 		fe_cnt |= (fe_cnt >> 4);
4241 		fe_cnt |= (fe_cnt >> 8);
4242 		fe_cnt |= (fe_cnt >> 16);
4243 		fe_cnt++;
4244 	}
4245 
4246 	/* make sure frb_cnt is a power of two, else round up */
4247 	if ((frb_cnt & (frb_cnt - 1)) != 0) {
4248 		frb_cnt--;
4249 		frb_cnt |= (frb_cnt >> 1);
4250 		frb_cnt |= (frb_cnt >> 2);
4251 		frb_cnt |= (frb_cnt >> 4);
4252 		frb_cnt |= (frb_cnt >> 8);
4253 		frb_cnt |= (frb_cnt >> 16);
4254 		frb_cnt++;
4255 	}
4256 
4257 	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4258 	    &nexus_lock_attr);
4259 	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4260 	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4261 	TAILQ_INIT(&fsw->fsw_linger_head);
4262 
4263 	fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4264 	error = nx_advisory_alloc(nx, fsw_name,
4265 	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4266 	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4267 	if (error != 0) {
4268 		fsw_dp_dtor(fsw);
4269 		return error;
4270 	}
4271 
4272 	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4273 	if (fsw->fsw_flow_mgr == NULL) {
4274 		fsw_dp_dtor(fsw);
4275 		return error;
4276 	}
4277 
4278 	/* generic name; will be customized upon ifattach */
4279 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4280 	    FSW_REAP_THREADNAME, name, "");
4281 
4282 	if (kernel_thread_start(fsw_reap_thread_func, fsw,
4283 	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4284 		panic_plain("%s: can't create thread", __func__);
4285 		/* NOTREACHED */
4286 		__builtin_unreachable();
4287 	}
4288 	/* this must not fail */
4289 	VERIFY(fsw->fsw_reap_thread != NULL);
4290 
4291 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
4292 
4293 
4294 	return error;
4295 }
4296 
/*
 * Tear down the flowswitch datapath state created by fsw_dp_ctor():
 * joins RPS threads (DEV/DEBUG), frees the advisory region, terminates
 * the reaper thread, purges lingering flow entries, destroys the flow
 * manager and the locks.  Safe to call on a partially constructed fsw
 * (it is used as the error-path cleanup in fsw_dp_ctor()).
 */
void
fsw_dp_dtor(struct nx_flowswitch *fsw)
{
	uint64_t f = (1 * NSEC_PER_MSEC);         /* 1 ms */
	uint64_t s = (1000 * NSEC_PER_SEC);         /* 1 sec */
	uint32_t i = 0;

#if (DEVELOPMENT || DEBUG)
	if (fsw->fsw_rps_threads != NULL) {
		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
			fsw_rps_thread_join(fsw, i);
		}
		kfree_type_counted_by(struct fsw_rps_thread, fsw->fsw_rps_nthreads,
		    fsw->fsw_rps_threads);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	nx_advisory_free(fsw->fsw_nx);

	if (fsw->fsw_reap_thread != THREAD_NULL) {
		/* signal thread to begin self-termination */
		lck_mtx_lock(&fsw->fsw_reap_lock);
		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;

		/*
		 * And wait for thread to terminate; use another
		 * wait channel here other than fsw_reap_flags to
		 * make it more explicit.  In the event the reaper
		 * thread misses a wakeup, we'll try again once
		 * every second (except for the first time).
		 */
		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
			uint64_t t = 0;

			/* first retry after 1ms, subsequent ones 1000s apart */
			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
			clock_absolutetime_interval_to_deadline(t, &t);
			ASSERT(t != 0);

			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
			/* only wake the reaper if it is idle */
			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
			}
			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
			    THREAD_UNINT, t);
			lck_mtx_unlock(&fsw->fsw_reap_lock);
			thread_block(THREAD_CONTINUE_NULL);
			lck_mtx_lock(&fsw->fsw_reap_lock);
			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
		}
		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		fsw->fsw_reap_thread = THREAD_NULL;
	}

	/* free any remaining flow entries in the linger list */
	fsw_linger_purge(fsw);

	if (fsw->fsw_flow_mgr != NULL) {
		flow_mgr_destroy(fsw->fsw_flow_mgr);
		fsw->fsw_flow_mgr = NULL;
	}


	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
}
4364 
/*
 * Put a torn-down, destroyed flow entry on the flowswitch's linger
 * list so final cleanup is deferred until its linger period expires.
 * Consumes the caller's reference (the list now owns it); the reaper
 * thread is kicked to eventually process the list.
 */
void
fsw_linger_insert(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	/* refresh _net_uptime before computing the expiry below */
	net_update_uptime();

	/* entry must be torn down/destroyed but not yet lingering */
	ASSERT(flow_entry_refcnt(fe) >= 1);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
	ASSERT(fe->fe_linger_wait != 0);
	fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	/* let the reaper thread process the linger list */
	fsw_reap_sched(fsw);
}
4393 
/*
 * Unlink a lingering flow entry from the given linger list, clear its
 * LINGERING flag and drop the list's reference.  Caller is responsible
 * for the list's locking and for updating any associated counters.
 */
static void
fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
    struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
	    fe->fe_flags, FLOWENTF_BITS);

	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);

	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
	/* drop the reference held by the linger list */
	flow_entry_release(&fe);
}
4411 
/*
 * Remove a flow entry from its flowswitch's linger list and decrement
 * the linger count.  The flowswitch's linger lock must be held.
 */
static void
fsw_linger_remove(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;

	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);

	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
	VERIFY(fsw->fsw_linger_cnt != 0);
	fsw->fsw_linger_cnt--;
}
4423 
4424 void
fsw_linger_purge(struct nx_flowswitch * fsw)4425 fsw_linger_purge(struct nx_flowswitch *fsw)
4426 {
4427 	struct flow_entry *fe, *tfe;
4428 
4429 	lck_mtx_lock(&fsw->fsw_linger_lock);
4430 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4431 		fsw_linger_remove(fe);
4432 	}
4433 	ASSERT(fsw->fsw_linger_cnt == 0);
4434 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4435 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4436 }
4437 
/*
 * Detect RX-stalled channels on this flowswitch's nexus: a channel
 * whose ring has had packets enqueued but not dequeued for longer
 * than fsw_rx_stall_thresh is considered stalled.  Stalls are always
 * counted and logged; the channel is additionally defuncted when the
 * fsw_rx_stall_defunct tunable is enabled.
 */
static void
fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw)
{
	struct kern_nexus *nx;
	uint64_t now = _net_uptime;

	nx = fsw->fsw_nx;

	/* Walk through all channels and check for Rx stall condition */
	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t nxport) {
		struct nexus_adapter *na = nx_port_get_na(nx, nxport);
		uint64_t elapsed, enqueue_ts, dequeue_ts;
		struct __kern_channel_ring *ring;
		struct kern_channel *ch;
		struct proc *p;

		/* skip ports with no adapter, no activity, or no RX rings */
		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			return;
		}
		ch = (struct kern_channel *)na->na_private;
		if (ch == NULL) {
			return;
		}
		ring = KR_SINGLE(na->na_rx_rings);
		enqueue_ts = ring->ckr_rx_enqueue_ts;
		dequeue_ts = ring->ckr_rx_dequeue_ts;
		/* Elapsed time since last Rx enqueue */
		elapsed = now - enqueue_ts;
		/*
		 * Stalled: the consumer has not caught up with the producer
		 * (dequeue older than enqueue) for longer than the threshold.
		 */
		if ((dequeue_ts < enqueue_ts) && (elapsed > fsw_rx_stall_thresh)) {
			/* need a proc ref to defunct the owning channel */
			p = proc_find(ch->ch_pid);
			if (p == NULL) {
				return;
			}
			if (fsw_rx_stall_defunct) {
				kern_channel_defunct(p, ch);
			}
			proc_rele(p);
			DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw,
			    struct nexus_adapter *, na, struct __kern_channel_ring *, ring);
			FSW_STATS_INC(FSW_STATS_RX_STALL);
			SK_ERR("Rx stall detected in proc %s(%llu) (%s): "
			    "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, "
			    "defunct: %s",
			    ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname,
			    elapsed, now, enqueue_ts, dequeue_ts,
			    fsw_rx_stall_defunct ? "yes" : "no");
		}
	});
	/* END IGNORE CODESTYLE */
}
4490 
4491 void
fsw_reap_sched(struct nx_flowswitch * fsw)4492 fsw_reap_sched(struct nx_flowswitch *fsw)
4493 {
4494 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4495 	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4496 	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4497 	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4498 		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4499 	}
4500 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4501 }
4502 
/*
 * Entry point of the flowswitch reaper thread.  Names the thread,
 * then parks on fsw_reap_flags; all actual reap work happens in the
 * continuation, fsw_reap_thread_cont(), which reschedules itself and
 * never returns here.
 */
__attribute__((noreturn))
static void
fsw_reap_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct nx_flowswitch *__single fsw = v;

	ASSERT(fsw->fsw_reap_thread == current_thread());
	/*
	 * -fbounds-safety: __unsafe_null_terminated_from_indexable provides
	 * checks to ensure source contains the null terminator, by doing a
	 * linear scan of the string.
	 */
	thread_set_thread_name(current_thread(),
	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));

	net_update_uptime();

	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	/* park: assert_wait must precede the unlock/block pair */
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
	/* NOTREACHED */
	__builtin_unreachable();
}
4529 
4530 __attribute__((noreturn))
4531 static void
fsw_reap_thread_cont(void * v,wait_result_t wres)4532 fsw_reap_thread_cont(void *v, wait_result_t wres)
4533 {
4534 	struct nx_flowswitch *__single fsw = v;
4535 	boolean_t low;
4536 	uint64_t t = 0;
4537 
4538 	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4539 
4540 	lck_mtx_lock(&fsw->fsw_reap_lock);
4541 	if (__improbable(wres == THREAD_INTERRUPTED ||
4542 	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4543 		goto terminate;
4544 	}
4545 
4546 	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4547 	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4548 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4549 
4550 	net_update_uptime();
4551 
4552 	/* prevent detach from happening while we're here */
4553 	if (!fsw_detach_barrier_add(fsw)) {
4554 		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4555 		t = 0;
4556 	} else {
4557 		uint32_t fe_nonviable, fe_freed, fe_aborted;
4558 		uint32_t fr_freed, fr_resid = 0;
4559 		struct ifnet *ifp = fsw->fsw_ifp;
4560 		uint64_t i = FSW_REAP_IVAL;
4561 		uint64_t now = _net_uptime;
4562 		uint64_t last;
4563 
4564 		ASSERT(fsw->fsw_ifp != NULL);
4565 
4566 		/*
4567 		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4568 		 */
4569 		fe_nonviable = fsw_process_deferred(fsw);
4570 
4571 		/*
4572 		 * Pass 2: remove any expired lingering flows.
4573 		 */
4574 		fe_freed = fsw_process_linger(fsw, &fe_aborted);
4575 
4576 		/*
4577 		 * Pass 3: prune idle flow routes.
4578 		 */
4579 		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4580 		    ifp, &fr_resid);
4581 
4582 		/*
4583 		 * Pass 4: prune flow table
4584 		 *
4585 		 */
4586 		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4587 
4588 		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4589 		    "fe_aborted %u fr_freed %u/%u",
4590 		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4591 		    (fe_nonviable + fsw->fsw_pending_nonviable),
4592 		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
4593 		    (fe_freed + fr_resid));
4594 
4595 		/* see if VM memory level is critical */
4596 		low = skmem_lowmem_check();
4597 
4598 		/*
4599 		 * If things appear to be idle, we can prune away cached
4600 		 * object that have fallen out of the working sets (this
4601 		 * is different than purging).  Every once in a while, we
4602 		 * also purge the caches.  Note that this is done across
4603 		 * all flowswitch instances, and so we limit this to no
4604 		 * more than once every FSW_REAP_SK_THRES seconds.
4605 		 */
4606 		last = os_atomic_load(&fsw_reap_last, relaxed);
4607 		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4608 		    os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4609 			fsw_purge_cache(fsw, low);
4610 
4611 			/* increase sleep interval if idle */
4612 			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4613 			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4614 				i <<= 3;
4615 			}
4616 		} else if (last == 0) {
4617 			os_atomic_store(&fsw_reap_last, now, release);
4618 		}
4619 
4620 		/*
4621 		 * Additionally, run thru the list of channels and prune
4622 		 * or purge away cached objects on "idle" channels.  This
4623 		 * check is rate limited to no more than once every
4624 		 * FSW_DRAIN_CH_THRES seconds.
4625 		 */
4626 		last = fsw->fsw_drain_channel_chk_last;
4627 		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4628 			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4629 			    fsw->fsw_flow_mgr->fm_name);
4630 
4631 			fsw->fsw_drain_channel_chk_last = now;
4632 			fsw_drain_channels(fsw, now, low);
4633 		} else if (__improbable(last == 0)) {
4634 			fsw->fsw_drain_channel_chk_last = now;
4635 		}
4636 
4637 		/*
4638 		 * Finally, invoke the interface's reap callback to
4639 		 * tell it to prune or purge away cached objects if
4640 		 * it is idle.  This check is rate limited to no more
4641 		 * than once every FSW_REAP_IF_THRES seconds.
4642 		 */
4643 		last = fsw->fsw_drain_netif_chk_last;
4644 		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4645 			ASSERT(fsw->fsw_nifna != NULL);
4646 
4647 			if (ifp->if_na_ops != NULL &&
4648 			    ifp->if_na_ops->ni_reap != NULL) {
4649 				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4650 				    fsw->fsw_flow_mgr->fm_name);
4651 				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4652 				    FSW_REAP_IF_THRES, low);
4653 			}
4654 
4655 			fsw->fsw_drain_netif_chk_last = now;
4656 		} else if (__improbable(last == 0)) {
4657 			fsw->fsw_drain_netif_chk_last = now;
4658 		}
4659 
4660 		/* emit periodic interface stats ktrace */
4661 		last = fsw->fsw_reap_last;
4662 		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4663 			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4664 			    ifp->if_data.ifi_ibytes * 8,
4665 			    ifp->if_data.ifi_opackets,
4666 			    ifp->if_data.ifi_obytes * 8);
4667 
4668 			fsw->fsw_reap_last = now;
4669 		} else if (__improbable(last == 0)) {
4670 			fsw->fsw_reap_last = now;
4671 		}
4672 
4673 		/* Check for Rx stall condition every NX_FSW_RX_STALL_THRES seconds */
4674 		last = fsw->fsw_rx_stall_chk_last;
4675 		if (last != 0 && (now - last) >= NX_FSW_RX_STALL_THRES) {
4676 			fsw_defunct_rx_stall_channel(fsw);
4677 			fsw->fsw_rx_stall_chk_last = now;
4678 		} else if (__improbable(last == 0)) {
4679 			fsw->fsw_rx_stall_chk_last = now;
4680 		}
4681 
4682 		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4683 		clock_absolutetime_interval_to_deadline(t, &t);
4684 		ASSERT(t != 0);
4685 
4686 		/* allow any pending detach to proceed */
4687 		fsw_detach_barrier_remove(fsw);
4688 	}
4689 
4690 	lck_mtx_lock(&fsw->fsw_reap_lock);
4691 	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4692 		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4693 		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
4694 		    THREAD_UNINT, t);
4695 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4696 		thread_block_parameter(fsw_reap_thread_cont, fsw);
4697 		/* NOTREACHED */
4698 		__builtin_unreachable();
4699 	} else {
4700 terminate:
4701 		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4702 		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4703 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4704 		/*
4705 		 * And signal any thread waiting for us to terminate;
4706 		 * wait channel here other than fsw_reap_flags to make
4707 		 * it more explicit.
4708 		 */
4709 		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4710 			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4711 		}
4712 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4713 
4714 		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4715 
4716 		/* for the extra refcnt from kernel_thread_start() */
4717 		thread_deallocate(current_thread());
4718 		/* this is the end */
4719 		thread_terminate(current_thread());
4720 		/* NOTREACHED */
4721 		__builtin_unreachable();
4722 	}
4723 
4724 	/* must never get here */
4725 	VERIFY(0);
4726 	/* NOTREACHED */
4727 	__builtin_unreachable();
4728 }
4729 
/*
 * Walk all nexus ports of this flowswitch and prune -- or, under memory
 * pressure or prolonged inactivity, purge -- each adapter's cached
 * objects via na_drain().  `now' is the current net uptime snapshot;
 * `low' indicates the VM memory level is critical.
 */
static void
fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
	FSW_RLOCK(fsw);

	/* uncrustify doesn't handle C blocks properly */
	/* BEGIN IGNORE CODESTYLE */
	nx_port_foreach(nx, ^(nexus_port_t p) {
		struct nexus_adapter *na = nx_port_get_na(nx, p);
		/* skip ports with no adapter, no recorded work, or no Rx rings */
		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
			return;
		}

		boolean_t purge;

		/*
		 * If some activity happened in the last FSW_DRAIN_CH_THRES
		 * seconds on this channel, we reclaim memory if the channel
		 * throughput is less than the reap threshold value.
		 */
		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
			struct __kern_channel_ring *__single ring;
			channel_ring_stats *stats;
			uint64_t bps;

			ring = KR_SINGLE(na->na_rx_rings);
			stats = &ring->ckr_stats;
			bps = stats->crs_bytes_per_second;

			/* low-throughput active channel: prune only, never purge */
			if (bps < fsw_channel_reap_thresh) {
				purge = FALSE;
				na_drain(na, purge);
			}
			return;
		}

		/*
		 * If NA has been inactive for some time (twice the drain
		 * threshold), we clear the work timestamp to temporarily skip
		 * this channel until it's active again.  Purging cached objects
		 * can be expensive since we'd need to allocate and construct
		 * them again, so we do it only when necessary.
		 */
		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
			na->na_work_ts = 0;
			purge = TRUE;
		} else {
			purge = FALSE;
		}

		na_drain(na, purge);  /* purge/prune caches */
	});
	/* END IGNORE CODESTYLE */

	FSW_RUNLOCK(fsw);
}
4789 
4790 static void
fsw_purge_cache(struct nx_flowswitch * fsw,boolean_t low)4791 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4792 {
4793 #pragma unused(fsw)
4794 	uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4795 	uint32_t p = fsw_flow_purge_thresh;
4796 	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
4797 
4798 	SK_DF(SK_VERB_FLOW, "%s: %s caches",
4799 	    fsw->fsw_flow_mgr->fm_name,
4800 	    (purge ? "purge" : "prune"));
4801 
4802 	skmem_cache_reap_now(sk_fo_cache, purge);
4803 	skmem_cache_reap_now(sk_fe_cache, purge);
4804 	skmem_cache_reap_now(sk_fab_cache, purge);
4805 	skmem_cache_reap_now(flow_route_cache, purge);
4806 	skmem_cache_reap_now(flow_stats_cache, purge);
4807 	netns_reap_caches(purge);
4808 	skmem_reap_caches(purge);
4809 
4810 #if CONFIG_MBUF_MCACHE
4811 	if (if_is_fsw_transport_netagent_enabled() && purge) {
4812 		mbuf_drain(FALSE);
4813 	}
4814 #endif /* CONFIG_MBUF_MCACHE */
4815 }
4816 
4817 static void
fsw_flow_handle_low_power(struct nx_flowswitch * fsw,struct flow_entry * fe)4818 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4819 {
4820 	/* When the interface is in low power mode, the flow is nonviable */
4821 	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4822 	    os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4823 		os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4824 	}
4825 }
4826 
/*
 * Commit pending deferred requests (withdraw / nonviable) on all flow
 * entries of this flowswitch, tearing down the matching entries.  Work
 * that cannot be done under the flow-owner bucket locks (NECP early
 * close, nexus unassignment) is staged on a local list and performed
 * afterwards.  Also propagates low-power-mode (edge-triggered) and
 * half-close state.  Returns the number of entries torn down.
 */
static uint32_t
fsw_process_deferred(struct nx_flowswitch *fsw)
{
	struct flow_entry_dead sfed __sk_aligned(8);
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct flow_entry_dead *fed, *tfed;
	LIST_HEAD(, flow_entry_dead) fed_head =
	    LIST_HEAD_INITIALIZER(fed_head);
	uint32_t i, nonviable = 0;
	boolean_t lowpowermode = FALSE;

	bzero(&sfed, sizeof(sfed));

	/*
	 * The flows become nonviable when the interface
	 * is in low power mode (edge trigger)
	 */
	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
		lowpowermode = TRUE;
		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
	}

	/*
	 * Scan thru the flow entry tree, and commit any pending withdraw or
	 * nonviable requests.  We may need to push stats and/or unassign the
	 * nexus from NECP, but we cannot do that while holding the locks;
	 * build a temporary list for those entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		struct flow_owner *fo;

		/*
		 * Grab the lock at all costs when handling low power mode
		 */
		if (__probable(!lowpowermode)) {
			/* best effort otherwise: skip contended buckets */
			if (!FOB_TRY_LOCK(fob)) {
				continue;
			}
		} else {
			FOB_LOCK(fob);
		}

		FOB_LOCK_ASSERT_HELD(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			struct flow_entry *fe;

			RB_FOREACH(fe, flow_entry_id_tree,
			    &fo->fo_flow_entry_id_head) {
				/* try first as reader; skip if we can't */
				if (__improbable(lowpowermode)) {
					fsw_flow_handle_low_power(fsw, fe);
				}
				/* commit a deferred half-close exactly once */
				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
					flow_namespace_half_close(&fe->fe_port_reservation);
				}

				/* if not withdrawn/nonviable, skip */
				if (!fe->fe_want_withdraw &&
				    !fe->fe_want_nonviable) {
					continue;
				}
				/*
				 * Here we're holding the lock as writer;
				 * don't spend too much time as we're
				 * blocking the data path now.
				 */
				ASSERT(!uuid_is_null(fe->fe_uuid));
				/* only need flow UUID and booleans */
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
				sfed.fed_want_clonotify =
				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
				sfed.fed_want_nonviable = fe->fe_want_nonviable;
				flow_entry_teardown(fo, fe);

				/* do this outside the flow bucket lock */
				fed = flow_entry_dead_alloc(Z_WAITOK);
				ASSERT(fed != NULL);
				*fed = sfed;
				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
			}
		}
		FOB_UNLOCK(fob);
	}

	/*
	 * These nonviable flows are no longer useful since we've lost
	 * the source IP address; in the event the client monitors the
	 * viability of the flow, explicitly mark it as nonviable so
	 * that a new flow can be created.
	 */
	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
		LIST_REMOVE(fed, fed_link);
		ASSERT(fsw->fsw_agent_session != NULL);

		/* if flow is closed early */
		if (fed->fed_want_clonotify) {
			necp_client_early_close(fed->fed_uuid);
		}

		/* if nonviable, unassign nexus attributes */
		if (fed->fed_want_nonviable) {
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
		}

		flow_entry_dead_free(fed);
		++nonviable;
	}
	ASSERT(LIST_EMPTY(&fed_head));

	return nonviable;
}
4942 
/*
 * Process the flowswitch's linger list: generate TCP RSTs for flows
 * that still owe an abort, and free flows whose linger time has
 * expired.  Unexpired entries are re-queued at the head of the linger
 * list.  Returns the number of entries freed; *abort is set to the
 * number of RSTs generated.
 */
static uint32_t
fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
{
	struct flow_entry_linger_head linger_head =
	    TAILQ_HEAD_INITIALIZER(linger_head);
	struct flow_entry *fe, *tfe;
	uint64_t now = _net_uptime;
	uint32_t i = 0, cnt = 0, freed = 0;

	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(abort != NULL);
	*abort = 0;

	/*
	 * We don't want to contend with the datapath, so move
	 * everything that's in the linger list into a local list.
	 * This allows us to generate RSTs or free the flow entry
	 * outside the lock.  Any remaining flow entry in the local
	 * list will get re-added back to the head of the linger
	 * list, in front of any new ones added since then.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	cnt = fsw->fsw_linger_cnt;
	fsw->fsw_linger_cnt = 0;
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);

		/*
		 * See if this is a TCP flow that needs to generate
		 * a RST to the remote peer (if not already).
		 */
		if (flow_track_tcp_want_abort(fe)) {
			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
			ASSERT(!uuid_is_null(fe->fe_uuid));
			flow_track_abort_tcp(fe, NULL, NULL);
			(*abort)++;
			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS);
		}

		/*
		 * If flow has expired, remove from list and free;
		 * otherwise leave it around in the linger list.
		 */
		if (fe->fe_linger_expire <= now) {
			freed++;
			fsw_linger_remove_internal(&linger_head, fe);
			fe = NULL;
		}
		++i;
	}
	/* every snapshotted entry was visited; freed can't exceed them */
	VERIFY(i == cnt && cnt >= freed);

	/*
	 * Add any remaining ones back into the linger list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	if (!TAILQ_EMPTY(&linger_head)) {
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
		/* splice new arrivals behind the survivors to keep them in front */
		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
		fsw->fsw_linger_cnt += (cnt - freed);
	}
	ASSERT(TAILQ_EMPTY(&linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return freed;
}
5021 
5022 __attribute__((always_inline))
5023 static inline void
fsw_ifp_inc_traffic_class_in_pkt(struct ifnet * ifp,kern_packet_t ph)5024 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
5025 {
5026 	switch (__packet_get_traffic_class(ph)) {
5027 	case PKT_TC_BE:
5028 		ifp->if_tc.ifi_ibepackets++;
5029 		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5030 		break;
5031 	case PKT_TC_BK:
5032 		ifp->if_tc.ifi_ibkpackets++;
5033 		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5034 		break;
5035 	case PKT_TC_VI:
5036 		ifp->if_tc.ifi_ivipackets++;
5037 		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5038 		break;
5039 	case PKT_TC_VO:
5040 		ifp->if_tc.ifi_ivopackets++;
5041 		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
5042 		break;
5043 	default:
5044 		break;
5045 	}
5046 }
5047 
5048 __attribute__((always_inline))
5049 static inline void
fsw_ifp_inc_traffic_class_out_pkt(struct ifnet * ifp,uint32_t svc,uint32_t cnt,uint32_t len)5050 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
5051     uint32_t cnt, uint32_t len)
5052 {
5053 	switch (svc) {
5054 	case PKT_TC_BE:
5055 		ifp->if_tc.ifi_obepackets += cnt;
5056 		ifp->if_tc.ifi_obebytes += len;
5057 		break;
5058 	case PKT_TC_BK:
5059 		ifp->if_tc.ifi_obkpackets += cnt;
5060 		ifp->if_tc.ifi_obkbytes += len;
5061 		break;
5062 	case PKT_TC_VI:
5063 		ifp->if_tc.ifi_ovipackets += cnt;
5064 		ifp->if_tc.ifi_ovibytes += len;
5065 		break;
5066 	case PKT_TC_VO:
5067 		ifp->if_tc.ifi_ovopackets += cnt;
5068 		ifp->if_tc.ifi_ovobytes += len;
5069 		break;
5070 	default:
5071 		break;
5072 	}
5073 }
5074