/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *  BSD LICENSE
 *
 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
 *  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of NEC Europe Ltd. nor the names of
 *      its contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>
#include <kern/sched_prim.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>
#include <net/bpf.h>
#include <net/if_ports_used.h>
#include <net/pktap.h>
#include <net/pktsched/pktsched_netem.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/in_var.h>

extern kern_return_t thread_terminate(thread_t);

#define FSW_ZONE_MAX                  256
#define FSW_ZONE_NAME                 "skywalk.nx.fsw"

static uint64_t fsw_reap_last __sk_aligned(8);
static uint64_t fsw_want_purge __sk_aligned(8);

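/*
 * Sizing notes: the power-of-2 sizes below allow bucket selection by
 * simple bit-masking, while the prime bucket counts help spread hash
 * values more evenly across buckets.
 */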
#define NX_FSW_FE_TABLESZ       256     /* some power of 2 */
static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;

#define NX_FSW_FOB_HASHSZ       31      /* a Mersenne prime */
static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;

#define NX_FSW_FRB_HASHSZ       128     /* some power of 2 */
static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;

#define NX_FSW_FRIB_HASHSZ      13      /* a small prime */
static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;

#define NX_FSW_FLOW_REAP_INTERVAL 1     /* seconds */
static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;

#define NX_FSW_FLOW_PURGE_THRES 0       /* purge every N reaps (0 = disable) */
static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;

#define FSW_REAP_IVAL            (MAX(1, fsw_flow_reap_interval))
#define FSW_REAP_SK_THRES        (FSW_REAP_IVAL << 5)
#define FSW_REAP_IF_THRES        (FSW_REAP_IVAL << 5)
#define FSW_DRAIN_CH_THRES       (FSW_REAP_IVAL << 5)
#define FSW_IFSTATS_THRES        1

#define NX_FSW_CHANNEL_REAP_THRES 1000  /* threshold (bytes/sec) for reaping */
uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;

#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */

uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
uint32_t fsw_gso_batch = 8;
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
    "flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
    "flowswitch Tx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
    "flowswitch GSO batch size");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
    "flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
    "flowswitch RX aggregation for tcp flows (enable/disable)");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
    "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
    "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");

/*
 * IP reassembly
 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
 * enable/disable the reassembly routine regardless of whether the
 * transport netagent is enabled or not.
 *
 * 'fsw_ip_reass' is a tri-state:
 *    0 means force IP reassembly off
 *    1 means force IP reassembly on
 *    2 means don't force the value, use what's appropriate for this flowswitch
 */
#define FSW_IP_REASS_FORCE_OFF          0
#define FSW_IP_REASS_FORCE_ON           1
#define FSW_IP_REASS_NO_FORCE           2

uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;

static int
fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value > FSW_IP_REASS_NO_FORCE) {
			return EINVAL;
		}
		fsw_ip_reass = new_value;
	}
	return error;
}

SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, fsw_ip_reass_sysctl, "IU",
    "adjust flowswitch IP reassembly");
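
/*
 * Example usage from user space (a sketch; values per the tri-state above):
 *   sysctl kern.skywalk.flowswitch.ip_reass=1   # force reassembly on
 *   sysctl kern.skywalk.flowswitch.ip_reass=2   # per-flowswitch default
 */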

#if (DEVELOPMENT || DEBUG)
static uint64_t _fsw_inject_error = 0;
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
	_SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
	&FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)

#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
	if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
	        SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
	        if ((_f) != NULL)                                       \
	                (_f)(__VA_ARGS__);                              \
	}                                                               \
} while (0)

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
    flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
    &fsw_flow_route_id_buckets, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
    CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
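
/*
 * _fsw_inject_error is treated as a bitmask of injection IDs (see the
 * (1ULL << _en) test above).  For example, assuming ID 35 corresponds
 * to _fsw_error35_handler below, it could be armed with:
 *   sysctl kern.skywalk.flowswitch.fsw_inject_error=0x800000000
 */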
#else
#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
#endif /* !DEVELOPMENT && !DEBUG */

static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
    struct flow_entry *);
static void fsw_reap_thread_func(void *, wait_result_t);
static void fsw_reap_thread_cont(void *, wait_result_t);
static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
static uint32_t fsw_process_deferred(struct nx_flowswitch *);
static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);

static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
    struct __kern_packet *);

static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
    uint32_t, uint32_t);

static int __fsw_dp_inited = 0;

int
fsw_dp_init(void)
{
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
	_CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);

	ASSERT(!__fsw_dp_inited);

	flow_mgr_init();
	flow_init();

	__fsw_dp_inited = 1;

	return 0;
}

void
fsw_dp_uninit(void)
{
	if (__fsw_dp_inited) {
		flow_fini();
		flow_mgr_fini();

		__fsw_dp_inited = 0;
	}
}

static void
dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
{
	pp_free_pktq(pktq);
}

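/*
 * dp_drop_pktq() must stay a macro: the early "return" below returns
 * from the *calling* function when the queue is empty, which a helper
 * function could not do.
 */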
#define dp_drop_pktq(fsw, pktq) do { \
	uint32_t _len = KPKTQ_LEN(pktq); \
	if (KPKTQ_EMPTY(pktq)) { \
	        ASSERT(_len == 0); \
	        return; \
	} \
	SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
	FSW_STATS_ADD(FSW_STATS_DROP, _len); \
	DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
	dp_free_pktq(fsw, pktq); \
} while (0)

SK_NO_INLINE_ATTRIBUTE
void
fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
{
	pid_t pid;
	char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *proc_name = NULL;
	pid_t epid;
	char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
	char *eproc_name = NULL;
	sa_family_t af;
	bool tap_early = false;
	struct __kern_packet *pkt;

	ASSERT(fe != NULL);
	ASSERT(fsw->fsw_ifp != NULL);

	if (fe->fe_nx_port == FSW_VP_HOST) {
		/* allow packets to be tapped before aggregation happens */
		tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
		if (!tap_early) {
			/* all other traffic will be tapped in the dlil input path */
			return;
		}
	}
	if (fe->fe_key.fk_ipver == IPVERSION) {
		af = AF_INET;
	} else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
		af = AF_INET6;
	} else {
		return;
	}

	pid = fe->fe_pid;
	if (fe->fe_proc_name[0] != '\0') {
		(void) strlcpy(proc_name_buf, fe->fe_proc_name,
		    sizeof(proc_name_buf));
		proc_name = proc_name_buf;
	}
	epid = fe->fe_epid;
	if (fe->fe_eproc_name[0] != '\0') {
		(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
		    sizeof(eproc_name_buf));
		eproc_name = eproc_name_buf;
	}
	if (input) {
		KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
			pktap_input_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    IPPROTO_TCP, fe->fe_flowid,
			    tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
		}
	} else {
		KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
			pktap_output_packet(fsw->fsw_ifp, af,
			    fsw->fsw_ifp_dlt, pid, proc_name, epid,
			    eproc_name, SK_PKT2PH(pkt), NULL, 0,
			    0, 0, PTH_FLAG_NEXUS_CHAN);
		}
	}
}

#if (DEVELOPMENT || DEBUG)
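/*
 * Two-step error-injection helpers: step 1 artificially clears
 * FLOWRTF_RESOLVED on a resolved flow route to simulate a resolver
 * race; step 2 restores the flag and forces the chosen return value.
 */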
static void
_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
    int *ret)
{
	static boolean_t _err35_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err35_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err35_flag_modified) {
			return;
		}
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			m_freem(pkt->pkt_mbuf);
			pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
			pkt->pkt_mbuf = NULL;
		}
		*ret = EJUSTRETURN;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err35_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}

static void
_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
{
	static boolean_t _err36_flag_modified = FALSE;

	switch (step) {
	case 1:
		if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
			fr->fr_flags &= ~FLOWRTF_RESOLVED;
			_err36_flag_modified = TRUE;
		}
		break;

	case 2:
		if (!_err36_flag_modified) {
			return;
		}
		*ret = ENETUNREACH;
		fr->fr_flags |= FLOWRTF_RESOLVED;
		_err36_flag_modified = FALSE;
		break;

	default:
		VERIFY(0);
		/* not reached */
	}
}
#else /* !DEVELOPMENT && !DEBUG */
#define _fsw_error35_handler(...)
#define _fsw_error36_handler(...)
#endif /* DEVELOPMENT || DEBUG */

/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet
 * from a compat netif whose attached mbuf is larger than the MTU
 * due to LRO.
 */
static inline boolean_t
validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
    uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
    uint32_t *copy_len)
{
	uint32_t tlen = 0;
	uint32_t splen = spkt->pkt_length - skip_l2hlen;

	if (l2hlen != 0) {
		VERIFY(skip_l2hlen == 0);
		tlen += l2hlen;
	} else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
		splen -= ETHER_CRC_LEN;
	}

	tlen += splen;
	*copy_len = splen;

	return tlen <= ((__packet_get_buflet_count(dph) *
	       PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
	       headroom);
}
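
/*
 * For example, a destination packet with two buflets and a 2 KB default
 * buffer size can accept up to 4096 bytes minus headroom (buffer sizes
 * are pool-dependent; 2 KB is only illustrative).
 */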

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
copy_packet_from_dev_log(struct __kern_packet *spkt,
    struct __kern_packet *dpkt, struct proc *p)
{
	uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
	    ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
	    SK_VERB_COPY_MBUF : SK_VERB_COPY));
	char *daddr;
	MD_BUFLET_ADDR_ABS(dpkt, daddr);
	SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
	    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
	    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
	    (uint32_t)dpkt->pkt_l2_len);
	SK_DF(logflags | SK_VERB_DUMP, "%s",
	    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
}
#else
#define copy_packet_from_dev_log(...)
#endif /* SK_LOG */

static inline int
copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
{
	/*
	 * Source and destination nexus don't share the packet pool.
	 * The sync operation here is to:
	 * - alloc a packet for the rx (dst) ring
	 * - copy data/metadata from the src packet to the dst packet
	 * - attach the alloc'd packet to the rx (dst) ring
	 */
	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
	kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
	    METADATA_SUBTYPE(spkt));
	boolean_t do_cksum_rx;
	uint16_t skip_l2h_len = spkt->pkt_l2_len;
	uint16_t iphlen;
	uint32_t dlen;
	int err;

	if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
	    &dlen))) {
		SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
		    PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return EINVAL;
	}

	/* Copy packet metadata */
	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
	_PKT_COPY(spkt, dpkt);
	ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
	    PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
	ASSERT(dpkt->pkt_mbuf == NULL);

	dpkt->pkt_headroom = 0;
	dpkt->pkt_l2_len = 0;

	/* don't include IP header from partial sum */
	if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
		iphlen = spkt->pkt_flow_ip_hlen;
		do_cksum_rx = sk_cksum_rx;
	} else {
		iphlen = 0;
		do_cksum_rx = FALSE;
	}

	/* Copy packet payload */
	if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
	    (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
		/*
		 * Source packet has truncated contents (just enough for
		 * the classifier) of an mbuf from the compat driver; copy
		 * the entire mbuf contents to the destination packet.
		 */
		m_adj(spkt->pkt_mbuf, skip_l2h_len);
		ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
		fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
		    spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
	} else {
		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
		/*
		 * Source packet has full contents, either from an mbuf
		 * that came up from the compat driver, or because it
		 * originated on the native driver; copy to destination.
		 */
		fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
		    (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
		    iphlen, 0, FALSE);
	}

#if DEBUG || DEVELOPMENT
	if (__improbable(pkt_trailers > 0)) {
		dlen += pkt_add_trailers(dph, dlen, iphlen);
	}
#endif /* DEBUG || DEVELOPMENT */

	/* Finalize and attach packet to Rx ring */
	METADATA_ADJUST_LEN(dpkt, 0, 0);
	err = __packet_finalize(dph);
	VERIFY(err == 0);

	copy_packet_from_dev_log(spkt, dpkt, kernproc);

	if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
		ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
		mbuf_free(spkt->pkt_mbuf);
		KPKT_CLEAR_MBUF_DATA(spkt);
	} else {
		fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
	}

	if (__probable(do_cksum_rx != 0)) {
		FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
	}

	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	char *pkt_buf;
	void *l3_hdr;
	uint16_t nfrags, tlen;
	int err = 0;

	switch (fsw_ip_reass) {
	case FSW_IP_REASS_FORCE_OFF:
		return pkt;
	case FSW_IP_REASS_FORCE_ON:
		break;
	default:
		if (!FSW_NETAGENT_ENABLED(fsw) ||
		    flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
			return pkt;
		}
		break;
	}

	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;

	ASSERT(fsw->fsw_ipfm != NULL);
	ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);

	if (pkt->pkt_flow_ip_ver == IPVERSION) {
		err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
		    (struct ip *)l3_hdr, &nfrags, &tlen);
	} else {
		ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
		/* we only handle frag header immediately after v6 header */
		err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
		    (struct ip6_hdr *)l3_hdr,
		    (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
		    &nfrags, &tlen);
	}
	if (__improbable(err != 0)) {
		/* if we get a bad fragment, free it */
		pp_free_packet_single(pkt);
		pkt = NULL;
	} else {
		ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
	}

	return pkt;
}

SK_NO_INLINE_ATTRIBUTE
static void
rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
	uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
	kern_packet_t ph = SK_PTR_ENCODE(pkt,
	    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
	/*
	 * This is the case when the packet is coming in from
	 * compat-netif. This packet only has valid metadata
	 * and an attached mbuf. We need to copy enough data
	 * from the mbuf to the packet buffer for the
	 * classifier. The compat netif packet pool is configured
	 * with a buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY,
	 * which is just enough to hold the protocol headers
	 * for the flowswitch classifier.
	 */

	pkt->pkt_headroom = 0;
	METADATA_ADJUST_LEN(pkt, 0, 0);
	/*
	 * Copy the initial 128 bytes of the packet for
	 * classification:
	 * Ethernet (14) + IPv6 header (40) +
	 * IPv6 fragment header (8) +
	 * TCP header with options (60) = 122 bytes.
	 */
	fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
	    pkt->pkt_headroom, pkt->pkt_mbuf, 0,
	    MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
	    FALSE, 0);

	int err = __packet_finalize_with_mbuf(pkt);
	VERIFY(err == 0);
}

static struct __kern_packet *
rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;

	if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
		rx_prepare_packet_mbuf(fsw, pkt);
	}

	return pkt;
}

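/*
 * Flow lookup: first try the caller's cached entry (prev_fe) with a
 * cheap 5-tuple compare, then fall back to the flow manager table.
 * A hit on a parent or child flow is further resolved below: on Rx a
 * parent match is demuxed to the right child, and on Tx a flow-id
 * mismatch on a parent is resolved via tx_lookup_child_flow().
 */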
static struct flow_entry *
lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    bool input, struct flow_entry *prev_fe)
{
	struct flow_key key __sk_aligned(16);
	struct flow_entry *fe = NULL;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
	flow_pkt2key(pkt, input, &key);

	if (__probable(prev_fe != NULL &&
	    prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		uint16_t saved_mask = key.fk_mask;
		key.fk_mask = FKMASK_5TUPLE;
		if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
			flow_entry_retain(prev_fe);
			fe = prev_fe;
		} else {
			key.fk_mask = saved_mask;
		}
	}

top:
	if (__improbable(fe == NULL)) {
		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
	}

	if (__improbable(fe != NULL &&
	    (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
		/* Rx */
		if (input) {
			if (fe->fe_flags & FLOWENTF_PARENT) {
				struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
				if (child_fe != NULL) {
					flow_entry_release(&fe);
					fe = child_fe;
				}
			} else {
				if (!rx_flow_demux_match(fsw, fe, pkt)) {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		} else {
			/* Tx */
			if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
				if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
					struct flow_entry *parent_fe = fe;
					fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
					flow_entry_release(&parent_fe);
				} else {
					flow_entry_release(&fe);
					fe = NULL;
					goto top;
				}
			}
		}
	}

	SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
	    "%s %s %s \"%s\" fe 0x%llx",
	    input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
	    sk_proc_name_address(current_proc()),
	    fk_as_string(&key, fkbuf, sizeof(fkbuf)),
	    SK_KVA(fe));

	return fe;
}

SK_NO_INLINE_ATTRIBUTE
static bool
pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ifnet *ifp = fsw->fsw_ifp;
	struct in_ifaddr *ia = NULL;
	struct in_ifaddr *best_ia = NULL;
	struct in6_ifaddr *ia6 = NULL;
	struct in6_ifaddr *best_ia6 = NULL;
	struct ifnet *match_ifp = NULL;
	struct __flow *flow = pkt->pkt_flow;
	bool result = false;

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (flow->flow_ip_ver == IPVERSION) {
		if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
		    INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
			result = true;
			goto done;
		}

		/*
		 * Check for a match in the hash bucket.
		 */
		lck_rw_lock_shared(&in_ifaddr_rwlock);
		TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
			if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
				best_ia = ia;
				match_ifp = ia->ia_ifp;

				if (match_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface.
				 */
			}
		}

		if (best_ia != NULL) {
			if (match_ifp != ifp && ipforwarding == 0 &&
			    (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    match_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when the interface address check is
				 * strict and forwarding is disabled.
				 */
			} else {
				lck_rw_done(&in_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in_ifaddr_rwlock);

		if (ifp->if_flags & IFF_BROADCAST) {
			/*
			 * Check for broadcast addresses.
			 *
			 * Only accept broadcast packets that arrive via the matching
			 * interface.  Reception of forwarded directed broadcasts would be
			 * handled via ip_forward() and ether_frameout() with the loopback
			 * into the stack for SIMPLEX interfaces handled by ether_frameout().
			 */
			struct ifaddr *ifa;

			ifnet_lock_shared(ifp);
			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				if (ifa->ifa_addr->sa_family != AF_INET) {
					continue;
				}
				ia = ifatoia(ifa);
				if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
				    ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
					ifnet_lock_done(ifp);
					result = true;
					goto done;
				}
			}
			ifnet_lock_done(ifp);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
			result = true;
			goto done;
		}

		/*
		 * Check for exact addresses in the hash bucket.
		 */
		lck_rw_lock_shared(&in6_ifaddr_rwlock);
		TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) {
			if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) {
				if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
					continue;
				}
				best_ia6 = ia6;
				if (ia6->ia_ifp == ifp) {
					break;
				}
				/*
				 * Continue the loop in case there's an exact
				 * match with another interface.
				 */
			}
		}
		if (best_ia6 != NULL) {
			if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
			    (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
			    best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
				/*
				 * Drop when the interface address check is
				 * strict and forwarding is disabled.
				 */
			} else {
				lck_rw_done(&in6_ifaddr_rwlock);
				result = true;
				goto done;
			}
		}
		lck_rw_done(&in6_ifaddr_rwlock);
	}

	/*
	 * In forwarding mode, if the destination address of the packet
	 * does not match any interface address, it may be destined to
	 * the client device.
	 */
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
	    "Rx flow does not match interface address");
done:
	return result;
}

static struct flow_entry *
rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
    struct flow_entry *prev_fe)
{
	struct flow_entry *fe;

	fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
	_FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
	if (fe == NULL) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
		return NULL;
	}

	if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
	    fe->fe_flags & FLOWENTF_LISTENER) &&
	    !pkt_is_for_listener(fe, pkt)) {
		FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
		flow_entry_release(&fe);
		return NULL;
	}

	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
		    "Rx flow torn down");
		flow_entry_release(&fe);
		fe = NULL;
	}

	return fe;
}

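/*
 * Batching convention for rx/tx_flow_batch_packet(): each call consumes
 * the caller's reference on fe.  The first packet queued onto an empty
 * fe_rx_pktq/fe_tx_pktq donates that reference to the flow_entry_list;
 * later packets for the same entry release their extra reference.
 */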
static inline void
rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	if (__improbable(pkt->pkt_flow_ip_is_frag)) {
		fe->fe_rx_frag_count++;
	}

	/* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
	if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

static void
tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	/* record frag continuation */
	if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
		ASSERT(pkt->pkt_flow_ip_is_frag);
		fe->fe_tx_is_cont_frag = true;
		fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
	} else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
	}

	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
	} else {
		ASSERT(!TAILQ_EMPTY(fes));
		KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
		flow_entry_release(&fe);
	}
}

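/*
 * Dequeue up to n_pkts_max packets from the ring's slots between khead
 * and rhead, detaching metadata from each slot and dropping packets
 * already marked QUM_F_DROPPED (or zero-length ones) along the way.
 */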
static inline void
fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;

	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max && idx != idx_end;
	    idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(r, ksd);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;

		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
}

/*
 * This is only for estimating how many packets each GSO packet will need.
 * The number does not need to be exact because any leftover packets allocated
 * will be freed.
 */
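/*
 * Example: a 64 KB IPv4 TSO packet (total_len 65536, 40 bytes of
 * IP + TCP header) with MSS 1448 estimates
 * SK_ROUNDUP(65536 - 40, 1448) / 1448 = 46 packets.
 */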
static uint32_t
estimate_gso_pkts(struct __kern_packet *pkt)
{
	packet_tso_flags_t tso_flags;
	uint16_t mss;
	uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;

	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
	mss = pkt->pkt_proto_seg_sz;

	if (tso_flags == PACKET_TSO_IPV4) {
		total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
	} else if (tso_flags == PACKET_TSO_IPV6) {
		total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
	if (total_hlen != 0 && mss != 0) {
		total_len = pkt->pkt_length;
		n_pkts = (uint32_t)
		    (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
	}
	DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
	    uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
	    uint32_t, n_pkts);
	return n_pkts;
}

/*
 * This function retrieves a chain of packets of the same type only
 * (GSO or non-GSO).
 */
static inline void
fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, uint32_t n_pkts_max,
    struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
{
	uint32_t n_pkts = 0;
	slot_idx_t idx, idx_end;
	idx = r->ckr_khead;
	idx_end = r->ckr_rhead;
	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
	boolean_t gso_enabled, gso_required;
	uint32_t gso_pkts;

	gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
	ASSERT(KPKTQ_EMPTY(pktq));
	*n_bytes = 0;
	for (; n_pkts < n_pkts_max &&
	    (!gso_enabled || fsw_gso_batch == 0 ||
	    *gso_pkts_estimate < fsw_gso_batch) &&
	    idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
		struct __kern_slot_desc *ksd = KR_KSD(r, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);

		_FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
		    pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
		if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
		    || (pkt->pkt_length == 0)) {
			KR_SLOT_DETACH_METADATA(r, ksd);
			FSW_STATS_INC(FSW_STATS_DROP);
			pp_free_packet_single(pkt);
			continue;
		}
		if (gso_enabled) {
			gso_pkts = estimate_gso_pkts(pkt);

			/*
			 * We use the first packet to determine what
			 * type the subsequent ones need to be (GSO or
			 * non-GSO).
			 */
			if (n_pkts == 0) {
				gso_required = (gso_pkts != 0);
			} else {
				if (gso_required != (gso_pkts != 0)) {
					break;
				}
			}
			*gso_pkts_estimate += gso_pkts;
		}
		KR_SLOT_DETACH_METADATA(r, ksd);
		if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
			__packet_set_tx_nx_port(SK_PKT2PH(pkt),
			    vpna->vpna_nx_port, vpna->vpna_gencnt);
		}
		n_pkts++;
		*n_bytes += pkt->pkt_length;
		KPKTQ_ENQUEUE(pktq, pkt);
	}
	r->ckr_khead = idx;
	r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
	DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
	    ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
	    uint32_t, *gso_pkts_estimate);
}

static void
fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
    struct pktq *pktq)
{
#pragma unused(fsw)
	struct __kern_packet *pkt;
	struct __kern_quantum *kqum;
	uint32_t kr_space_avail = 0;
	uint32_t n, n_pkts = 0, n_bytes = 0;
	slot_idx_t idx = 0, idx_start = 0, idx_end = 0;

	kr_enter(r, TRUE);

	idx_start = r->ckr_ktail;
	kr_space_avail = kr_available_slots_rxring(r);
	_FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
	n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
	_FSW_INJECT_ERROR(41, n, 0, null_func);
	idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);

	idx = idx_start;
	while (idx != idx_end) {
		KPKTQ_DEQUEUE(pktq, pkt);
		kqum = SK_PTR_ADDR_KQUM(pkt);
		kqum->qum_qflags |= QUM_F_FINALIZED;
		n_pkts++;
		n_bytes += pkt->pkt_length;
		KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, r->ckr_lim);
	}

	kr_update_stats(r, n_pkts, n_bytes);

	/*
	 * ensure slot attachments are visible before updating the
	 * tail pointer
	 */
	os_atomic_thread_fence(seq_cst);

	r->ckr_ktail = idx_end;

	kr_exit(r);

	r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);

	SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
	    r->ckr_name, n_pkts);
}

static void
pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
{
	ASSERT(KPKTQ_EMPTY(pktq));

	for (uint32_t i = 0; i < n_pkts; i++) {
		struct __kern_packet *pkt = pkts[i];
		ASSERT(pkt->pkt_nextpkt == NULL);
		KPKTQ_ENQUEUE(pktq, pkt);
	}
}

/*
 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
 */
SK_NO_INLINE_ATTRIBUTE
static void
convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
{
	uint32_t tot_cnt;
	unsigned int num_segs = 1;
	struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t mhead_cnt, mhead_bufsize;
	uint32_t mhead_waste = 0;
	uint32_t mcnt = 0, mbytes = 0;
	uint32_t largest, max_pkt_len;
	struct __kern_packet *pkt;
	struct kern_pbufpool *pp;

	tot_cnt = KPKTQ_LEN(pktq);
	ASSERT(tot_cnt > 0);
	mhead_cnt = tot_cnt;

	/*
	 * Opportunistically batch-allocate the mbufs based on the largest
	 * packet size we've seen in the recent past.  Note that we reset
	 * fsw_rx_largest_size below if we notice that we're under-utilizing
	 * the allocated buffers (thus disabling this batch allocation).
	 */
	largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
	if (__probable(largest != 0)) {
		if (largest <= MCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MCLBYTES;
		} else if (largest <= MBIGCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = MBIGCLBYTES;
		} else if (largest <= M16KCLBYTES) {
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES;
		} else if (largest <= M16KCLBYTES * 2) {
			num_segs = 2;
			mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
			    &num_segs, M_NOWAIT, 1, 0);
			mhead_bufsize = M16KCLBYTES * 2;
		} else {
			mhead = NULL;
			mhead_bufsize = mhead_cnt = 0;
		}
	} else {
		mhead = NULL;
		mhead_bufsize = mhead_cnt = 0;
	}
	DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
	    uint32_t, mhead_cnt, uint32_t, tot_cnt);

	pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
	max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;

	KPKTQ_FOREACH(pkt, pktq) {
		uint32_t tot_len, len;
		uint16_t pad, llhlen, iphlen;
		boolean_t do_cksum_rx;
		struct mbuf *m;
		int error;

		llhlen = pkt->pkt_l2_len;
		len = pkt->pkt_length;
		if (__improbable(len > max_pkt_len || llhlen > len)) {
			DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
			    struct __kern_packet *, pkt);
			FSW_STATS_INC(FSW_STATS_DROP);
			FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
			continue;
		}
		/* begin payload on 32-bit boundary; figure out the padding */
		pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
		tot_len = pad + len;

		/* remember largest packet size */
		if (__improbable(largest < tot_len)) {
			largest = MAX(tot_len, MCLBYTES);
		}

		/*
		 * If the above batch allocation returned partial
		 * success, we try a blocking allocation here again.
		 */
		m = mhead;
		if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
			ASSERT(mhead != NULL || mhead_cnt == 0);
			num_segs = 1;
			if (tot_len > M16KCLBYTES) {
				num_segs = 0;
			}
			if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
			    &num_segs, &m)) != 0) {
				DTRACE_SKYWALK2(bad__len,
				    struct nx_flowswitch *, fsw,
				    struct __kern_packet *, pkt);
				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
				FSW_STATS_INC(FSW_STATS_DROP);
				continue;
			}
		} else {
			mhead = m->m_nextpkt;
			m->m_nextpkt = NULL;
			ASSERT(mhead_cnt != 0);
			--mhead_cnt;

			/* check if we're underutilizing large buffers */
			if (__improbable(mhead_bufsize > MCLBYTES &&
			    tot_len < (mhead_bufsize >> 1))) {
				++mhead_waste;
			}
			/*
			 * Clean up the unused mbuf.  This is only needed
			 * when we pre-allocate 2x16K mbufs.
			 */
			if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
				ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
				struct mbuf *m_extra = m->m_next;
				ASSERT(m_extra != NULL);
				ASSERT(m_extra->m_len == 0);
				ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
				m->m_next = NULL;
				m_freem(m_extra);
				FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
			}
		}
		m->m_data += pad;
		m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);

		/* don't include IP header from partial sum */
		if (__probable((pkt->pkt_qum_qflags &
		    QUM_F_FLOW_CLASSIFIED) != 0)) {
			iphlen = pkt->pkt_flow_ip_hlen;
			do_cksum_rx = sk_cksum_rx;
		} else {
			iphlen = 0;
			do_cksum_rx = FALSE;
		}

		fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
		    pkt->pkt_headroom, m, 0, len, do_cksum_rx,
		    llhlen + iphlen);

		FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
		if (do_cksum_rx) {
			FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
		}
#if DEBUG || DEVELOPMENT
		if (__improbable(pkt_trailers > 0)) {
			(void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
		}
#endif /* DEBUG || DEVELOPMENT */
		m_adj(m, llhlen);

		m->m_pkthdr.rcvif = fsw->fsw_ifp;
		if (__improbable((pkt->pkt_link_flags &
		    PKT_LINKF_ETHFCS) != 0)) {
			m->m_flags |= M_HASFCS;
		}
		if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
			m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
		}
		ASSERT(m->m_nextpkt == NULL);
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		mcnt++;
		mbytes += m_pktlen(m);
	}
	/* free any leftovers */
	if (__improbable(mhead != NULL)) {
		DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
		ASSERT(mhead_cnt != 0);
		(void) m_freem_list(mhead);
		mhead = NULL;
		mhead_cnt = 0;
	}

	/* reset if most packets (>50%) are smaller than our batch buffers */
	if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
		DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
		    struct flow_entry *, NULL, uint32_t, mhead_waste,
		    uint32_t, tot_cnt);
		largest = 0;
	}

	if (largest != fsw->fsw_rx_largest_size) {
		os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
	}

	pp_free_pktq(pktq);
	*m_headp = head;
	*m_tailp = tail;
	*cnt = mcnt;
	*bytes = mbytes;
}

/*
 * This function only extracts the mbuf from the packet. The caller frees
 * the packet.
 */
static inline struct mbuf *
convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
	struct mbuf *m;
	struct pkthdr *mhdr;
	uint16_t llhlen;

	m = pkt->pkt_mbuf;
	ASSERT(m != NULL);

	llhlen = pkt->pkt_l2_len;
	if (llhlen > pkt->pkt_length) {
		m_freem(m);
		KPKT_CLEAR_MBUF_DATA(pkt);
		DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
		    struct __kern_packet *, pkt);
		FSW_STATS_INC(FSW_STATS_DROP);
		FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
		return NULL;
	}
	mhdr = &m->m_pkthdr;
	if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
	    PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
		mhdr->csum_flags &= ~CSUM_RX_FLAGS;
		mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
		mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
	}
#if DEBUG || DEVELOPMENT
	uint32_t extra = 0;
	if (__improbable(pkt_trailers > 0)) {
		extra = pkt_add_trailers_mbuf(m, llhlen);
	}
#endif /* DEBUG || DEVELOPMENT */
	m_adj(m, llhlen);
	ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
	KPKT_CLEAR_MBUF_DATA(pkt);
	return m;
}

SK_NO_INLINE_ATTRIBUTE
static void
convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
    struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
{
	struct __kern_packet *pkt;
	struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
	uint32_t c = 0, b = 0;

	KPKTQ_FOREACH(pkt, pktq) {
		m = convert_compat_pkt_to_mbuf(fsw, pkt);
		if (__improbable(m == NULL)) {
			continue;
		}
		tail = m;
		*tailp = m;
		tailp = &m->m_nextpkt;
		c++;
		b += m_pktlen(m);
	}
	pp_free_pktq(pktq);
	*m_head = head;
	*m_tail = tail;
	*cnt = c;
	*bytes = b;
}

void
fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
    uint32_t cnt, uint32_t bytes)
{
	struct ifnet_stat_increment_param s;

	bzero(&s, sizeof(s));
	s.packets_in = cnt;
	s.bytes_in = bytes;
	dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
}

void
fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
{
	struct mbuf *m_head = NULL, *m_tail = NULL;
	uint32_t cnt = 0, bytes = 0;
	ifnet_fsw_rx_cb_t cb;
	void *cb_arg;
	boolean_t compat;

	ASSERT(!KPKTQ_EMPTY(pktq));
	if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
		ASSERT(cb != NULL);
		ASSERT(cb_arg != NULL);
		/* callback consumes packets */
		(*cb)(cb_arg, pktq);
		ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
		return;
	}

	/* All packets in the pktq must have the same type */
	compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
	if (compat) {
		convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
		    &bytes);
	} else {
		convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
		    &bytes);
	}
	if (__improbable(m_head == NULL)) {
		DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
		return;
	}
	fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
}

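/*
 * Enqueue as many packets as the destination ring can hold; whatever
 * is left in pktq afterwards is counted against the ring-full stat
 * and dropped (tail drop).
 */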
void
fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
    struct __kern_channel_ring *r, struct pktq *pktq)
{
	fsw_ring_enqueue_pktq(fsw, r, pktq);
	FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
	dp_drop_pktq(fsw, pktq);
}

static struct nexus_adapter *
flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	struct nexus_adapter *na = NULL;
	nexus_port_t port = fe->fe_nx_port;

	if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
		SK_ERR("dev or host ports have no NA");
		return NULL;
	}

	if (__improbable(!nx_port_is_valid(nx, port))) {
		SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	na = nx_port_get_na(nx, port);
	if (__improbable(na == NULL)) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(!NA_IS_ACTIVE(na))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	if (__improbable(nx_port_is_defunct(nx, port))) {
		FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
		    if_name(fsw->fsw_ifp), port);
		return NULL;
	}

	return na;
}

1564 static inline struct __kern_channel_ring *
flow_get_ring(struct nx_flowswitch * fsw,struct flow_entry * fe,enum txrx txrx)1565 flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1566 {
1567 	struct nexus_vp_adapter *na = NULL;
1568 	struct __kern_channel_ring *r = NULL;
1569 
1570 	na = VPNA(flow_get_na(fsw, fe));
1571 	if (__improbable(na == NULL)) {
1572 		return NULL;
1573 	}
1574 
1575 	switch (txrx) {
1576 	case NR_RX:
1577 		r = &na->vpna_up.na_rx_rings[0];
1578 		break;
1579 	case NR_TX:
1580 		r = &na->vpna_up.na_tx_rings[0];
1581 		break;
1582 	default:
1583 		VERIFY(0);
1584 		__builtin_unreachable();
1585 	}
1586 
1587 	if (__improbable(KR_DROP(r))) {
1588 		FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1589 		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r 0x%llx %s drop mode",
1590 		    SK_KVA(r), r->ckr_name);
1591 		return NULL;
1592 	}
1593 
1594 	ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1595 
1596 #if (DEVELOPMENT || DEBUG)
1597 	if (r != NULL) {
1598 		_FSW_INJECT_ERROR(4, r, NULL, null_func);
1599 	}
1600 #endif /* DEVELOPMENT || DEBUG */
1601 
1602 	return r;
1603 }
1604 
1605 struct __kern_channel_ring *
1606 fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1607 {
1608 	return flow_get_ring(fsw, fe, NR_RX);
1609 }
1610 
1611 static inline struct __kern_channel_ring *
1612 fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1613 {
1614 	return flow_get_ring(fsw, fe, NR_TX);
1615 }
1616 
1617 static bool
1618 dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1619 {
1620 	struct flow_route *fr = fe->fe_route;
1621 	struct ifnet *ifp = fsw->fsw_ifp;
1622 
1623 	if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1624 	    !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1625 	    fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1626 	    !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1627 		/*
1628 		 * The source address is no longer around; we want this
1629 		 * flow to be nonviable, but that requires holding the lock
1630 		 * as writer (which isn't the case now).  Indicate that
1631 		 * we need to finalize the nonviable state later below.
1632 		 *
1633 		 * We also request that the flow route be reconfigured
1634 		 * if this is a connected-mode flow.
1635 		 */
1637 		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1638 			/*
1639 			 * fsw_pending_nonviable is a hint for reaper thread;
1640 			 * due to the fact that setting fe_want_nonviable and
1641 			 * incrementing fsw_pending_nonviable counter is not
1642 			 * atomic, let the increment happen first, and the
1643 			 * thread losing the CAS does decrement.
1644 			 */
1645 			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1646 			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1647 				fsw_reap_sched(fsw);
1648 			} else {
1649 				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1650 			}
1651 		}
1652 		if (fr != NULL) {
1653 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1654 		}
1655 	}
1656 
1657 	/* if flow was (or is going to be) marked as nonviable, drop it */
1658 	if (__improbable(fe->fe_want_nonviable ||
1659 	    (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1660 		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1661 		    SK_KVA(fe));
1662 		return false;
1663 	}
1664 	return true;
1665 }
1666 
1667 bool
1668 dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1669 {
1670 	bool okay;
1671 	okay = dp_flow_route_process(fsw, fe);
1672 #if (DEVELOPMENT || DEBUG)
1673 	if (okay) {
1674 		_FSW_INJECT_ERROR(5, okay, false, null_func);
1675 	}
1676 #endif /* DEVELOPMENT || DEBUG */
1677 
1678 	return okay;
1679 }
1680 
1681 void
1682 dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1683     uint32_t flags)
1684 {
1685 #pragma unused(flags)
1686 	struct pktq dpkts;              /* dst pool alloc'ed packets */
1687 	struct pktq disposed_pkts;      /* done src packets */
1688 	struct pktq dropped_pkts;       /* dropped src packets */
1689 	struct pktq transferred_pkts;   /* dst packets ready for ring */
1690 	struct __kern_packet *pkt, *tpkt;
1691 	struct kern_pbufpool *dpp;
1692 	uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
1693 	uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1694 	uint16_t buf_array_iter = 0;
1695 	uint32_t cnt, buf_cnt = 0;
1696 	int err;
1697 
1698 	KPKTQ_INIT(&dpkts);
1699 	KPKTQ_INIT(&dropped_pkts);
1700 	KPKTQ_INIT(&disposed_pkts);
1701 	KPKTQ_INIT(&transferred_pkts);
1702 
1703 	if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1704 		SK_ERR("Rx route bad");
1705 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1706 		FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1707 		goto done;
1708 	}
1709 
1710 	if (fe->fe_nx_port == FSW_VP_HOST) {
1711 		/*
1712 		 * The host ring does not exist anymore so we can't take
1713 		 * the enqueue path below. This path should only be hit
1714 		 * for the rare tcp fragmentation case.
1715 		 */
1716 		fsw_host_rx(fsw, &fe->fe_rx_pktq);
1717 		return;
1718 	}
1719 
1720 	/* find the ring */
1721 	struct __kern_channel_ring *r;
1722 	r = fsw_flow_get_rx_ring(fsw, fe);
1723 	if (__improbable(r == NULL)) {
1724 		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1725 		goto done;
1726 	}
1727 
1728 	/* snoop before L2 is stripped */
1729 	if (__improbable(pktap_total_tap_count != 0)) {
1730 		fsw_snoop(fsw, fe, true);
1731 	}
1732 
1733 	dpp = r->ckr_pp;
1734 	/* batch allocate enough packets */
1735 	err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1736 	    SKMEM_NOSLEEP);
1737 	if (__improbable(err == ENOMEM)) {
1738 		ASSERT(KPKTQ_EMPTY(&dpkts));
1739 		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
1740 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1741 		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
1742 		    r->ckr_name, SK_KVA(r));
1743 		goto done;
1744 	}
1745 
1746 	/*
1747 	 * Estimate the total number of buflets needed for the packet chain.
1748 	 */
1749 	cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp));
1750 	if (cnt > n_pkts) {
1751 		ASSERT(dpp->pp_max_frags > 1);
1752 		cnt -= n_pkts;
1753 		buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1754 		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1755 		    SKMEM_NOSLEEP, false);
1756 		if (__improbable(buf_cnt == 0)) {
1757 			KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
1758 			FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1759 			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
1760 			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
1761 			goto done;
1762 		}
1763 		err = 0;
1764 	}
1765 
1766 	/* extra processing for user flow */
1767 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1768 		err = 0;
1769 		KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1770 		if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
1771 			fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1772 		} else {
1773 			fe->fe_rx_pktq_bytes = 0;
1774 		}
1775 		err = flow_pkt_track(fe, pkt, true);
1776 		_FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1777 		if (__improbable(err != 0)) {
1778 			SK_ERR("flow_pkt_track failed (err %d)", err);
1779 			FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1780 			/* if need to trigger RST */
1781 			if (err == ENETRESET) {
1782 				flow_track_abort_tcp(fe, pkt, NULL);
1783 			}
1784 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1785 			continue;
1786 		}
1787 
1788 		/* transfer to dpkt */
1789 		if (pkt->pkt_qum.qum_pp != dpp) {
1790 			struct __kern_buflet *bprev, *bnew;
1791 			struct __kern_packet *dpkt = NULL;
1792 			uint32_t n_bufs, i;
1793 
1794 			KPKTQ_DEQUEUE(&dpkts, dpkt);
1795 			if (__improbable(dpkt == NULL)) {
1796 				FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1797 				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1798 				continue;
1799 			}
1800 			n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
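			/* the first buflet already comes attached to dpkt */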
1801 			n_bufs--;
1802 			for (i = 0; i < n_bufs; i++) {
1803 				if (__improbable(buf_cnt == 0)) {
1804 					ASSERT(dpp->pp_max_frags > 1);
1805 					buf_array_iter = 0;
1806 					cnt = howmany(fe->fe_rx_pktq_bytes,
1807 					    PP_BUF_SIZE_DEF(dpp));
1808 					n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
1809 					if (cnt >= n_pkts) {
1810 						cnt -= n_pkts;
1811 					} else {
1812 						cnt = 0;
1813 					}
1814 					cnt += (n_bufs - i);
1815 					buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1816 					    cnt);
1817 					cnt = buf_cnt;
1818 					err = pp_alloc_buflet_batch(dpp,
1819 					    buf_array, &buf_cnt,
1820 					    SKMEM_NOSLEEP, false);
1821 					if (__improbable(buf_cnt == 0)) {
1822 						FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1823 						KPKTQ_ENQUEUE(&dropped_pkts,
1824 						    pkt);
1825 						pkt = NULL;
1826 						pp_free_packet_single(dpkt);
1827 						dpkt = NULL;
1828 						SK_ERR("failed to alloc %d "
1829 						    "buflets (err %d) for "
1830 						    "kr %s, 0x%llx", cnt, err,
1831 						    r->ckr_name, SK_KVA(r));
1832 						break;
1833 					}
1834 					err = 0;
1835 				}
1836 				ASSERT(buf_cnt != 0);
1837 				if (i == 0) {
1838 					PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1839 				}
1840 				bnew = (kern_buflet_t)buf_array[buf_array_iter];
1841 				buf_array[buf_array_iter] = 0;
1842 				buf_array_iter++;
1843 				buf_cnt--;
1844 				VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
1845 				    bprev, bnew) == 0);
1846 				bprev = bnew;
1847 			}
1848 			if (__improbable(err != 0)) {
1849 				continue;
1850 			}
1851 			err = copy_packet_from_dev(fsw, pkt, dpkt);
1852 			_FSW_INJECT_ERROR(43, err, EINVAL, null_func);
1853 			if (__improbable(err != 0)) {
1854 				SK_ERR("copy packet failed (err %d)", err);
1855 				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1856 				pp_free_packet_single(dpkt);
1857 				dpkt = NULL;
1858 				continue;
1859 			}
1860 			KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1861 			pkt = dpkt;
1862 		}
1863 		_UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
1864 		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1865 		pkt->pkt_policy_id = fe->fe_policy_id;
1866 		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1867 		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
1868 		if (pkt->pkt_bufs_cnt > 1) {
1869 			pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1870 			pkt->pkt_seg_cnt = 1;
1871 		}
1872 		KPKTQ_ENQUEUE(&transferred_pkts, pkt);
1873 	}
1874 	KPKTQ_FINI(&fe->fe_rx_pktq);
1875 	KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
1876 	KPKTQ_FINI(&transferred_pkts);
1877 
1878 	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);
1879 
1880 done:
1881 	/* Free unused buflets */
1882 	while (buf_cnt > 0) {
1883 		pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
1884 		buf_array[buf_array_iter] = 0;
1885 		buf_array_iter++;
1886 		buf_cnt--;
1887 	}
1888 	dp_free_pktq(fsw, &dpkts);
1889 	dp_free_pktq(fsw, &disposed_pkts);
1890 	dp_drop_pktq(fsw, &dropped_pkts);
1891 }
1892 
1893 static inline void
1894 rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1895     uint32_t flags)
1896 {
1897 	ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
1898 	ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
1899 
1900 	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
1901 	    KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);
1902 
1903 	/* flow related processing (default, agg, fpd, etc.) */
1904 	fe->fe_rx_process(fsw, fe, flags);
1905 
1906 	if (__improbable(fe->fe_want_withdraw)) {
1907 		fsw_reap_sched(fsw);
1908 	}
1909 
1910 	KPKTQ_FINI(&fe->fe_rx_pktq);
1911 }
1912 
1913 static inline void
1914 dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1915 {
1916 	/*
1917 	 * We only care about wake packets of flows that belong to the flow
1918 	 * switch, as wake packets for the host stack are handled by the host
1919 	 * input function.
1920 	 */
1921 #if (DEBUG || DEVELOPMENT)
1922 	if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
1923 		/*
1924 		 * This is a one shot command
1925 		 */
1926 		fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
1927 
1928 		pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1929 	}
1930 #endif /* (DEBUG || DEVELOPMENT) */
1931 	if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1932 		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1933 	}
1934 }
1935 
1936 static void
1937 _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
1938 {
1939 	struct __kern_packet *pkt, *tpkt;
1940 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
1941 	struct flow_entry *fe, *prev_fe;
1942 	sa_family_t af;
1943 	struct pktq host_pkts, dropped_pkts;
1944 	int err;
1945 
1946 	KPKTQ_INIT(&host_pkts);
1947 	KPKTQ_INIT(&dropped_pkts);
1948 
1949 	if (__improbable(FSW_QUIESCED(fsw))) {
1950 		DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
1951 		KPKTQ_CONCAT(&dropped_pkts, pktq);
1952 		goto done;
1953 	}
1954 	if (__improbable(fsw->fsw_demux == NULL)) {
1955 		KPKTQ_CONCAT(&dropped_pkts, pktq);
1956 		goto done;
1957 	}
1958 
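	/*
	 * Remember the last matched flow entry so that consecutive packets
	 * of the same flow can reuse the previous lookup result.
	 */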
1959 	prev_fe = NULL;
1960 	KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
1961 		if (__probable(tpkt)) {
1962 			void *baddr;
1963 			MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1964 			SK_PREFETCH(baddr, 0);
1965 			/* prefetch L3 and L4 flow structs */
1966 			SK_PREFETCHW(tpkt->pkt_flow, 0);
1967 			SK_PREFETCHW(tpkt->pkt_flow, 128);
1968 		}
1969 
1970 		KPKTQ_REMOVE(pktq, pkt);
1971 
1972 		pkt = rx_prepare_packet(fsw, pkt);
1973 
1974 		af = fsw->fsw_demux(fsw, pkt);
1975 		if (__improbable(af == AF_UNSPEC)) {
1976 			KPKTQ_ENQUEUE(&host_pkts, pkt);
1977 			continue;
1978 		}
1979 
1980 		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1981 		_FSW_INJECT_ERROR(1, err, ENXIO, null_func);
1982 		if (__improbable(err != 0)) {
1983 			FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
1984 			KPKTQ_ENQUEUE(&host_pkts, pkt);
1985 			continue;
1986 		}
1987 
1988 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1989 			pkt = rx_process_ip_frag(fsw, pkt);
1990 			if (pkt == NULL) {
1991 				continue;
1992 			}
1993 		}
1994 
1995 		prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
1996 		if (__improbable(fe == NULL)) {
1997 			KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
1998 			continue;
1999 		}
2000 
2001 		fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
2002 
2003 		dp_rx_process_wake_packet(fsw, pkt);
2004 
2005 		rx_flow_batch_packet(&fes, fe, pkt);
2006 		prev_fe = fe;
2007 	}
2008 
2009 	struct flow_entry *tfe = NULL;
2010 	TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2011 		rx_flow_process(fsw, fe, 0);
2012 		TAILQ_REMOVE(&fes, fe, fe_rx_link);
2013 		fe->fe_rx_pktq_bytes = 0;
2014 		fe->fe_rx_frag_count = 0;
2015 		flow_entry_release(&fe);
2016 	}
2017 
2018 	if (!KPKTQ_EMPTY(&host_pkts)) {
2019 		fsw_host_rx(fsw, &host_pkts);
2020 	}
2021 
2022 done:
2023 	dp_drop_pktq(fsw, &dropped_pkts);
2024 }
2025 
2026 #if (DEVELOPMENT || DEBUG)
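/*
 * Receive packet steering (RPS): bin a packet onto the chosen RPS
 * thread's queue; the thread is woken up later, once the whole batch
 * has been distributed (see fsw_receive()).
 */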
2027 static void
2028 fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2029     struct __kern_packet *pkt)
2030 {
2031 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2032 
2033 	lck_mtx_lock_spin(&frt->frt_lock);
2034 	KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2035 	lck_mtx_unlock(&frt->frt_lock);
2036 }
2037 
2038 static void
2039 fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2040 {
2041 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2042 
2043 	ASSERT(frt->frt_thread != THREAD_NULL);
2044 	lck_mtx_lock_spin(&frt->frt_lock);
2045 	ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2046 
2047 	frt->frt_requests++;
2048 	if (!(frt->frt_flags & FRT_RUNNING)) {
2049 		thread_wakeup((caddr_t)frt);
2050 	}
2051 	lck_mtx_unlock(&frt->frt_lock);
2052 }
2053 
2054 __attribute__((noreturn))
2055 static void
2056 fsw_rps_thread_cont(void *v, wait_result_t w)
2057 {
2058 	struct fsw_rps_thread *frt = v;
2059 	struct nx_flowswitch *fsw = frt->frt_fsw;
2060 
2061 	lck_mtx_lock(&frt->frt_lock);
2062 	if (__improbable(w == THREAD_INTERRUPTIBLE ||
2063 	    (frt->frt_flags & FRT_TERMINATING) != 0)) {
2064 		goto terminate;
2065 	}
2066 	if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2067 		goto done;
2068 	}
2069 	frt->frt_flags |= FRT_RUNNING;
2070 
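	/*
	 * Drain the queue; keep going as long as new requests arrived
	 * while the lock was dropped during processing.
	 */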
2071 	for (;;) {
2072 		uint32_t requests = frt->frt_requests;
2073 		struct pktq pkts;
2074 
2075 		KPKTQ_INIT(&pkts);
2076 		KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2077 		lck_mtx_unlock(&frt->frt_lock);
2078 
2079 		sk_protect_t protect;
2080 		protect = sk_sync_protect();
2081 		FSW_RLOCK(fsw);
2082 		_fsw_receive_locked(fsw, &pkts);
2083 		FSW_RUNLOCK(fsw);
2084 		sk_sync_unprotect(protect);
2085 
2086 		lck_mtx_lock(&frt->frt_lock);
2087 		if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2088 		    requests == frt->frt_requests) {
2089 			frt->frt_requests = 0;
2090 			break;
2091 		}
2092 	}
2093 
2094 done:
2095 	lck_mtx_unlock(&frt->frt_lock);
2096 	if (!(frt->frt_flags & FRT_TERMINATING)) {
2097 		frt->frt_flags &= ~FRT_RUNNING;
2098 		assert_wait(frt, THREAD_UNINT);
2099 		thread_block_parameter(fsw_rps_thread_cont, frt);
2100 		__builtin_unreachable();
2101 	} else {
2102 terminate:
2103 		LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2104 		frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2105 		frt->frt_flags |= FRT_TERMINATED;
2106 
2107 		if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2108 			thread_wakeup((caddr_t)&frt);
2109 		}
2110 		lck_mtx_unlock(&frt->frt_lock);
2111 
2112 		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2113 		    frt->frt_idx);
2114 
2115 		/* for the extra refcnt from kernel_thread_start() */
2116 		thread_deallocate(current_thread());
2117 		/* this is the end */
2118 		thread_terminate(current_thread());
2119 		/* NOTREACHED */
2120 		__builtin_unreachable();
2121 	}
2122 
2123 	/* must never get here */
2124 	VERIFY(0);
2125 	/* NOTREACHED */
2126 	__builtin_unreachable();
2127 }
2128 
2129 __attribute__((noreturn))
2130 static void
2131 fsw_rps_thread_func(void *v, wait_result_t w)
2132 {
2133 #pragma unused(w)
2134 	struct fsw_rps_thread *frt = v;
2135 	struct nx_flowswitch *fsw = frt->frt_fsw;
2136 
2137 	char thread_name[MAXTHREADNAMESIZE];
2138 	bzero(thread_name, sizeof(thread_name));
2139 	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2140 	    if_name(fsw->fsw_ifp), frt->frt_idx);
2141 	thread_set_thread_name(frt->frt_thread, thread_name);
2142 	SK_D("%s spawned", thread_name);
2143 
2144 	net_thread_marks_push(NET_THREAD_SYNC_RX);
2145 	assert_wait(frt, THREAD_UNINT);
2146 	(void) thread_block_parameter(fsw_rps_thread_cont, frt);
2147 
2148 	__builtin_unreachable();
2149 }
2150 
2151 static void
2152 fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2153 {
2154 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2155 	uint64_t f = (1 * NSEC_PER_MSEC);
2156 	uint64_t s = (1000 * NSEC_PER_SEC);
2157 	uint32_t c = 0;
2158 
2159 	lck_mtx_lock(&frt->frt_lock);
2160 	frt->frt_flags |= FRT_TERMINATING;
2161 
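	/*
	 * Poke the thread until it marks itself terminated; re-check after
	 * 1ms the first time around, then fall back to a long timeout.
	 */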
2162 	while (!(frt->frt_flags & FRT_TERMINATED)) {
2163 		uint64_t t = 0;
2164 		nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2165 		clock_absolutetime_interval_to_deadline(t, &t);
2166 		ASSERT(t != 0);
2167 
2168 		frt->frt_flags |= FRT_TERMINATEBLOCK;
2169 		if (!(frt->frt_flags & FRT_RUNNING)) {
2170 			thread_wakeup_one((caddr_t)frt);
2171 		}
2172 		(void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2173 		lck_mtx_unlock(&frt->frt_lock);
2174 		thread_block(THREAD_CONTINUE_NULL);
2175 		lck_mtx_lock(&frt->frt_lock);
2176 		frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2177 	}
2178 	ASSERT(frt->frt_flags & FRT_TERMINATED);
2179 	lck_mtx_unlock(&frt->frt_lock);
2180 	frt->frt_thread = THREAD_NULL;
2181 }
2182 
2183 static void
2184 fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2185 {
2186 	kern_return_t error;
2187 	struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2188 	lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2189 	frt->frt_idx = i;
2190 	frt->frt_fsw = fsw;
2191 	error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2192 	ASSERT(!error);
2193 	KPKTQ_INIT(&frt->frt_pktq);
2194 }
2195 
2196 int
2197 fsw_rps_set_nthreads(struct nx_flowswitch *fsw, uint32_t n)
2198 {
2199 	if (n > FSW_RPS_MAX_NTHREADS) {
2200 		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2201 		return EINVAL;
2202 	}
2203 
2204 	FSW_WLOCK(fsw);
2205 	if (n < fsw->fsw_rps_nthreads) {
2206 		for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2207 			fsw_rps_thread_join(fsw, i);
2208 		}
2209 		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2210 		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
2211 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
2212 	} else if (n > fsw->fsw_rps_nthreads) {
2213 		fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2214 		    fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
2215 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
2216 		for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) {
2217 			fsw_rps_thread_spawn(fsw, i);
2218 		}
2219 	}
2220 	fsw->fsw_rps_nthreads = n;
2221 	FSW_WUNLOCK(fsw);
2222 	return 0;
2223 }
2224 
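/*
 * Hash the packet's 5-tuple flow key to pick an RPS thread; packets that
 * cannot be demuxed or classified all land on thread 0.
 */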
2225 static uint32_t
2226 get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2227 {
2228 	sa_family_t af = fsw->fsw_demux(fsw, pkt);
2229 	if (__improbable(af == AF_UNSPEC)) {
2230 		return 0;
2231 	}
2232 
2233 	flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2234 
2235 	if (__improbable((pkt->pkt_qum_qflags &
2236 	    QUM_F_FLOW_CLASSIFIED) == 0)) {
2237 		return 0;
2238 	}
2239 
2240 	struct flow_key key;
2241 	flow_pkt2key(pkt, true, &key);
2242 	key.fk_mask = FKMASK_5TUPLE;
2243 
2244 	uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2245 
2246 	return id;
2247 }
2248 
2249 #endif /* !DEVELOPMENT && !DEBUG */
2250 
2251 void
2252 fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2253 {
2254 	FSW_RLOCK(fsw);
2255 #if (DEVELOPMENT || DEBUG)
2256 	if (fsw->fsw_rps_nthreads != 0) {
2257 		struct __kern_packet *pkt, *tpkt;
2258 		bitmap_t map = 0;
2259 
2260 		_CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2261 		KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2262 			uint32_t id = get_rps_id(fsw, pkt);
2263 			KPKTQ_REMOVE(pktq, pkt);
2264 			fsw_rps_rx(fsw, id, pkt);
2265 			bitmap_set(&map, id);
2266 		}
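		/* wake up every RPS thread that received at least one packet */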
2267 		for (int i = bitmap_first(&map, 64); i >= 0;
2268 		    i = bitmap_next(&map, i)) {
2269 			fsw_rps_thread_schedule(fsw, i);
2270 		}
2271 	} else
2272 #endif /* !DEVELOPMENT && !DEBUG */
2273 	{
2274 		_fsw_receive_locked(fsw, pktq);
2275 	}
2276 	FSW_RUNLOCK(fsw);
2277 }
2278 
2279 int
2280 fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t *pkts,
2281     uint32_t n_pkts)
2282 {
2283 #pragma unused(handle)
2284 	struct nx_flowswitch *fsw = handle;
2285 	struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2286 	struct pktq pktq;
2287 	sk_protect_t protect;
2288 	uint32_t i;
2289 
2290 	ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2291 
2292 	for (i = 0; i < n_pkts; i++) {
2293 		ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2294 		ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2295 		kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2296 	}
2297 
2298 	protect = sk_sync_protect();
2299 	KPKTQ_INIT(&pktq);
2300 	pkts_to_pktq(kpkts, n_pkts, &pktq);
2301 
2302 	fsw_receive(fsw, &pktq);
2303 	KPKTQ_FINI(&pktq);
2304 	sk_sync_unprotect(protect);
2305 
2306 	return 0;
2307 }
2308 
2309 static void
2310 fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2311 {
2312 	classq_pkt_t p;
2313 	struct netem *ne;
2314 	struct __kern_packet *pkt, *tpkt;
2315 
2316 	ASSERT(fsw->fsw_ifp != NULL);
2317 	ne = fsw->fsw_ifp->if_input_netem;
2318 	ASSERT(ne != NULL);
2319 	KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2320 		bool pdrop;
2321 		KPKTQ_REMOVE(q, pkt);
2322 		CLASSQ_PKT_INIT_PACKET(&p, pkt);
2323 		netem_enqueue(ne, &p, &pdrop);
2324 	}
2325 }
2326 
2327 void
2328 fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2329     struct nexus_pkt_stats *out_stats)
2330 {
2331 	struct __kern_packet *pkt = pkt_head, *next;
2332 	struct nx_flowswitch *fsw;
2333 	uint32_t n_bytes = 0, n_pkts = 0;
2334 	uint64_t total_pkts = 0, total_bytes = 0;
2335 	struct pktq q;
2336 
2337 	KPKTQ_INIT(&q);
2338 	if (__improbable(devna->na_ifp == NULL ||
2339 	    (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2340 		SK_ERR("fsw not attached, dropping pkt chain");
2341 		pp_free_packet_chain(pkt_head, NULL);
2342 		return;
2343 	}
2344 	while (pkt != NULL) {
2345 		if (__improbable(pkt->pkt_trace_id != 0)) {
2346 			KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2347 			KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2348 		}
2349 		next = pkt->pkt_nextpkt;
2350 		pkt->pkt_nextpkt = NULL;
2351 
2352 		if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2353 			KPKTQ_ENQUEUE(&q, pkt);
2354 			n_bytes += pkt->pkt_length;
2355 		} else {
2356 			DTRACE_SKYWALK1(non__finalized__drop,
2357 			    struct __kern_packet *, pkt);
2358 			FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2359 			pp_free_packet_single(pkt);
2360 			pkt = NULL;
2361 		}
2362 		n_pkts = KPKTQ_LEN(&q);
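		/* flush a full batch, or whatever remains at the end of the chain */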
2363 		if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2364 			if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2365 				fsw_dev_input_netem_enqueue(fsw, &q);
2366 			} else {
2367 				fsw_receive(fsw, &q);
2368 			}
2369 			total_pkts += n_pkts;
2370 			total_bytes += n_bytes;
2371 			n_pkts = 0;
2372 			n_bytes = 0;
2373 			KPKTQ_FINI(&q);
2374 		}
2375 		pkt = next;
2376 	}
2377 	ASSERT(KPKTQ_LEN(&q) == 0);
2378 	FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2379 	if (out_stats != NULL) {
2380 		out_stats->nps_pkts = total_pkts;
2381 		out_stats->nps_bytes = total_bytes;
2382 	}
2383 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2384 }
2385 
2386 static int
2387 dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2388     struct __kern_packet *dpkt)
2389 {
2390 	struct mbuf *m = NULL;
2391 	uint32_t bdlen, bdlim, bdoff;
2392 	uint8_t *bdaddr;
2393 	unsigned int one = 1;
2394 	int err = 0;
2395 
2396 	err = mbuf_allocpacket(MBUF_DONTWAIT,
2397 	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2398 #if (DEVELOPMENT || DEBUG)
2399 	if (m != NULL) {
2400 		_FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2401 	}
2402 #endif /* DEVELOPMENT || DEBUG */
2403 	if (__improbable(m == NULL)) {
2404 		FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2405 		err = ENOBUFS;
2406 		goto done;
2407 	}
2408 
2409 	MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2410 	if (fsw->fsw_frame_headroom > bdlim) {
2411 		SK_ERR("not enough space in buffer for headroom");
2412 		err = EINVAL;
2413 		goto done;
2414 	}
2415 
2416 	dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2417 	dpkt->pkt_mbuf = m;
2418 	dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2419 
2420 	/* packet copy into mbuf */
2421 	fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2422 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2423 	    fsw->fsw_frame_headroom, spkt->pkt_length,
2424 	    PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2425 	    spkt->pkt_csum_tx_start_off);
2426 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2427 
2428 	/* header copy into dpkt buffer for classification */
2429 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2430 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2431 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2432 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2433 	uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2434 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2435 	    sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2436 
2437 	/*
2438 	 * fsw->fsw_frame_headroom is after m_data, thus we treat m_data the
2439 	 * same as the buflet baddr; m_data always points to the beginning of
2440 	 * the packet and should represent the same offset as baddr + headroom.
2441 	 */
2442 	ASSERT((uintptr_t)m->m_data ==
2443 	    ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2444 
2445 done:
2446 	return err;
2447 }
2448 
2449 static int
2450 dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2451     struct __kern_packet *dpkt)
2452 {
2453 	struct ifnet *ifp = fsw->fsw_ifp;
2454 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2455 
2456 	if (headroom > UINT8_MAX) {
2457 		SK_ERR("headroom too large %d", headroom);
2458 		return ERANGE;
2459 	}
2460 	dpkt->pkt_headroom = (uint8_t)headroom;
2461 	ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2462 	dpkt->pkt_l2_len = 0;
2463 	dpkt->pkt_link_flags = spkt->pkt_link_flags;
2464 
2465 	kern_packet_t sph = SK_PTR_ENCODE(spkt,
2466 	    METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2467 	kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2468 	    METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2469 	fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2470 	    dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2471 	    spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2472 	    (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2473 	    (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2474 	    (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2475 
2476 	FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2477 
2478 	return 0;
2479 }
2480 
2481 #if SK_LOG
2482 /* Hoisted out of line to reduce kernel stack footprint */
2483 SK_LOG_ATTRIBUTE
2484 static void
2485 dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2486     struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2487 {
2488 	struct proc *p = current_proc();
2489 	struct ifnet *ifp = fsw->fsw_ifp;
2490 	uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2491 
2492 	if (error == ERANGE) {
2493 		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2494 		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2495 		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2496 		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2497 	} else if (error == ENOBUFS) {
2498 		SK_DF(logflags, "%s(%d) packet allocation failure",
2499 		    sk_proc_name_address(p), sk_proc_pid(p));
2500 	} else if (error == 0) {
2501 		ASSERT(dpkt != NULL);
2502 		char *daddr;
2503 		MD_BUFLET_ADDR_ABS(dpkt, daddr);
2504 		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2505 		    sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
2506 		    dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2507 		    (uint32_t)fsw->fsw_frame_headroom,
2508 		    (uint32_t)ifp->if_tx_headroom);
2509 		SK_DF(logflags | SK_VERB_DUMP, "%s",
2510 		    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
2511 	} else {
2512 		SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p), sk_proc_pid(p), error);
2513 	}
2514 }
2515 #else
2516 #define dp_copy_to_dev_log(...)
2517 #endif /* SK_LOG */
2518 
2519 static void
2520 fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2521 {
2522 	ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2523 	ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2524 
2525 	SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2526 	/* Copy packet metadata */
2527 	_QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2528 	_PKT_COPY(spkt, dpkt);
2529 	_PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2530 	ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2531 	    !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2532 	ASSERT(dpkt->pkt_mbuf == NULL);
2533 
2534 	/* Copy AQM metadata */
2535 	dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2536 	dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2537 	_CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
2538 	_UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2539 	_UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2540 	dpkt->pkt_policy_id = spkt->pkt_policy_id;
2541 	dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2542 }
2543 
2544 static int
2545 dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2546     struct __kern_packet *dpkt)
2547 {
2548 	const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2549 	struct ifnet *ifp = fsw->fsw_ifp;
2550 	uint32_t dev_pkt_len;
2551 	int err = 0;
2552 
2553 	fsw_pkt_copy_metadata(spkt, dpkt);
2554 	switch (fsw->fsw_classq_enq_ptype) {
2555 	case QP_MBUF:
2556 		err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2557 		break;
2558 
2559 	case QP_PACKET:
2560 		dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2561 		    spkt->pkt_length;
2562 		if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2563 			FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2564 			err = ERANGE;
2565 			goto done;
2566 		}
2567 		err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2568 		break;
2569 
2570 	default:
2571 		VERIFY(0);
2572 		__builtin_unreachable();
2573 	}
2574 done:
2575 	dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2576 	return err;
2577 }
2578 
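/*
 * Copy only a header estimate (at most 128 bytes) into the device packet
 * for classification; pkt_length still reflects the full source length
 * and the packet is marked PKT_F_TRUNCATED.
 */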
2579 static int
2580 dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2581     struct __kern_packet *dpkt)
2582 {
2583 	uint8_t *sbaddr, *dbaddr;
2584 	uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2585 	uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2586 
2587 	fsw_pkt_copy_metadata(spkt, dpkt);
2588 
2589 	MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2590 	ASSERT(sbaddr != NULL);
2591 	sbaddr += spkt->pkt_headroom;
2592 
2593 	MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2594 	ASSERT(dbaddr != NULL);
2595 	dpkt->pkt_headroom = (uint8_t)headroom;
2596 	dbaddr += headroom;
2597 
2598 	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2599 	METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2600 
2601 	/* packet length is set to the full length */
2602 	dpkt->pkt_length = spkt->pkt_length;
2603 	dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2604 	return 0;
2605 }
2606 
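/*
 * Detach the backing mbuf from a compat packet, transplant the flow
 * metadata onto its pkthdr, and free the now-empty packet; the caller
 * owns the returned mbuf.
 */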
2607 static struct mbuf *
2608 convert_pkt_to_mbuf(struct __kern_packet *pkt)
2609 {
2610 	ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2611 	ASSERT(pkt->pkt_mbuf != NULL);
2612 	struct mbuf *m = pkt->pkt_mbuf;
2613 
2614 	/* pass additional metadata generated from flow parse/lookup */
2615 	_CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2616 	    sizeof(pkt->pkt_flow_token));
2617 	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2618 	    sizeof(pkt->pkt_flowsrc_token));
2619 	_CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2620 	    sizeof(pkt->pkt_flowsrc_fidx));
2621 	m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2622 	m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2623 	m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2624 	m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2625 	m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2626 	m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2627 	m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2628 
2629 	if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2630 		m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2631 	}
2632 
2633 	/* The packet should have a timestamp by the time we get here. */
2634 	m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2635 	m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2636 
2637 	m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2638 	m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2639 	/* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2640 	m->m_pkthdr.pkt_hdr = m->m_data + pkt->pkt_l2_len;
2641 
2642 	if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2643 		m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2644 	}
2645 	KPKT_CLEAR_MBUF_DATA(pkt);
2646 
2647 	/* mbuf has been consumed, release packet as well */
2648 	ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2649 	pp_free_packet_single(pkt);
2650 	return m;
2651 }
2652 
2653 static void
2654 convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2655     struct mbuf **head, struct mbuf **tail,
2656     uint32_t *cnt, uint32_t *bytes)
2657 {
2658 	struct __kern_packet *pkt = pkt_list, *next;
2659 	struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL;
2660 	uint32_t c = 0, b = 0;
2661 
2662 	while (pkt != NULL) {
2663 		next = pkt->pkt_nextpkt;
2664 		pkt->pkt_nextpkt = NULL;
2665 		m = convert_pkt_to_mbuf(pkt);
2666 		ASSERT(m != NULL);
2667 
2668 		*m_tailp = m;
2669 		m_tailp = &m->m_nextpkt;
2670 		c++;
2671 		b += m_pktlen(m);
2672 		pkt = next;
2673 	}
2674 	if (head != NULL) {
2675 		*head = m_head;
2676 	}
2677 	if (tail != NULL) {
2678 		*tail = m;
2679 	}
2680 	if (cnt != NULL) {
2681 		*cnt = c;
2682 	}
2683 	if (bytes != NULL) {
2684 		*bytes = b;
2685 	}
2686 }
2687 
2688 SK_NO_INLINE_ATTRIBUTE
2689 static int
2690 classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2691     struct __kern_packet *pkt)
2692 {
2693 	struct ifnet *ifp = fsw->fsw_ifp;
2694 	boolean_t pkt_drop = FALSE;
2695 	int err;
2696 
2697 	FSW_LOCK_ASSERT_HELD(fsw);
2698 	ASSERT(fsw->fsw_classq_enabled);
2699 	ASSERT(pkt->pkt_flow_token != 0);
2700 	fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2701 	    1, pkt->pkt_length);
2702 
2703 	if (__improbable(pkt->pkt_trace_id != 0)) {
2704 		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2705 		KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2706 	}
2707 
2708 	switch (fsw->fsw_classq_enq_ptype) {
2709 	case QP_MBUF: {                         /* compat interface */
2710 		struct mbuf *m;
2711 
2712 		m = convert_pkt_to_mbuf(pkt);
2713 		ASSERT(m != NULL);
2714 		pkt = NULL;
2715 
2716 		/* ifnet_enqueue consumes mbuf */
2717 		err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2718 		m = NULL;
2719 #if (DEVELOPMENT || DEBUG)
2720 		if (__improbable(!pkt_drop)) {
2721 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2722 		}
2723 #endif /* DEVELOPMENT || DEBUG */
2724 		if (pkt_drop) {
2725 			FSW_STATS_INC(FSW_STATS_DROP);
2726 			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2727 		}
2728 		break;
2729 	}
2730 	case QP_PACKET: {                       /* native interface */
2731 		/* ifnet_enqueue consumes packet */
2732 		err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2733 		pkt = NULL;
2734 #if (DEVELOPMENT || DEBUG)
2735 		if (__improbable(!pkt_drop)) {
2736 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2737 		}
2738 #endif /* DEVELOPMENT || DEBUG */
2739 		if (pkt_drop) {
2740 			FSW_STATS_INC(FSW_STATS_DROP);
2741 			FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2742 		}
2743 		break;
2744 	}
2745 	default:
2746 		err = EINVAL;
2747 		VERIFY(0);
2748 		/* NOTREACHED */
2749 		__builtin_unreachable();
2750 	}
2751 
2752 	return err;
2753 }
2754 
2755 static int
2756 classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2757     struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
2758     uint32_t cnt, uint32_t bytes)
2759 {
2760 	struct ifnet *ifp = fsw->fsw_ifp;
2761 	boolean_t pkt_drop = FALSE;
2762 	uint32_t svc;
2763 	int err;
2764 
2765 	FSW_LOCK_ASSERT_HELD(fsw);
2766 	ASSERT(fsw->fsw_classq_enabled);
2767 	ASSERT(pkt_head->pkt_flow_token != 0);
2768 
2769 	/*
2770 	 * All packets in the flow should have the same svc.
2771 	 */
2772 	svc = pkt_head->pkt_svc_class;
2773 	fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2774 
2775 	switch (fsw->fsw_classq_enq_ptype) {
2776 	case QP_MBUF: {                         /* compat interface */
2777 		struct mbuf *m_head = NULL, *m_tail = NULL;
2778 		uint32_t c = 0, b = 0;
2779 
2780 		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2781 		ASSERT(m_head != NULL && m_tail != NULL);
2782 		ASSERT(c == cnt);
2783 		ASSERT(b == bytes);
2784 		pkt_head = NULL;
2785 
2786 		/* ifnet_enqueue consumes mbuf */
2787 		err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
2788 		    bytes, FALSE, &pkt_drop);
2789 		m_head = NULL;
2790 		m_tail = NULL;
2791 #if (DEVELOPMENT || DEBUG)
2792 		if (__improbable(!pkt_drop)) {
2793 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2794 		}
2795 #endif /* DEVELOPMENT || DEBUG */
2796 		if (pkt_drop) {
2797 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2798 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2799 			    cnt);
2800 		}
2801 		break;
2802 	}
2803 	case QP_PACKET: {                       /* native interface */
2804 		/* ifnet_enqueue consumes packet */
2805 		err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
2806 		    bytes, FALSE, &pkt_drop);
2807 		pkt_head = NULL;
2808 #if (DEVELOPMENT || DEBUG)
2809 		if (__improbable(!pkt_drop)) {
2810 			_FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2811 		}
2812 #endif /* DEVELOPMENT || DEBUG */
2813 		if (pkt_drop) {
2814 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2815 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2816 			    cnt);
2817 		}
2818 		break;
2819 	}
2820 	default:
2821 		err = EINVAL;
2822 		VERIFY(0);
2823 		/* NOTREACHED */
2824 		__builtin_unreachable();
2825 	}
2826 
2827 	return err;
2828 }
2829 
2830 /*
2831  * This code path needs to be kept for interfaces without logical link support.
2832  */
2833 static void
2834 classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2835     bool chain, uint32_t cnt, uint32_t bytes)
2836 {
2837 	bool flowadv_is_set = false;
2838 	struct __kern_packet *pkt, *tail, *tpkt;
2839 	flowadv_idx_t flow_adv_idx;
2840 	bool flowadv_cap;
2841 	flowadv_token_t flow_adv_token;
2842 	int err;
2843 
2844 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2845 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2846 
2847 	if (chain) {
2848 		pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2849 		tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2850 		KPKTQ_INIT(&fe->fe_tx_pktq);
2851 		if (pkt == NULL) {
2852 			return;
2853 		}
2854 		flow_adv_idx = pkt->pkt_flowsrc_fidx;
2855 		flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2856 		flow_adv_token = pkt->pkt_flow_token;
2857 
2858 		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2859 
2860 		/* set flow advisory if needed */
2861 		if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
2862 		    flowadv_cap)) {
2863 			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2864 			    flow_adv_idx, flow_adv_token);
2865 		}
2866 		DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes,
2867 		    bool, flowadv_is_set);
2868 	} else {
2869 		uint32_t c = 0, b = 0;
2870 
2871 		KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2872 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2873 
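			/* stash flow advisory state; the enqueue consumes pkt */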
2874 			flow_adv_idx = pkt->pkt_flowsrc_fidx;
2875 			flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2876 			flow_adv_token = pkt->pkt_flow_token;
2877 
2878 			c++;
2879 			b += pkt->pkt_length;
2880 			err = classq_enqueue_flow_single(fsw, pkt);
2881 
2882 			/* set flow advisory if needed */
2883 			if (__improbable(!flowadv_is_set &&
2884 			    ((err == EQFULL || err == EQSUSPENDED) &&
2885 			    flowadv_cap))) {
2886 				flowadv_is_set = na_flowadv_set(
2887 					flow_get_na(fsw, fe), flow_adv_idx,
2888 					flow_adv_token);
2889 			}
2890 		}
2891 		ASSERT(c == cnt);
2892 		ASSERT(b == bytes);
2893 		DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
2894 		    bool, flowadv_is_set);
2895 	}
2896 
2897 	/* notify flow advisory event */
2898 	if (__improbable(flowadv_is_set)) {
2899 		struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
2900 		if (__probable(r)) {
2901 			na_flowadv_event(r);
2902 			SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
2903 			    "%s(%d) notified of flow update",
2904 			    sk_proc_name_address(current_proc()),
2905 			    sk_proc_pid(current_proc()));
2906 		}
2907 	}
2908 }
2909 
2910 /*
2911  * Logical link code path
2912  */
2913 static void
2914 classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2915     bool chain, uint32_t cnt, uint32_t bytes)
2916 {
2917 #pragma unused(chain)
2918 	struct __kern_packet *pkt, *tail;
2919 	flowadv_idx_t flow_adv_idx;
2920 	bool flowadv_is_set = false;
2921 	bool flowadv_cap;
2922 	flowadv_token_t flow_adv_token;
2923 	uint32_t flowctl = 0, dropped = 0;
2924 	int err;
2925 
2926 	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2927 	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2928 
2929 	pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2930 	tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2931 	KPKTQ_INIT(&fe->fe_tx_pktq);
2932 	if (pkt == NULL) {
2933 		return;
2934 	}
2935 	flow_adv_idx = pkt->pkt_flowsrc_fidx;
2936 	flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2937 	flow_adv_token = pkt->pkt_flow_token;
2938 
2939 	err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
2940 	    &flowctl, &dropped);
2941 
2942 	if (__improbable(err != 0)) {
2943 		/* set flow advisory if needed */
2944 		if (flowctl > 0 && flowadv_cap) {
2945 			flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2946 			    flow_adv_idx, flow_adv_token);
2947 
2948 			/* notify flow advisory event */
2949 			if (flowadv_is_set) {
2950 				struct __kern_channel_ring *r =
2951 				    fsw_flow_get_tx_ring(fsw, fe);
2952 				if (__probable(r)) {
2953 					na_flowadv_event(r);
2954 					SK_DF(SK_VERB_FLOW_ADVISORY |
2955 					    SK_VERB_TX,
2956 					    "%s(%d) notified of flow update",
2957 					    sk_proc_name_address(current_proc()),
2958 					    sk_proc_pid(current_proc()));
2959 				}
2960 			}
2961 		}
2962 		if (dropped > 0) {
2963 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
2964 			STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2965 			    dropped);
2966 		}
2967 	}
2968 }
2969 
2970 static void
2971 tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2972 {
2973 #pragma unused(fsw)
2974 	/* finalize here; no more changes to buflets after classq */
2975 	if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2976 		kern_packet_t ph = SK_PTR_ENCODE(pkt,
2977 		    METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2978 		int err = __packet_finalize(ph);
2979 		VERIFY(err == 0);
2980 	}
2981 }
2982 
2983 static bool
2984 dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2985 {
2986 	struct flow_route *fr = fe->fe_route;
2987 	int err;
2988 
2989 	ASSERT(fr != NULL);
2990 
2991 	if (__improbable(!dp_flow_route_process(fsw, fe))) {
2992 		return false;
2993 	}
2994 	if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
2995 		flow_qset_select_dynamic(fsw, fe, TRUE);
2996 	}
2997 
2998 	_FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
2999 	    _fsw_error35_handler, 1, fr, NULL, NULL);
3000 	_FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3001 	    _fsw_error36_handler, 1, fr, NULL);
3002 
3003 	/*
3004 	 * See if we need to resolve the flow route; note the test against
3005 	 * fr_flags here is done without any lock for performance.  Thus
3006 	 * it's possible that we race against the thread performing route
3007 	 * event updates for a packet (which is OK).  In any case we should
3008 	 * not have any assertion on fr_flags value(s) due to the lack of
3009 	 * serialization.
3010 	 */
3011 	if (fr->fr_flags & FLOWRTF_RESOLVED) {
3012 		goto frame;
3013 	}
3014 
3015 	struct __kern_packet *pkt, *tpkt;
3016 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3017 		err = fsw->fsw_resolve(fsw, fr, pkt);
3018 		_FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3019 		_FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3020 		/*
3021 		 * If resolver returns EJUSTRETURN then we drop the pkt as the
3022 		 * resolver should have converted the pkt into mbuf (or
3023 		 * detached the attached mbuf from pkt) and added it to the
3024 		 * llinfo queue. If we do have a cached llinfo, then proceed
3025 		 * to using it even though it may be stale (very unlikely)
3026 		 * while the resolution is in progress.
3027 		 * Otherwise, any other error results in dropping pkt.
3028 		 */
3029 		if (err == EJUSTRETURN) {
3030 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3031 			pp_free_packet_single(pkt);
3032 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3033 			continue;
3034 		} else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3035 			/* use existing llinfo */
3036 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3037 		} else if (err != 0) {
3038 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3039 			pp_free_packet_single(pkt);
3040 			FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3041 			continue;
3042 		}
3043 	}
3044 
3045 frame:
3046 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3047 		if (fsw->fsw_frame != NULL) {
3048 			fsw->fsw_frame(fsw, fr, pkt);
3049 		}
3050 	}
3051 
3052 	return true;
3053 }
3054 
3055 static void
3056 dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3057 {
3058 #pragma unused(fsw)
3059 	struct __kern_packet *pkt, *tpkt;
3060 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3061 		KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3062 		/* listener is only allowed TCP RST */
3063 		if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3064 		    (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3065 			flow_track_abort_tcp(fe, NULL, pkt);
3066 		} else {
3067 			char *addr;
3068 			MD_BUFLET_ADDR_ABS(pkt, addr);
3069 			SK_ERR("listener flow sends non-RST packet %s",
3070 			    sk_dump(sk_proc_name_address(current_proc()),
3071 			    addr, pkt->pkt_length, 128, NULL, 0));
3072 		}
3073 		pp_free_packet_single(pkt);
3074 	}
3075 }
3076 
3077 static void
3078 fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3079     volatile uint64_t *rt_ts, ifnet_t ifp)
3080 {
3081 	struct timespec now;
3082 	uint64_t now_nsec = 0;
3083 
3084 	if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3085 		nanouptime(&now);
3086 		net_timernsec(&now, &now_nsec);
3087 		pkt->pkt_timestamp = now_nsec;
3088 	}
3089 	pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3090 
3091 	/*
3092 	 * If the packet service class is not background,
3093 	 * update the timestamps on the interface, as well as
3094 	 * the ones in nexus-wide advisory to indicate recent
3095 	 * activity on a foreground flow.
3096 	 */
3097 	if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3098 		ifp->if_fg_sendts = (uint32_t)_net_uptime;
3099 		if (fg_ts != NULL) {
3100 			*fg_ts = _net_uptime;
3101 		}
3102 	}
3103 	if (pkt->pkt_pflags & PKT_F_REALTIME) {
3104 		ifp->if_rt_sendts = (uint32_t)_net_uptime;
3105 		if (rt_ts != NULL) {
3106 			*rt_ts = _net_uptime;
3107 		}
3108 	}
3109 }
3110 
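/*
 * Chain enqueue is used only when globally enabled, the interface is not
 * shaped by netem, IFEF_ENQUEUE_MULTI is not set, and the caller is on
 * the GSO path.
 */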
3111 static bool
3112 fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled)
3113 {
3114 	return fsw_chain_enqueue != 0 &&
3115 	       fsw->fsw_ifp->if_output_netem == NULL &&
3116 	       (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
3117 	       gso_enabled;
3118 }
3119 
3120 void
3121 dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3122     uint32_t flags)
3123 {
3124 	struct pktq dropped_pkts;
3125 	bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3126 	uint32_t cnt = 0, bytes = 0;
3127 	volatile struct sk_nexusadv *nxadv = NULL;
3128 	volatile uint64_t *fg_ts = NULL;
3129 	volatile uint64_t *rt_ts = NULL;
3130 	uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3131 
3132 	KPKTQ_INIT(&dropped_pkts);
3133 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3134 	if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3135 		dp_listener_flow_tx_process(fsw, fe);
3136 		return;
3137 	}
3138 	if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3139 		SK_RDERR(5, "Tx route bad");
3140 		FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3141 		    KPKTQ_LEN(&fe->fe_tx_pktq));
3142 		KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3143 		goto done;
3144 	}
3145 	chain = fsw_chain_enqueue_enabled(fsw, gso);
3146 	if (chain) {
3147 		nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3148 		if (nxadv != NULL) {
3149 			fg_ts = &nxadv->nxadv_fg_sendts;
3150 			rt_ts = &nxadv->nxadv_rt_sendts;
3151 		}
3152 	}
3153 	struct __kern_packet *pkt, *tpkt;
3154 	KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3155 		int err = 0;
3156 
3157 		err = flow_pkt_track(fe, pkt, false);
3158 		if (__improbable(err != 0)) {
3159 			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3160 			FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3161 			KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3162 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3163 			continue;
3164 		}
3165 		_UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3166 		pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3167 
3168 		/* set AQM related values for outgoing packet */
3169 		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3170 			pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3171 			pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3172 			pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3173 		} else {
3174 			pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3175 		}
3176 		_UUID_CLEAR(pkt->pkt_flow_id);
3177 		pkt->pkt_flow_token = fe->fe_flowid;
3178 		pkt->pkt_pflags |= PKT_F_FLOW_ID;
3179 		pkt->pkt_qset_idx = qset_idx;
3180 		pkt->pkt_policy_id = fe->fe_policy_id;
3181 		pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3182 
3183 		/*
3184 		 * The same code is exercised per packet for the non-chain case
3185 		 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3186 		 * re-walking the chain later.
3187 		 */
3188 		if (chain) {
3189 			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
3190 		}
3191 		/* mark packet tos/svc_class */
3192 		fsw_qos_mark(fsw, fe, pkt);
3193 
3194 		tx_finalize_packet(fsw, pkt);
3195 		bytes += pkt->pkt_length;
3196 		cnt++;
3197 	}
3198 
3199 	/* snoop after it's finalized */
3200 	if (__improbable(pktap_total_tap_count != 0)) {
3201 		fsw_snoop(fsw, fe, false);
3202 	}
3203 	if (fe->fe_qset != NULL) {
3204 		classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3205 	} else {
3206 		classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3207 	}
3208 done:
3209 	dp_drop_pktq(fsw, &dropped_pkts);
3210 }
3211 
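/*
 * A non-first IP fragment carries no L4 header, so it can only be matched
 * by pairing its fragment id with the flow entry of the immediately
 * preceding first fragment (prev_fe).
 */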
3212 static struct flow_entry *
3213 tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3214     struct flow_entry *prev_fe, struct __kern_packet *pkt)
3215 {
3216 	ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3217 
3218 	if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3219 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3220 		SK_ERR("%s(%d) invalid zero fragment id",
3221 		    sk_proc_name_address(current_proc()),
3222 		    sk_proc_pid(current_proc()));
3223 		return NULL;
3224 	}
3225 
3226 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
3227 	    "%s(%d) continuation frag, id %u",
3228 	    sk_proc_name_address(current_proc()),
3229 	    sk_proc_pid(current_proc()),
3230 	    pkt->pkt_flow_ip_frag_id);
3231 	if (__improbable(prev_fe == NULL ||
3232 	    !prev_fe->fe_tx_is_cont_frag)) {
3233 		SK_ERR("%s(%d) unexpected continuation frag, id %u",
3234 		    sk_proc_name_address(current_proc()),
3235 		    sk_proc_pid(current_proc()),
3236 		    pkt->pkt_flow_ip_frag_id);
3237 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3238 		return NULL;
3239 	}
3240 	if (__improbable(pkt->pkt_flow_ip_frag_id !=
3241 	    prev_fe->fe_tx_frag_id)) {
3242 		FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3243 		SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
3244 		    sk_proc_name_address(current_proc()),
3245 		    sk_proc_pid(current_proc()),
3246 		    pkt->pkt_flow_ip_frag_id,
3247 		    prev_fe->fe_tx_frag_id);
3248 		return NULL;
3249 	}
3250 
3251 	return prev_fe;
3252 }
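
/*
 * Illustrative sketch (editor's note, not part of the build): the
 * fragment train the checks above enforce.  The first fragment of a
 * datagram takes the normal lookup path and records its nonzero IP
 * fragment ID on the flow entry; every continuation must carry the
 * same ID and follow immediately, or it is dropped and counted under
 * FSW_STATS_TX_FRAG_BAD_CONT:
 *
 *   frag 1 (pkt_flow_ip_is_first_frag)  -> tx_lookup_flow(),
 *                                          fe_tx_frag_id recorded
 *   frag 2..n (!is_first_frag)          -> tx_process_continuous_ip_frag(),
 *                                          pkt_flow_ip_frag_id must
 *                                          equal fe_tx_frag_id
 */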
3253 
3254 static struct flow_entry *
3255 tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3256     struct flow_entry *prev_fe)
3257 {
3258 	struct flow_entry *fe;
3259 
3260 	fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3261 	if (__improbable(fe == NULL)) {
3262 		goto done;
3263 	}
3264 
3265 	if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3266 		SK_RDERR(5, "Tx flow torn down");
3267 		FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3268 		flow_entry_release(&fe);
3269 		goto done;
3270 	}
3271 
3272 	_FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3273 	    null_func);
3274 
3275 	if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3276 		uuid_string_t flow_id_str, pkt_id_str;
3277 		sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3278 		sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3279 		SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
3280 		flow_entry_release(&fe);
3281 		FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3282 	}
3283 
3284 done:
3285 	return fe;
3286 }
3287 
3288 static inline void
3289 tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3290     uint32_t flags)
3291 {
3292 	ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3293 	ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3294 
3295 	SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3296 	    KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3297 
3298 	/* flow related processing (default, agg, etc.) */
3299 	fe->fe_tx_process(fsw, fe, flags);
3300 
3301 	KPKTQ_FINI(&fe->fe_tx_pktq);
3302 }
3303 
3304 #if SK_LOG
3305 static void
3306 dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3307 {
3308 	char *pkt_buf;
3309 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3310 	SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
3311 	    sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
3312 	    pkt->pkt_length, 128, NULL, 0));
3313 }
3314 #else /* !SK_LOG */
3315 #define dp_tx_log_pkt(...)
3316 #endif /* !SK_LOG */
3317 
3318 static void
3319 dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3320 {
3321 	struct __kern_packet *spkt, *pkt;
3322 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3323 	struct flow_entry *fe, *prev_fe;
3324 	struct pktq dropped_pkts, dpktq;
3325 	struct nexus_adapter *dev_na;
3326 	struct kern_pbufpool *dev_pp;
3327 	struct ifnet *ifp;
3328 	sa_family_t af;
3329 	uint32_t n_pkts, n_flows = 0;
3330 	boolean_t do_pacing = FALSE;
3331 
3332 	int err;
3333 	KPKTQ_INIT(&dpktq);
3334 	KPKTQ_INIT(&dropped_pkts);
3335 	n_pkts = KPKTQ_LEN(spktq);
3336 
3337 	FSW_RLOCK(fsw);
3338 	if (__improbable(FSW_QUIESCED(fsw))) {
3339 		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3340 		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3341 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3342 		goto done;
3343 	}
3344 	dev_na = fsw->fsw_dev_ch->ch_na;
3345 	if (__improbable(dev_na == NULL)) {
3346 		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3347 		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3348 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3349 		goto done;
3350 	}
3351 	/*
3352 	 * fsw_ifp should still be valid at this point. If fsw is detached
3353 	 * after fsw_lock is released, this ifp will remain valid and
3354 	 * netif_transmit() will behave properly even if the ifp is in
3355 	 * detached state.
3356 	 */
3357 	ifp = fsw->fsw_ifp;
3358 
3359 	/* batch allocate enough packets */
3360 	dev_pp = na_kr_get_pp(dev_na, NR_TX);
3361 
3362 	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3363 	    NULL, SKMEM_NOSLEEP);
3364 #if DEVELOPMENT || DEBUG
3365 	if (__probable(err != ENOMEM)) {
3366 		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3367 	}
3368 #endif /* DEVELOPMENT || DEBUG */
3369 	if (__improbable(err == ENOMEM)) {
3370 		ASSERT(KPKTQ_EMPTY(&dpktq));
3371 		KPKTQ_CONCAT(&dropped_pkts, spktq);
3372 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3373 		SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3374 		goto done;
3375 	} else if (__improbable(err == EAGAIN)) {
3376 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3377 		    (n_pkts - KPKTQ_LEN(&dpktq)));
3378 		FSW_STATS_ADD(FSW_STATS_DROP,
3379 		    (n_pkts - KPKTQ_LEN(&dpktq)));
3380 	}
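	/*
	 * Editor's sketch of the allocator contract assumed above
	 * (illustrative only; drop_everything()/count_drops() are
	 * hypothetical helpers): ENOMEM means nothing was allocated and
	 * the whole batch is dropped; EAGAIN means a partial allocation,
	 * so only the shortfall is counted and the loop below consumes
	 * whatever did arrive.
	 *
	 *   err = pp_alloc_pktq(pp, frags, &q, want, ...);
	 *   if (err == ENOMEM)        // KPKTQ_LEN(&q) == 0
	 *           drop_everything();
	 *   else if (err == EAGAIN)   // 0 < KPKTQ_LEN(&q) < want
	 *           count_drops(want - KPKTQ_LEN(&q));
	 */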
3381 
3382 	n_pkts = KPKTQ_LEN(&dpktq);
3383 	prev_fe = NULL;
3384 	KPKTQ_FOREACH(spkt, spktq) {
3385 		if (n_pkts == 0) {
3386 			break;
3387 		}
3388 		--n_pkts;
3389 
3390 		KPKTQ_DEQUEUE(&dpktq, pkt);
3391 		ASSERT(pkt != NULL);
3392 		err = dp_copy_to_dev(fsw, spkt, pkt);
3393 		if (__improbable(err != 0)) {
3394 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3395 			continue;
3396 		}
3397 
3398 		do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0);
3399 		af = fsw_ip_demux(fsw, pkt);
3400 		if (__improbable(af == AF_UNSPEC)) {
3401 			dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3402 			FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3403 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3404 			continue;
3405 		}
3406 
3407 		err = flow_pkt_classify(pkt, ifp, af, false);
3408 		if (__improbable(err != 0)) {
3409 			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3410 			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3411 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3412 			continue;
3413 		}
3414 
3415 		if (__improbable(pkt->pkt_flow_ip_is_frag &&
3416 		    !pkt->pkt_flow_ip_is_first_frag)) {
3417 			fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3418 			if (__probable(fe != NULL)) {
3419 				flow_entry_retain(fe);
3420 				goto flow_batch;
3421 			} else {
3422 				FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3423 				KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3424 				continue;
3425 			}
3426 		}
3427 
3428 		fe = tx_lookup_flow(fsw, pkt, prev_fe);
3429 		if (__improbable(fe == NULL)) {
3430 			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3431 			KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3432 			prev_fe = NULL;
3433 			continue;
3434 		}
3435 flow_batch:
3436 		tx_flow_batch_packet(&fes, fe, pkt);
3437 		prev_fe = fe;
3438 	}
3439 
3440 	struct flow_entry *tfe = NULL;
3441 	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3442 		tx_flow_process(fsw, fe, 0);
3443 		TAILQ_REMOVE(&fes, fe, fe_tx_link);
3444 		fe->fe_tx_is_cont_frag = false;
3445 		fe->fe_tx_frag_id = 0;
3446 		flow_entry_release(&fe);
3447 		n_flows++;
3448 	}
3449 
3450 done:
3451 	FSW_RUNLOCK(fsw);
3452 	if (n_flows > 0) {
3453 		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3454 	}
3455 	dp_drop_pktq(fsw, &dropped_pkts);
3456 	KPKTQ_FINI(&dropped_pkts);
3457 	KPKTQ_FINI(&dpktq);
3458 }
3459 
3460 static sa_family_t
3461 get_tso_af(struct __kern_packet *pkt)
3462 {
3463 	packet_tso_flags_t tso_flags;
3464 
3465 	tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3466 	if (tso_flags == PACKET_TSO_IPV4) {
3467 		return AF_INET;
3468 	} else if (tso_flags == PACKET_TSO_IPV6) {
3469 		return AF_INET6;
3470 	} else {
3471 		panic("invalid tso flags: 0x%x\n", tso_flags);
3472 		/* NOTREACHED */
3473 		__builtin_unreachable();
3474 	}
3475 }
3476 
3477 static inline void
3478 update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr,
3479     uint16_t payload_sz)
3480 {
3481 	struct tcphdr *tcp = tcphdr;
3482 
3483 	DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3484 	    void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3485 	pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3486 	pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3487 	pkt->pkt_flow_tcp_flags = tcp->th_flags;
3488 	pkt->pkt_flow_tcp_seq = tcp->th_seq;
3489 	pkt->pkt_flow_ulen = payload_sz;
3490 }
3491 
3492 static int
3493 do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3494     struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3495     struct pktq *gso_pktq)
3496 {
3497 	ifnet_t ifp = fsw->fsw_ifp;
3498 	struct __kern_packet *pkt = first_pkt;
3499 	uint8_t proto = pkt->pkt_flow_ip_proto;
3500 	uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3501 	uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3502 	uint16_t total_hlen = ip_hlen + tcp_hlen;
3503 	uint16_t mtu = (uint16_t)ifp->if_mtu;
3504 	uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3505 	uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3506 	uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3507 	kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3508 	uint8_t *orig_pkt_baddr;
3509 	struct tcphdr *tcp;
3510 	struct ip *ip;
3511 	struct ip6_hdr *ip6;
3512 	uint32_t tcp_seq;
3513 	uint16_t ipid;
3514 	uint32_t pseudo_hdr_csum, bufsz;
3515 
3516 	ASSERT(headroom <= UINT8_MAX);
3517 	if (proto != IPPROTO_TCP) {
3518 		SK_ERR("invalid proto: %d", proto);
3519 		DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3520 		    fsw, ifnet_t, ifp, uint8_t, proto);
3521 		return EINVAL;
3522 	}
3523 	if (mss == 0 || mss > (mtu - total_hlen)) {
3524 		SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3525 		    mss, mtu, total_hlen);
3526 		DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3527 		    fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3528 		    uint32_t, total_hlen);
3529 		return EINVAL;
3530 	}
3531 	bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3532 	if ((headroom + total_hlen + mss) > bufsz) {
3533 		SK_ERR("invalid args: headroom %d, total_hlen %d, "
3534 		    "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3535 		DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3536 		    fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3537 		    total_hlen, uint16_t, mss, uint32_t, bufsz);
3538 		return EINVAL;
3539 	}
3540 	n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
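	/*
	 * Worked example (illustrative): with total_len 4252, ip_hlen 20
	 * and tcp_hlen 32 (total_hlen 52), mss 1400 gives a 4200-byte
	 * payload and SK_ROUNDUP(4200, 1400) / 1400 == 3 segments.
	 */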
3541 
3542 	ASSERT(pkt->pkt_headroom == headroom);
3543 	ASSERT(pkt->pkt_length == total_len);
3544 	ASSERT(pkt->pkt_l2_len == 0);
3545 	ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3546 	ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3547 	pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3548 	pkt->pkt_proto_seg_sz = 0;
3549 	pkt->pkt_csum_flags = 0;
3550 	MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3551 	orig_pkt_baddr += orig_pkt->pkt_headroom;
3552 
3553 	if (af == AF_INET) {
3554 		ip = (struct ip *)pkt->pkt_flow_ip_hdr;
3555 		tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
3556 		ipid = ip->ip_id;
3557 		pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3558 		    pkt->pkt_flow_ipv4_dst.s_addr, 0);
3559 	} else {
3560 		ASSERT(af == AF_INET6);
3561 		tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
3562 		pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3563 		    &pkt->pkt_flow_ipv6_dst, 0);
3564 	}
3565 	tcp_seq = ntohl(tcp->th_seq);
3566 
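	/*
	 * Per-segment header fixups performed in the loop below follow
	 * standard TSO semantics: FIN/PSH survive only on the last
	 * segment, CWR only on the first, th_seq advances by the payload
	 * carried so far and, for IPv4, the IP ID is incremented for
	 * each segment before both checksums are recomputed.
	 */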
3567 	for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3568 	    off += payload_sz) {
3569 		uint8_t *baddr, *baddr0;
3570 		uint32_t partial;
3571 
3572 		if (pkt == NULL) {
3573 			n++;
3574 			KPKTQ_DEQUEUE(dev_pktq, pkt);
3575 			ASSERT(pkt != NULL);
3576 		}
3577 		MD_BUFLET_ADDR_ABS(pkt, baddr0);
3578 		baddr = baddr0;
3579 		baddr += headroom;
3580 
3581 		/* Copy headers from the original packet */
3582 		if (n != 1) {
3583 			ASSERT(pkt != first_pkt);
3584 			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3585 			fsw_pkt_copy_metadata(first_pkt, pkt);
3586 
3587 			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3588 			/* flow info still needs to be updated below */
3589 			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3590 			    sizeof(*pkt->pkt_flow));
3591 			pkt->pkt_trace_id = 0;
3592 			ASSERT(pkt->pkt_headroom == headroom);
3593 		} else {
3594 			METADATA_SET_LEN(pkt, 0, 0);
3595 		}
3596 		baddr += total_hlen;
3597 
3598 		/* Copy/checksum the payload from the original packet */
3599 		if (off + payload_sz > total_len) {
3600 			payload_sz = (uint16_t)(total_len - off);
3601 		}
3602 		pkt_copypkt_sum(orig_ph,
3603 		    (uint16_t)(orig_pkt->pkt_headroom + off),
3604 		    SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3605 		    &partial, TRUE);
3606 
3607 		DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3608 		    ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3609 		    uint16_t, mss, uint32_t, partial);
3610 		FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3611 
3612 		/*
3613 		 * Adjust header information and fill in the missing fields.
3614 		 */
3615 		if (af == AF_INET) {
3616 			ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3617 			tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3618 
3619 			if (n != n_pkts) {
3620 				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3621 			}
3622 			if (n != 1) {
3623 				tcp->th_flags &= ~TH_CWR;
3624 				tcp->th_seq = htonl(tcp_seq);
3625 			}
3626 			update_flow_info(pkt, ip, tcp, payload_sz);
3627 
3628 			ip->ip_id = htons((ipid)++);
3629 			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3630 			ip->ip_sum = 0;
3631 			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
3632 			tcp->th_sum = 0;
3633 			partial = __packet_cksum(tcp, tcp_hlen, partial);
3634 			partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3635 			partial += pseudo_hdr_csum;
3636 			ADDCARRY(partial);
3637 			tcp->th_sum = ~(uint16_t)partial;
3638 		} else {
3639 			ASSERT(af == AF_INET6);
3640 			ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
3641 			tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3642 
3643 			if (n != n_pkts) {
3644 				tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3645 			}
3646 			if (n != 1) {
3647 				tcp->th_flags &= ~TH_CWR;
3648 				tcp->th_seq = htonl(tcp_seq);
3649 			}
3650 			update_flow_info(pkt, ip6, tcp, payload_sz);
3651 
3652 			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3653 			tcp->th_sum = 0;
3654 			partial = __packet_cksum(tcp, tcp_hlen, partial);
3655 			partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
3656 			partial += pseudo_hdr_csum;
3657 			ADDCARRY(partial);
3658 			tcp->th_sum = ~(uint16_t)partial;
3659 		}
3660 		tcp_seq += payload_sz;
3661 		METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3662 #if (DEVELOPMENT || DEBUG)
3663 		struct __kern_buflet *bft;
3664 		uint32_t blen;
3665 		PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3666 		blen = __buflet_get_data_length(bft);
3667 		if (blen != total_hlen + payload_sz) {
3668 			panic("blen (%d) != total_hlen + payload_sz (%d)\n",
3669 			    blen, total_hlen + payload_sz);
3670 		}
3671 #endif /* DEVELOPMENT || DEBUG */
3672 
3673 		pkt->pkt_length = total_hlen + payload_sz;
3674 		KPKTQ_ENQUEUE(gso_pktq, pkt);
3675 		pkt = NULL;
3676 
3677 		/*
3678 		 * Note that at this point the packet is not yet finalized.
3679 		 * The finalization happens in dp_flow_tx_process() after
3680 		 * the framing is done.
3681 		 */
3682 	}
3683 	ASSERT(n == n_pkts);
3684 	ASSERT(off == total_len);
3685 	DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3686 	    uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3687 	    uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3688 	return 0;
3689 }
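
/*
 * Editor's sketch (assumption: illustrative helper, not part of this
 * file) of the one's-complement fold that ADDCARRY applies to the
 * 32-bit running sum before it is complemented into th_sum above:
 *
 *   static inline uint16_t
 *   csum_fold(uint32_t sum)
 *   {
 *           sum = (sum >> 16) + (sum & 0xffff);   // fold carries once
 *           sum += (sum >> 16);                   // fold any new carry
 *           return (uint16_t)sum;                 // caller applies ~
 *   }
 */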
3690 
3691 static void
3692 tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3693     struct pktq *gso_pktq)
3694 {
3695 	if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3696 		ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3697 		TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3698 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3699 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3700 		KPKTQ_INIT(gso_pktq);
3701 	} else {
3702 		ASSERT(!TAILQ_EMPTY(fes));
3703 		KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3704 		    KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3705 		KPKTQ_INIT(gso_pktq);
3706 		flow_entry_release(&fe);
3707 	}
3708 }
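
/*
 * Reference accounting note (derived from the code above): each
 * successful tx_lookup_flow() returns a retained flow entry.  Only
 * the first GSO batch for a given entry keeps that reference, to be
 * dropped when the entry is removed from the fes list in
 * dp_gso_pktq(); later batches append their packets and release the
 * duplicate reference right away.
 */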
3709 
3710 static void
3711 dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3712     uint32_t gso_pkts_estimate)
3713 {
3714 	struct __kern_packet *spkt, *pkt;
3715 	struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3716 	struct flow_entry *fe, *prev_fe;
3717 	struct pktq dpktq;
3718 	struct nexus_adapter *dev_na;
3719 	struct kern_pbufpool *dev_pp;
3720 	struct ifnet *ifp;
3721 	sa_family_t af;
3722 	uint32_t n_pkts, n_flows = 0;
3723 	int err;
3724 
3725 	KPKTQ_INIT(&dpktq);
3726 	n_pkts = KPKTQ_LEN(spktq);
3727 
3728 	FSW_RLOCK(fsw);
3729 	if (__improbable(FSW_QUIESCED(fsw))) {
3730 		DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3731 		SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3732 		dp_drop_pktq(fsw, spktq);
3733 		goto done;
3734 	}
3735 	dev_na = fsw->fsw_dev_ch->ch_na;
3736 	if (__improbable(dev_na == NULL)) {
3737 		SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3738 		FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3739 		dp_drop_pktq(fsw, spktq);
3740 		goto done;
3741 	}
3742 	/*
3743 	 * fsw_ifp should still be valid at this point. If fsw is detached
3744 	 * after fsw_lock is released, this ifp will remain valid and
3745 	 * netif_transmit() will behave properly even if the ifp is in
3746 	 * detached state.
3747 	 */
3748 	ifp = fsw->fsw_ifp;
3749 	dev_pp = na_kr_get_pp(dev_na, NR_TX);
3750 
3751 	/*
3752 	 * Batch allocate enough packets to perform GSO on all
3753 	 * packets in spktq.
3754 	 */
3755 	err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
3756 	    gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
3757 #if DEVELOPMENT || DEBUG
3758 	if (__probable(err != ENOMEM)) {
3759 		_FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3760 	}
3761 #endif /* DEVELOPMENT || DEBUG */
3762 	/*
3763 	 * We either get all packets or none. No partial allocations.
3764 	 */
3765 	if (__improbable(err != 0)) {
3766 		if (err == ENOMEM) {
3767 			ASSERT(KPKTQ_EMPTY(&dpktq));
3768 		} else {
3769 			dp_free_pktq(fsw, &dpktq);
3770 		}
3771 		DTRACE_SKYWALK1(gso__no__mem, int, err);
3772 		dp_drop_pktq(fsw, spktq);
3773 		FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3774 		SK_ERR("failed to alloc %u pkts from device pool",
3775 		    gso_pkts_estimate);
3776 		goto done;
3777 	}
3778 	prev_fe = NULL;
3779 	KPKTQ_FOREACH(spkt, spktq) {
3780 		KPKTQ_DEQUEUE(&dpktq, pkt);
3781 		ASSERT(pkt != NULL);
3782 		/*
3783 		 * Copy only headers to the first packet of the GSO chain.
3784 		 * The headers will be used for classification below.
3785 		 */
3786 		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
3787 		if (__improbable(err != 0)) {
3788 			pp_free_packet_single(pkt);
3789 			DTRACE_SKYWALK2(copy__headers__failed,
3790 			    struct nx_flowswitch *, fsw,
3791 			    struct __kern_packet *, spkt);
3792 			continue;
3793 		}
3794 		af = get_tso_af(pkt);
3795 		ASSERT(af == AF_INET || af == AF_INET6);
3796 
3797 		err = flow_pkt_classify(pkt, ifp, af, false);
3798 		if (__improbable(err != 0)) {
3799 			dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3800 			FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3801 			pp_free_packet_single(pkt);
3802 			DTRACE_SKYWALK4(classify__failed,
3803 			    struct nx_flowswitch *, fsw,
3804 			    struct __kern_packet *, spkt,
3805 			    struct __kern_packet *, pkt,
3806 			    int, err);
3807 			continue;
3808 		}
3809 		/*
3810 		 * GSO cannot be done on a fragment and it's a bug in user
3811 		 * space to mark a fragment as needing GSO.
3812 		 */
3813 		if (__improbable(pkt->pkt_flow_ip_is_frag)) {
3814 			FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3815 			pp_free_packet_single(pkt);
3816 			DTRACE_SKYWALK3(is__frag,
3817 			    struct nx_flowswitch *, fsw,
3818 			    struct __kern_packet *, spkt,
3819 			    struct __kern_packet *, pkt);
3820 			continue;
3821 		}
3822 		fe = tx_lookup_flow(fsw, pkt, prev_fe);
3823 		if (__improbable(fe == NULL)) {
3824 			FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3825 			pp_free_packet_single(pkt);
3826 			DTRACE_SKYWALK3(lookup__failed,
3827 			    struct nx_flowswitch *, fsw,
3828 			    struct __kern_packet *, spkt,
3829 			    struct __kern_packet *, pkt);
3830 			prev_fe = NULL;
3831 			continue;
3832 		}
3833 		/*
3834 		 * Perform GSO on spkt using the flow information
3835 		 * obtained above.
3836 		 */
3837 		struct pktq gso_pktq;
3838 		KPKTQ_INIT(&gso_pktq);
3839 		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
3840 		if (__probable(err == 0)) {
3841 			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
3842 			prev_fe = fe;
3843 		} else {
3844 			DTRACE_SKYWALK1(gso__error, int, err);
3845 			/* TODO: increment error stat */
3846 			pp_free_packet_single(pkt);
3847 			flow_entry_release(&fe);
3848 			prev_fe = NULL;
3849 		}
3850 		KPKTQ_FINI(&gso_pktq);
3851 	}
3852 	struct flow_entry *tfe = NULL;
3853 	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3854 		/* Chain-enqueue can be used for GSO chains */
3855 		tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
3856 		TAILQ_REMOVE(&fes, fe, fe_tx_link);
3857 		flow_entry_release(&fe);
3858 		n_flows++;
3859 	}
3860 done:
3861 	FSW_RUNLOCK(fsw);
3862 	if (n_flows > 0) {
3863 		netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
3864 	}
3865 
3866 	/*
3867 	 * It's possible for packets to be left in dpktq because
3868 	 * gso_pkts_estimate is only an estimate. The actual number
3869 	 * of packets needed could be less.
3870 	 */
3871 	uint32_t dpktq_len;
3872 	if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
3873 		DTRACE_SKYWALK2(leftover__dev__pkts,
3874 		    struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
3875 		dp_free_pktq(fsw, &dpktq);
3876 	}
3877 	KPKTQ_FINI(&dpktq);
3878 }
3879 
3880 static inline void
3881 fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3882     struct proc *p)
3883 {
3884 #pragma unused(p)
3885 	uint32_t total_pkts = 0, total_bytes = 0;
3886 
3887 	for (;;) {
3888 		struct pktq pktq;
3889 		KPKTQ_INIT(&pktq);
3890 		uint32_t n_bytes;
3891 		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
3892 		if (n_bytes == 0) {
3893 			break;
3894 		}
3895 		total_pkts += KPKTQ_LEN(&pktq);
3896 		total_bytes += n_bytes;
3897 
3898 		if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
3899 			fsw_receive(fsw, &pktq);
3900 		} else {
3901 			fsw_dev_input_netem_enqueue(fsw, &pktq);
3902 		}
3903 		KPKTQ_FINI(&pktq);
3904 	}
3905 
3906 	KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
3907 	DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
3908 	    uint32_t, total_bytes);
3909 
3910 	/* compute mitigation rate for delivered traffic */
3911 	if (__probable(r->ckr_netif_mit_stats != NULL)) {
3912 		r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
3913 	}
3914 }
3915 
3916 static inline void
3917 fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3918     struct proc *p)
3919 {
3920 #pragma unused(p)
3921 	static packet_trace_id_t trace_id = 0;
3922 	uint32_t total_pkts = 0, total_bytes = 0;
3923 
3924 	for (;;) {
3925 		struct pktq pktq;
3926 		KPKTQ_INIT(&pktq);
3927 		uint32_t n_bytes;
3928 		uint32_t gso_pkts_estimate = 0;
3929 
3930 		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
3931 		    &gso_pkts_estimate);
3932 		if (n_bytes == 0) {
3933 			break;
3934 		}
3935 		total_pkts += KPKTQ_LEN(&pktq);
3936 		total_bytes += n_bytes;
3937 
3938 		KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
3939 		KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
3940 		    KPKTQ_FIRST(&pktq)->pkt_trace_id);
3941 
3942 		if (gso_pkts_estimate > 0) {
3943 			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
3944 		} else {
3945 			dp_tx_pktq(fsw, &pktq);
3946 		}
3947 		dp_free_pktq(fsw, &pktq);
3948 		KPKTQ_FINI(&pktq);
3949 	}
3950 	kr_update_stats(r, total_pkts, total_bytes);
3951 
3952 	KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
3953 	DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
3954 	    uint32_t, total_bytes);
3955 }
3956 
3957 void
3958 fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3959     struct proc *p)
3960 {
3961 	struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
3962 
3963 	ASSERT(sk_is_sync_protected());
3964 	ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
3965 	ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
3966 
3967 	if (vpna->vpna_nx_port == FSW_VP_DEV) {
3968 		fsw_dev_ring_flush(fsw, r, p);
3969 	} else {
3970 		fsw_user_ring_flush(fsw, r, p);
3971 	}
3972 }
3973 
3974 int
3975 fsw_dp_ctor(struct nx_flowswitch *fsw)
3976 {
3977 	uint32_t fe_cnt = fsw_fe_table_size;
3978 	uint32_t fob_cnt = fsw_flow_owner_buckets;
3979 	uint32_t frb_cnt = fsw_flow_route_buckets;
3980 	uint32_t frib_cnt = fsw_flow_route_id_buckets;
3981 	struct kern_nexus *nx = fsw->fsw_nx;
3982 	char name[64];
3983 	int error = 0;
3984 
3985 	/* just in case */
3986 	if (fe_cnt == 0) {
3987 		fe_cnt = NX_FSW_FE_TABLESZ;
3988 		ASSERT(fe_cnt != 0);
3989 	}
3990 	if (fob_cnt == 0) {
3991 		fob_cnt = NX_FSW_FOB_HASHSZ;
3992 		ASSERT(fob_cnt != 0);
3993 	}
3994 	if (frb_cnt == 0) {
3995 		frb_cnt = NX_FSW_FRB_HASHSZ;
3996 		ASSERT(frb_cnt != 0);
3997 	}
3998 	if (frib_cnt == 0) {
3999 		frib_cnt = NX_FSW_FRIB_HASHSZ;
4000 		ASSERT(frib_cnt != 0);
4001 	}
4002 
4003 	/* make sure fe_cnt is a power of two, else round up */
4004 	if ((fe_cnt & (fe_cnt - 1)) != 0) {
4005 		fe_cnt--;
4006 		fe_cnt |= (fe_cnt >> 1);
4007 		fe_cnt |= (fe_cnt >> 2);
4008 		fe_cnt |= (fe_cnt >> 4);
4009 		fe_cnt |= (fe_cnt >> 8);
4010 		fe_cnt |= (fe_cnt >> 16);
4011 		fe_cnt++;
4012 	}
4013 
4014 	/* make sure frb_cnt is a power of two, else round up */
4015 	if ((frb_cnt & (frb_cnt - 1)) != 0) {
4016 		frb_cnt--;
4017 		frb_cnt |= (frb_cnt >> 1);
4018 		frb_cnt |= (frb_cnt >> 2);
4019 		frb_cnt |= (frb_cnt >> 4);
4020 		frb_cnt |= (frb_cnt >> 8);
4021 		frb_cnt |= (frb_cnt >> 16);
4022 		frb_cnt++;
4023 	}
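	/*
	 * The bit smearing above is the classic round-up-to-the-next-
	 * power-of-two, e.g. 600 -> 599 -> 0x3ff -> 1024.  An equivalent
	 * sketch for x >= 1 (illustrative only):
	 *
	 *   static uint32_t
	 *   roundup_pow2(uint32_t x)
	 *   {
	 *           x--;
	 *           x |= x >> 1; x |= x >> 2; x |= x >> 4;
	 *           x |= x >> 8; x |= x >> 16;
	 *           return x + 1;
	 *   }
	 */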
4024 
4025 	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
4026 	    &nexus_lock_attr);
4027 	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
4028 	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
4029 	TAILQ_INIT(&fsw->fsw_linger_head);
4030 
4031 	(void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4032 	error = nx_advisory_alloc(nx, name,
4033 	    &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4034 	    NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4035 	if (error != 0) {
4036 		fsw_dp_dtor(fsw);
4037 		return error;
4038 	}
4039 
4040 	fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4041 	if (fsw->fsw_flow_mgr == NULL) {
4042 		fsw_dp_dtor(fsw);
4043 		return error;
4044 	}
4045 
4046 	/* generic name; will be customized upon ifattach */
4047 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
4048 	    FSW_REAP_THREADNAME, name, "");
4049 
4050 	if (kernel_thread_start(fsw_reap_thread_func, fsw,
4051 	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4052 		panic_plain("%s: can't create thread", __func__);
4053 		/* NOTREACHED */
4054 		__builtin_unreachable();
4055 	}
4056 	/* this must not fail */
4057 	VERIFY(fsw->fsw_reap_thread != NULL);
4058 
4059 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
4060 
4061 
4062 	return error;
4063 }
4064 
4065 void
4066 fsw_dp_dtor(struct nx_flowswitch *fsw)
4067 {
4068 	uint64_t f = (1 * NSEC_PER_MSEC);         /* 1 ms */
4069 	uint64_t s = (1000 * NSEC_PER_MSEC);         /* 1 sec */
4070 	uint32_t i = 0;
4071 
4072 #if (DEVELOPMENT || DEBUG)
4073 	if (fsw->fsw_rps_threads != NULL) {
4074 		for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4075 			fsw_rps_thread_join(fsw, i);
4076 		}
4077 		kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads);
4078 	}
4079 #endif /* DEVELOPMENT || DEBUG */
4080 
4081 	nx_advisory_free(fsw->fsw_nx);
4082 
4083 	if (fsw->fsw_reap_thread != THREAD_NULL) {
4084 		/* signal thread to begin self-termination */
4085 		lck_mtx_lock(&fsw->fsw_reap_lock);
4086 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4087 
4088 		/*
4089 		 * And wait for the thread to terminate; use a wait
4090 		 * channel other than fsw_reap_flags here to
4091 		 * make it more explicit.  In the event the reaper
4092 		 * thread misses a wakeup, we'll try again once
4093 		 * every second (except for the first time).
4094 		 */
4095 		while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4096 			uint64_t t = 0;
4097 
4098 			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
4099 			clock_absolutetime_interval_to_deadline(t, &t);
4100 			ASSERT(t != 0);
4101 
4102 			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4103 			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4104 				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4105 			}
4106 			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
4107 			    THREAD_UNINT, t);
4108 			lck_mtx_unlock(&fsw->fsw_reap_lock);
4109 			thread_block(THREAD_CONTINUE_NULL);
4110 			lck_mtx_lock(&fsw->fsw_reap_lock);
4111 			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4112 		}
4113 		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4114 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4115 		fsw->fsw_reap_thread = THREAD_NULL;
4116 	}
4117 
4118 	/* free any remaining flow entries in the linger list */
4119 	fsw_linger_purge(fsw);
4120 
4121 	if (fsw->fsw_flow_mgr != NULL) {
4122 		flow_mgr_destroy(fsw->fsw_flow_mgr);
4123 		fsw->fsw_flow_mgr = NULL;
4124 	}
4125 
4126 
4127 	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
4128 	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
4129 	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4130 }
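
/*
 * Teardown handshake summary (derived from the code above and in
 * fsw_reap_thread_cont()): the dtor sets FSW_REAPF_TERMINATING and
 * wakes the reaper; the reaper observes the flag, sets
 * FSW_REAPF_TERMINATED and, if FSW_REAPF_TERMINATEBLOCK is set, wakes
 * the dtor on &fsw->fsw_reap_thread.  The dtor re-sends the wakeup
 * once per second in case one is missed.
 */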
4131 
4132 void
4133 fsw_linger_insert(struct flow_entry *fe)
4134 {
4135 	struct nx_flowswitch *fsw = fe->fe_fsw;
4136 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4137 	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4138 	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4139 	    fe->fe_flags, FLOWENTF_BITS);
4140 
4141 	net_update_uptime();
4142 
4143 	ASSERT(flow_entry_refcnt(fe) >= 1);
4144 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4145 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4146 	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4147 	ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4148 	ASSERT(fe->fe_linger_wait != 0);
4149 	fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
4150 	os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4151 
4152 	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
4153 	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4154 	fsw->fsw_linger_cnt++;
4155 	VERIFY(fsw->fsw_linger_cnt != 0);
4156 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4157 
4158 	fsw_reap_sched(fsw);
4159 }
4160 
4161 static void
4162 fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4163     struct flow_entry *fe)
4164 {
4165 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4166 	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4167 	    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4168 	    fe->fe_flags, FLOWENTF_BITS);
4169 
4170 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4171 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4172 	ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4173 	os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4174 
4175 	TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4176 	flow_entry_release(&fe);
4177 }
4178 
4179 static void
4180 fsw_linger_remove(struct flow_entry *fe)
4181 {
4182 	struct nx_flowswitch *fsw = fe->fe_fsw;
4183 
4184 	LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4185 
4186 	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4187 	VERIFY(fsw->fsw_linger_cnt != 0);
4188 	fsw->fsw_linger_cnt--;
4189 }
4190 
4191 void
4192 fsw_linger_purge(struct nx_flowswitch *fsw)
4193 {
4194 	struct flow_entry *fe, *tfe;
4195 
4196 	lck_mtx_lock(&fsw->fsw_linger_lock);
4197 	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4198 		fsw_linger_remove(fe);
4199 	}
4200 	ASSERT(fsw->fsw_linger_cnt == 0);
4201 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4202 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4203 }
4204 
4205 void
4206 fsw_reap_sched(struct nx_flowswitch *fsw)
4207 {
4208 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4209 	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
4210 	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4211 	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4212 		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4213 	}
4214 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4215 }
4216 
4217 __attribute__((noreturn))
4218 static void
4219 fsw_reap_thread_func(void *v, wait_result_t w)
4220 {
4221 #pragma unused(w)
4222 	struct nx_flowswitch *fsw = v;
4223 
4224 	ASSERT(fsw->fsw_reap_thread == current_thread());
4225 	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
4226 
4227 	net_update_uptime();
4228 
4229 	lck_mtx_lock(&fsw->fsw_reap_lock);
4230 	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4231 	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
4232 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4233 	thread_block_parameter(fsw_reap_thread_cont, fsw);
4234 	/* NOTREACHED */
4235 	__builtin_unreachable();
4236 }
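
/*
 * Editor's sketch of the continuation-style blocking used above
 * (illustrative only).  The thread parks itself and names a
 * continuation for the scheduler to invoke on wakeup, so no kernel
 * stack state is retained while blocked:
 *
 *   (void) assert_wait(chan, THREAD_UNINT);      // declare the wait
 *   lck_mtx_unlock(&lock);                       // drop locks first
 *   thread_block_parameter(continuation, arg);   // never returns here
 */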
4237 
4238 __attribute__((noreturn))
4239 static void
4240 fsw_reap_thread_cont(void *v, wait_result_t wres)
4241 {
4242 	struct nx_flowswitch *fsw = v;
4243 	boolean_t low;
4244 	uint64_t t = 0;
4245 
4246 	SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4247 
4248 	lck_mtx_lock(&fsw->fsw_reap_lock);
4249 	if (__improbable(wres == THREAD_INTERRUPTED ||
4250 	    (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4251 		goto terminate;
4252 	}
4253 
4254 	ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4255 	fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4256 	lck_mtx_unlock(&fsw->fsw_reap_lock);
4257 
4258 	net_update_uptime();
4259 
4260 	/* prevent detach from happening while we're here */
4261 	if (!fsw_detach_barrier_add(fsw)) {
4262 		SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4263 		t = 0;
4264 	} else {
4265 		uint32_t fe_nonviable, fe_freed, fe_aborted;
4266 		uint32_t fr_freed, fr_resid = 0;
4267 		struct ifnet *ifp = fsw->fsw_ifp;
4268 		uint64_t i = FSW_REAP_IVAL;
4269 		uint64_t now = _net_uptime;
4270 		uint64_t last;
4271 
4272 		ASSERT(fsw->fsw_ifp != NULL);
4273 
4274 		/*
4275 		 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4276 		 */
4277 		fe_nonviable = fsw_process_deferred(fsw);
4278 
4279 		/*
4280 		 * Pass 2: remove any expired lingering flows.
4281 		 */
4282 		fe_freed = fsw_process_linger(fsw, &fe_aborted);
4283 
4284 		/*
4285 		 * Pass 3: prune idle flow routes.
4286 		 */
4287 		fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4288 		    ifp, &fr_resid);
4289 
4290 		/*
4291 		 * Pass 4: prune the flow table, letting the underlying
4292 		 * cuckoo hashtable try to shrink.
4293 		 */
4294 		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4295 
4296 		SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4297 		    "fe_aborted %u fr_freed %u/%u",
4298 		    fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4299 		    (fe_nonviable + fsw->fsw_pending_nonviable),
4300 		    fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed,
4301 		    (fe_freed + fr_resid));
4302 
4303 		/* see if VM memory level is critical */
4304 		low = skmem_lowmem_check();
4305 
4306 		/*
4307 		 * If things appear to be idle, we can prune away cached
4308 		 * object that have fallen out of the working sets (this
4309 		 * is different than purging).  Every once in a while, we
4310 		 * also purge the caches.  Note that this is done across
4311 		 * all flowswitch instances, and so we limit this to no
4312 		 * more than once every FSW_REAP_SK_THRES seconds.
4313 		 */
4314 		last = os_atomic_load(&fsw_reap_last, relaxed);
4315 		if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4316 		    os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4317 			fsw_purge_cache(fsw, low);
4318 
4319 			/* increase sleep interval if idle */
4320 			if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4321 			    fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4322 				i <<= 3;
4323 			}
4324 		} else if (last == 0) {
4325 			os_atomic_store(&fsw_reap_last, now, release);
4326 		}
4327 
4328 		/*
4329 		 * Additionally, run thru the list of channels and prune
4330 		 * or purge away cached objects on "idle" channels.  This
4331 		 * check is rate limited to no more than once every
4332 		 * FSW_DRAIN_CH_THRES seconds.
4333 		 */
4334 		last = fsw->fsw_drain_channel_chk_last;
4335 		if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4336 			SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4337 			    fsw->fsw_flow_mgr->fm_name);
4338 
4339 			fsw->fsw_drain_channel_chk_last = now;
4340 			fsw_drain_channels(fsw, now, low);
4341 		} else if (__improbable(last == 0)) {
4342 			fsw->fsw_drain_channel_chk_last = now;
4343 		}
4344 
4345 		/*
4346 		 * Finally, invoke the interface's reap callback to
4347 		 * tell it to prune or purge away cached objects if
4348 		 * it is idle.  This check is rate limited to no more
4349 		 * than once every FSW_REAP_IF_THRES seconds.
4350 		 */
4351 		last = fsw->fsw_drain_netif_chk_last;
4352 		if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4353 			ASSERT(fsw->fsw_nifna != NULL);
4354 
4355 			if (ifp->if_na_ops != NULL &&
4356 			    ifp->if_na_ops->ni_reap != NULL) {
4357 				SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4358 				    fsw->fsw_flow_mgr->fm_name);
4359 				ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4360 				    FSW_REAP_IF_THRES, low);
4361 			}
4362 
4363 			fsw->fsw_drain_netif_chk_last = now;
4364 		} else if (__improbable(last == 0)) {
4365 			fsw->fsw_drain_netif_chk_last = now;
4366 		}
4367 
4368 		/* emit periodic interface stats ktrace */
4369 		last = fsw->fsw_reap_last;
4370 		if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4371 			KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4372 			    ifp->if_data.ifi_ibytes * 8,
4373 			    ifp->if_data.ifi_opackets,
4374 			    ifp->if_data.ifi_obytes * 8);
4375 
4376 			fsw->fsw_reap_last = now;
4377 		} else if (__improbable(last == 0)) {
4378 			fsw->fsw_reap_last = now;
4379 		}
4380 
4381 		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
4382 		clock_absolutetime_interval_to_deadline(t, &t);
4383 		ASSERT(t != 0);
4384 
4385 		/* allow any pending detach to proceed */
4386 		fsw_detach_barrier_remove(fsw);
4387 	}
4388 
4389 	lck_mtx_lock(&fsw->fsw_reap_lock);
4390 	if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4391 		fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4392 		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
4393 		    THREAD_UNINT, t);
4394 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4395 		thread_block_parameter(fsw_reap_thread_cont, fsw);
4396 		/* NOTREACHED */
4397 		__builtin_unreachable();
4398 	} else {
4399 terminate:
4400 		LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4401 		fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4402 		fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4403 		/*
4404 		 * And signal any thread waiting for us to terminate;
4405 		 * we use a wait channel other than fsw_reap_flags to make
4406 		 * it more explicit.
4407 		 */
4408 		if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4409 			thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4410 		}
4411 		lck_mtx_unlock(&fsw->fsw_reap_lock);
4412 
4413 		SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4414 
4415 		/* for the extra refcnt from kernel_thread_start() */
4416 		thread_deallocate(current_thread());
4417 		/* this is the end */
4418 		thread_terminate(current_thread());
4419 		/* NOTREACHED */
4420 		__builtin_unreachable();
4421 	}
4422 
4423 	/* must never get here */
4424 	VERIFY(0);
4425 	/* NOTREACHED */
4426 	__builtin_unreachable();
4427 }
4428 
4429 static void
4430 fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4431 {
4432 	struct kern_nexus *nx = fsw->fsw_nx;
4433 
4434 	/* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4435 	FSW_RLOCK(fsw);
4436 
4437 	/* uncrustify doesn't handle C blocks properly */
4438 	/* BEGIN IGNORE CODESTYLE */
4439 	nx_port_foreach(nx, ^(nexus_port_t p) {
4440 		struct nexus_adapter *na = nx_port_get_na(nx, p);
4441 		if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4442 			return;
4443 		}
4444 
4445 		boolean_t purge;
4446 
4447 		/*
4448 		 * If some activity happened in the last FSW_DRAIN_CH_THRES
4449 		 * seconds on this channel, we reclaim memory if the channel
4450 		 * throughput is less than the reap threshold value.
4451 		 */
4452 		if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4453 			struct __kern_channel_ring *ring;
4454 			channel_ring_stats *stats;
4455 			uint64_t bps;
4456 
4457 			ring = na->na_rx_rings;
4458 			stats = &ring->ckr_stats;
4459 			bps = stats->crs_bytes_per_second;
4460 
4461 			if (bps < fsw_channel_reap_thresh) {
4462 				purge = FALSE;
4463 				na_drain(na, purge);
4464 			}
4465 			return;
4466 		}
4467 
4468 		/*
4469 		 * If NA has been inactive for some time (twice the drain
4470 		 * threshold), we clear the work timestamp to temporarily skip
4471 		 * this channel until it's active again.  Purging cached objects
4472 		 * can be expensive since we'd need to allocate and construct
4473 		 * them again, so we do it only when necessary.
4474 		 */
4475 		if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4476 			na->na_work_ts = 0;
4477 			purge = TRUE;
4478 		} else {
4479 			purge = FALSE;
4480 		}
4481 
4482 		na_drain(na, purge);  /* purge/prune caches */
4483 	});
4484 	/* END IGNORE CODESTYLE */
4485 
4486 	FSW_RUNLOCK(fsw);
4487 }
4488 
4489 static void
4490 fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4491 {
4492 #pragma unused(fsw)
4493 	uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4494 	uint32_t p = fsw_flow_purge_thresh;
4495 	boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
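	/*
	 * Cadence example (illustrative): with fsw_flow_purge_thresh set
	 * to 8, every eighth invocation purges the caches outright while
	 * the others merely prune; a low-memory signal forces a purge
	 * regardless of the counter.
	 */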
4496 
4497 	SK_DF(SK_VERB_FLOW, "%s: %s caches",
4498 	    fsw->fsw_flow_mgr->fm_name,
4499 	    (purge ? "purge" : "prune"));
4500 
4501 	skmem_cache_reap_now(sk_fo_cache, purge);
4502 	skmem_cache_reap_now(sk_fe_cache, purge);
4503 	skmem_cache_reap_now(sk_fab_cache, purge);
4504 	skmem_cache_reap_now(flow_route_cache, purge);
4505 	skmem_cache_reap_now(flow_stats_cache, purge);
4506 	netns_reap_caches(purge);
4507 	skmem_reap_caches(purge);
4508 
4509 #if CONFIG_MBUF_MCACHE
4510 	if (if_is_fsw_transport_netagent_enabled() && purge) {
4511 		mbuf_drain(FALSE);
4512 	}
4513 #endif /* CONFIG_MBUF_MCACHE */
4514 }
4515 
4516 static void
4517 fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4518 {
4519 	/* When the interface is in low power mode, the flow is nonviable */
4520 	if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4521 	    os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4522 		os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4523 	}
4524 }
4525 
4526 static uint32_t
4527 fsw_process_deferred(struct nx_flowswitch *fsw)
4528 {
4529 	struct flow_entry_dead sfed __sk_aligned(8);
4530 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
4531 	struct flow_entry_dead *fed, *tfed;
4532 	LIST_HEAD(, flow_entry_dead) fed_head =
4533 	    LIST_HEAD_INITIALIZER(fed_head);
4534 	uint32_t i, nonviable = 0;
4535 	boolean_t lowpowermode = FALSE;
4536 
4537 	bzero(&sfed, sizeof(sfed));
4538 
4539 	/*
4540 	 * The flows become nonviable when the interface
4541 	 * is in low power mode (edge trigger)
4542 	 */
4543 	if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4544 	    fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4545 		lowpowermode = TRUE;
4546 		fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4547 	}
4548 
4549 	/*
4550 	 * Scan thru the flow entry tree, and commit any pending withdraw or
4551 	 * nonviable requests.  We may need to push stats and/or unassign the
4552 	 * nexus from NECP, but we cannot do that while holding the locks;
4553 	 * build a temporary list for those entries.
4554 	 */
4555 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
4556 		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
4557 		struct flow_owner *fo;
4558 
4559 		/*
4560 		 * Grab the lock at all costs when handling low power mode
4561 		 */
4562 		if (__probable(!lowpowermode)) {
4563 			if (!FOB_TRY_LOCK(fob)) {
4564 				continue;
4565 			}
4566 		} else {
4567 			FOB_LOCK(fob);
4568 		}
4569 
4570 		FOB_LOCK_ASSERT_HELD(fob);
4571 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
4572 			struct flow_entry *fe;
4573 
4574 			RB_FOREACH(fe, flow_entry_id_tree,
4575 			    &fo->fo_flow_entry_id_head) {
4576 				/* try first as reader; skip if we can't */
4577 				if (__improbable(lowpowermode)) {
4578 					fsw_flow_handle_low_power(fsw, fe);
4579 				}
4580 				if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
4581 					os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
4582 					flow_namespace_half_close(&fe->fe_port_reservation);
4583 				}
4584 
4585 				/* if not withdrawn/nonviable, skip */
4586 				if (!fe->fe_want_withdraw &&
4587 				    !fe->fe_want_nonviable) {
4588 					continue;
4589 				}
4590 				/*
4591 				 * Here we're holding the lock as writer;
4592 				 * don't spend too much time as we're
4593 				 * blocking the data path now.
4594 				 */
4595 				ASSERT(!uuid_is_null(fe->fe_uuid));
4596 				/* only need flow UUID and booleans */
4597 				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
4598 				sfed.fed_want_clonotify =
4599 				    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
4600 				sfed.fed_want_nonviable = fe->fe_want_nonviable;
4601 				flow_entry_teardown(fo, fe);
4602 
4603 				/* do this outside the flow bucket lock */
4604 				fed = flow_entry_dead_alloc(Z_WAITOK);
4605 				ASSERT(fed != NULL);
4606 				*fed = sfed;
4607 				LIST_INSERT_HEAD(&fed_head, fed, fed_link);
4608 			}
4609 		}
4610 		FOB_UNLOCK(fob);
4611 	}
4612 
4613 	/*
4614 	 * These nonviable flows are no longer useful since we've lost
4615 	 * the source IP address; in the event the client monitors the
4616 	 * viability of the flow, explicitly mark it as nonviable so
4617 	 * that a new flow can be created.
4618 	 */
4619 	LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
4620 		LIST_REMOVE(fed, fed_link);
4621 		ASSERT(fsw->fsw_agent_session != NULL);
4622 
4623 		/* if flow is closed early */
4624 		if (fed->fed_want_clonotify) {
4625 			necp_client_early_close(fed->fed_uuid);
4626 		}
4627 
4628 		/* if nonviable, unassign nexus attributes */
4629 		if (fed->fed_want_nonviable) {
4630 			(void) netagent_assign_nexus(fsw->fsw_agent_session,
4631 			    fed->fed_uuid, NULL, 0);
4632 		}
4633 
4634 		flow_entry_dead_free(fed);
4635 		++nonviable;
4636 	}
4637 	ASSERT(LIST_EMPTY(&fed_head));
4638 
4639 	return nonviable;
4640 }
4641 
4642 static uint32_t
4643 fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
4644 {
4645 	struct flow_entry_linger_head linger_head =
4646 	    TAILQ_HEAD_INITIALIZER(linger_head);
4647 	struct flow_entry *fe, *tfe;
4648 	uint64_t now = _net_uptime;
4649 	uint32_t i = 0, cnt = 0, freed = 0;
4650 
4651 	ASSERT(fsw->fsw_ifp != NULL);
4652 	ASSERT(abort != NULL);
4653 	*abort = 0;
4654 
4655 	/*
4656 	 * We don't want to contend with the datapath, so move
4657 	 * everything that's in the linger list into a local list.
4658 	 * This allows us to generate RSTs or free the flow entry
4659 	 * outside the lock.  Any remaining flow entry in the local
4660 	 * list will get re-added back to the head of the linger
4661 	 * list, in front of any new ones added since then.
4662 	 */
4663 	lck_mtx_lock(&fsw->fsw_linger_lock);
4664 	TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4665 	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4666 	cnt = fsw->fsw_linger_cnt;
4667 	fsw->fsw_linger_cnt = 0;
4668 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4669 
4670 	TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
4671 		ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4672 		ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4673 		ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4674 
4675 		/*
4676 		 * See if this is a TCP flow that needs to generate
4677 		 * a RST to the remote peer (if not already).
4678 		 */
4679 		if (flow_track_tcp_want_abort(fe)) {
4680 			VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
4681 			ASSERT(!uuid_is_null(fe->fe_uuid));
4682 			flow_track_abort_tcp(fe, NULL, NULL);
4683 			(*abort)++;
4684 			SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4685 			SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
4686 			    "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
4687 			    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
4688 			    FLOWENTF_BITS);
4689 		}
4690 
4691 		/*
4692 		 * If flow has expired, remove from list and free;
4693 		 * otherwise leave it around in the linger list.
4694 		 */
4695 		if (fe->fe_linger_expire <= now) {
4696 			freed++;
4697 			fsw_linger_remove_internal(&linger_head, fe);
4698 			fe = NULL;
4699 		}
4700 		++i;
4701 	}
4702 	VERIFY(i == cnt && cnt >= freed);
4703 
4704 	/*
4705 	 * Add any remaining ones back into the linger list.
4706 	 */
4707 	lck_mtx_lock(&fsw->fsw_linger_lock);
4708 	if (!TAILQ_EMPTY(&linger_head)) {
4709 		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
4710 		TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4711 		ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4712 		TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
4713 		fsw->fsw_linger_cnt += (cnt - freed);
4714 	}
4715 	ASSERT(TAILQ_EMPTY(&linger_head));
4716 	lck_mtx_unlock(&fsw->fsw_linger_lock);
4717 
4718 	return freed;
4719 }
4720 
4721 __attribute__((always_inline))
4722 static inline void
4723 fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
4724 {
4725 	switch (__packet_get_traffic_class(ph)) {
4726 	case PKT_TC_BE:
4727 		ifp->if_tc.ifi_ibepackets++;
4728 		ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4729 		break;
4730 	case PKT_TC_BK:
4731 		ifp->if_tc.ifi_ibkpackets++;
4732 		ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4733 		break;
4734 	case PKT_TC_VI:
4735 		ifp->if_tc.ifi_ivipackets++;
4736 		ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4737 		break;
4738 	case PKT_TC_VO:
4739 		ifp->if_tc.ifi_ivopackets++;
4740 		ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4741 		break;
4742 	default:
4743 		break;
4744 	}
4745 }
4746 
4747 __attribute__((always_inline))
4748 static inline void
4749 fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
4750     uint32_t cnt, uint32_t len)
4751 {
4752 	switch (svc) {
4753 	case PKT_TC_BE:
4754 		ifp->if_tc.ifi_obepackets += cnt;
4755 		ifp->if_tc.ifi_obebytes += len;
4756 		break;
4757 	case PKT_TC_BK:
4758 		ifp->if_tc.ifi_obkpackets += cnt;
4759 		ifp->if_tc.ifi_obkbytes += len;
4760 		break;
4761 	case PKT_TC_VI:
4762 		ifp->if_tc.ifi_ovipackets += cnt;
4763 		ifp->if_tc.ifi_ovibytes += len;
4764 		break;
4765 	case PKT_TC_VO:
4766 		ifp->if_tc.ifi_ovopackets += cnt;
4767 		ifp->if_tc.ifi_ovobytes += len;
4768 		break;
4769 	default:
4770 		break;
4771 	}
4772 }
4773