/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#define _IP_VHL
#include <skywalk/os_skywalk_private.h>
#include <skywalk/os_packet_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <net/ethernet.h>
#include <net/pktap.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>

#define DBG_FUNC_NX_NETIF_HOST_ENQUEUE  \
	SKYWALKDBG_CODE(DBG_SKYWALK_NETIF, 2)
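/*
 * The kdebug code above is emitted as DBG_FUNC_START/DBG_FUNC_END pairs
 * in nx_netif_host_output() to bracket each run of packets enqueued onto
 * the same TX kring, so a trace shows per-kring enqueue batches.
 */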

static void nx_netif_host_catch_tx(struct nexus_adapter *, bool);
static inline struct __kern_packet*
nx_netif_mbuf_to_kpkt(struct nexus_adapter *, struct mbuf *);

#define SK_IFCAP_CSUM   (IFCAP_HWCSUM|IFCAP_CSUM_PARTIAL|IFCAP_CSUM_ZERO_INVERT)
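/*
 * SK_IFCAP_CSUM (above) collects every checksum-offload capability bit
 * that this file masks off and later restores or re-advertises on the
 * driver's behalf while the host adapter is active.
 */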

static bool
nx_netif_host_is_gso_needed(struct nexus_adapter *na)
{
	struct nx_netif *nif = ((struct nexus_netif_adapter *)na)->nifna_netif;

	/*
	 * Don't enable for Compat netif.
	 */
	if (na->na_type != NA_NETIF_HOST) {
		return false;
	}
	/*
	 * Don't enable if netif is not plumbed under a flowswitch.
	 */
	if (!NA_KERNEL_ONLY(na)) {
		return false;
	}
	/*
	 * Don't enable if HW TSO is enabled.
	 */
	if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0) ||
	    ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0)) {
		return false;
	}
	/*
	 * Don't enable if TX aggregation is disabled.
	 */
	if (sk_fsw_tx_agg_tcp == 0) {
		return false;
	}
	return true;
}

static void
nx_netif_host_adjust_if_capabilities(struct nexus_adapter *na, bool activate)
{
	struct nx_netif *nif = ((struct nexus_netif_adapter *)na)->nifna_netif;
	struct ifnet *ifp = na->na_ifp;

	ifnet_lock_exclusive(ifp);

	if (activate) {
		/* XXX: [email protected] - disable TSO and LRO for now */
		nif->nif_hwassist = ifp->if_hwassist;
		nif->nif_capabilities = ifp->if_capabilities;
		nif->nif_capenable = ifp->if_capenable;
		ifp->if_hwassist &= ~(IFNET_CHECKSUMF | IFNET_TSOF);
		ifp->if_capabilities &= ~(SK_IFCAP_CSUM | IFCAP_TSO);
		ifp->if_capenable &= ~(SK_IFCAP_CSUM | IFCAP_TSO);

		/*
		 * Re-enable the capabilities which the Skywalk layer provides:
		 *
		 * Native driver: a copy from packet to mbuf always occurs
		 * for each inbound and outbound packet; if the hardware
		 * does not support csum offload, we leverage combined
		 * copy and checksum, and thus advertise IFNET_CSUM_PARTIAL.
		 * We also always enable 16KB jumbo mbuf support.
		 *
		 * Compat driver: inbound and outbound mbufs don't incur a
		 * copy, so we leave the driver's advertised flags alone.
		 */
		if (NA_KERNEL_ONLY(na)) {
			if (na->na_type == NA_NETIF_HOST) {     /* native */
				ifp->if_hwassist |=
				    IFNET_MULTIPAGES | (nif->nif_hwassist &
				    (IFNET_CHECKSUMF | IFNET_TSOF));
				ifp->if_capabilities |=
				    (nif->nif_capabilities &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
				ifp->if_capenable |=
				    (nif->nif_capenable &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
				/*
				 * If hardware doesn't support IP and TCP/UDP csum offload,
				 * advertise IFNET_CSUM_PARTIAL.
				 */
				if ((ifp->if_hwassist & IFNET_UDP_TCP_TX_CHECKSUMF) !=
				    IFNET_UDP_TCP_TX_CHECKSUMF) {
					ifp->if_hwassist |= IFNET_CSUM_PARTIAL | IFNET_CSUM_ZERO_INVERT;
					ifp->if_capabilities |= IFCAP_CSUM_PARTIAL | IFCAP_CSUM_ZERO_INVERT;
					ifp->if_capenable |= IFCAP_CSUM_PARTIAL | IFCAP_CSUM_ZERO_INVERT;
				}
				if (sk_fsw_tx_agg_tcp != 0) {
					ifp->if_hwassist |= IFNET_TSOF;
					ifp->if_capabilities |= IFCAP_TSO;
					ifp->if_capenable |= IFCAP_TSO;
				}

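				/*
				 * IFEF_SENDLIST advertises that our
				 * if_output handler accepts mbuf chains.
				 * It is set only when software GSO is not
				 * in use: nx_netif_host_output() below
				 * walks m_nextpkt chains, while the GSO
				 * dispatch path takes packets one at a time.
				 */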
				if (!nx_netif_host_is_gso_needed(na)) {
					if_set_eflags(ifp, IFEF_SENDLIST);
				}
			} else {                                /* compat */
				ifp->if_hwassist |=
				    (nif->nif_hwassist &
				    (IFNET_CHECKSUMF | IFNET_TSOF));
				ifp->if_capabilities |=
				    (nif->nif_capabilities &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
				ifp->if_capenable |=
				    (nif->nif_capenable &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
			}
		}
	} else {
		if (NA_KERNEL_ONLY(na) && na->na_type == NA_NETIF_HOST) {
			if_clear_eflags(ifp, IFEF_SENDLIST);
		}
		/* Unset any capabilities previously set by Skywalk */
		ifp->if_hwassist &= ~(IFNET_CHECKSUMF | IFNET_MULTIPAGES);
		ifp->if_capabilities &= ~SK_IFCAP_CSUM;
		ifp->if_capenable &= ~SK_IFCAP_CSUM;
		if ((sk_fsw_tx_agg_tcp != 0) &&
		    (na->na_type == NA_NETIF_HOST)) {
			ifp->if_hwassist &= ~IFNET_TSOF;
			ifp->if_capabilities &= ~IFCAP_TSO;
			ifp->if_capenable &= ~IFCAP_TSO;
		}
		/* Restore the driver's original flags */
		ifp->if_hwassist |= (nif->nif_hwassist &
		    (IFNET_CHECKSUMF | IFNET_TSOF | IFNET_MULTIPAGES));
		ifp->if_capabilities |=
		    (nif->nif_capabilities & (SK_IFCAP_CSUM | IFCAP_TSO));
		ifp->if_capenable |=
		    (nif->nif_capenable & (SK_IFCAP_CSUM | IFCAP_TSO));
	}

	ifnet_lock_done(ifp);
}

int
nx_netif_host_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct ifnet *ifp = na->na_ifp;
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s", na->na_name,
	    SK_KVA(na), na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		VERIFY(SKYWALK_CAPABLE(ifp));

		nx_netif_host_adjust_if_capabilities(na, true);
		/*
		 * Make Skywalk control the packet steering.  Don't
		 * intercept tx packets if this is a netif compat
		 * adapter attached to a flowswitch.
		 */
		nx_netif_host_catch_tx(na, true);

		os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		VERIFY(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/* Release packet steering control. */
		nx_netif_host_catch_tx(na, false);

		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);

		nx_netif_host_adjust_if_capabilities(na, false);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return error;
}

/* na_krings_create callback for netif host adapters */
int
nx_netif_host_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
{
	int ret;

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	ret = na_rings_mem_setup(na, FALSE, ch);
	if (ret == 0) {
		struct __kern_channel_ring *kring;
		uint32_t i;

		/* drop by default until fully bound */
		if (NA_KERNEL_ONLY(na)) {
			na_kr_drop(na, TRUE);
		}

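		/*
		 * Each host RX kring gets an unbounded nx_mbq; this is
		 * where mbufs headed for the host ring are staged (and
		 * what the krings destructor below purges).
		 */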
		for (i = 0; i < na_get_nrings(na, NR_RX); i++) {
			kring = &NAKR(na, NR_RX)[i];
			/* initialize the nx_mbq for the sw rx ring */
			nx_mbq_safe_init(kring, &kring->ckr_rx_queue,
			    NX_MBQ_NO_LIMIT, &nexus_mbq_lock_group,
			    &nexus_lock_attr);
			SK_DF(SK_VERB_NETIF,
			    "na \"%s\" (%p) initialized host kr \"%s\" "
			    "(%p) krflags 0x%x", na->na_name, SK_KVA(na),
			    kring->ckr_name, SK_KVA(kring), kring->ckr_flags);
		}
	}
	return ret;
}

/*
 * Destructor for netif host adapters; they also have an mbuf queue
 * on the rings connected to the host so we need to purge them first.
 */
void
nx_netif_host_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
	struct __kern_channel_ring *kring;
	uint32_t i;

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	if (NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	for (i = 0; i < na_get_nrings(na, NR_RX); i++) {
		struct nx_mbq *q;

		kring = &NAKR(na, NR_RX)[i];
		q = &kring->ckr_rx_queue;
		SK_DF(SK_VERB_NETIF,
		    "na \"%s\" (%p) destroy host kr \"%s\" (%p) "
		    "krflags 0x%x with qlen %u", na->na_name, SK_KVA(na),
		    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
		    nx_mbq_len(q));
		nx_mbq_purge(q);
		if (!defunct) {
			nx_mbq_safe_destroy(q);
		}
	}

	na_rings_mem_teardown(na, ch, defunct);
}

/* kring->ckr_na_sync callback for the host rx ring */
int
nx_netif_host_na_rxsync(struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags)
{
#pragma unused(kring, p, flags)
	return 0;
}

/*
 * kring->ckr_na_sync callback for the host tx ring.
 */
int
nx_netif_host_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return 0;
}

int
nx_netif_host_na_special(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
}

/*
 * Intercept the packet steering routine in the tx path,
 * so that we can decide which queue is used for an mbuf.
 * Second argument is TRUE to intercept, FALSE to restore.
 */
static void
nx_netif_host_catch_tx(struct nexus_adapter *na, bool activate)
{
	struct ifnet *ifp = na->na_ifp;
	int err = 0;

	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	/*
	 * The common case is NA_KERNEL_ONLY: the netif is plumbed
	 * below the flowswitch.  For TXSTART compat driver and legacy:
	 * don't intercept the DLIL output handler, since in this model
	 * packets from both the BSD stack and the flowswitch are directly
	 * enqueued to the classq via ifnet_enqueue().
	 *
	 * Otherwise, it's the uncommon case where a user channel is
	 * opened directly to the netif.  Here we either intercept
	 * or restore the DLIL output handler.
	 */
	if (activate) {
		if (__improbable(!NA_KERNEL_ONLY(na))) {
			return;
		}
		/*
		 * For native drivers only, intercept if_output();
		 * for compat, leave it alone since we don't need
		 * to perform any mbuf-pkt conversion.
		 */
		if (na->na_type == NA_NETIF_HOST) {
			err = ifnet_set_output_handler(ifp,
			    nx_netif_host_is_gso_needed(na) ?
			    netif_gso_dispatch : nx_netif_host_output);
			VERIFY(err == 0);
		}
	} else {
		if (__improbable(!NA_KERNEL_ONLY(na))) {
			return;
		}
		/*
		 * Restore the original if_output() for native drivers.
		 */
		if (na->na_type == NA_NETIF_HOST) {
			ifnet_reset_output_handler(ifp);
		}
	}
}

static int
get_af_from_mbuf(struct mbuf *m)
{
	/*
	 * -fbounds-safety: Although m_pkthdr.pkt_hdr is a void * without
	 * annotations, here we can just mark the uint8_t *pkt_hdr as __single
	 * because we don't do any arithmetic and the only place we dereference
	 * it is to read the ip version, where having the bounds of a single
	 * 8-bit size is enough.
	 */
	uint8_t *__single pkt_hdr;
	uint8_t ipv;
	struct mbuf *m0;
	int af;

	pkt_hdr = m->m_pkthdr.pkt_hdr;
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (pkt_hdr >= (uint8_t *)m0->m_data &&
		    pkt_hdr < (uint8_t *)m0->m_data + m0->m_len) {
			break;
		}
	}
	if (m0 == NULL) {
		DTRACE_SKYWALK1(bad__pkthdr, struct mbuf *, m);
		af = AF_UNSPEC;
		goto done;
	}
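	/*
	 * pkt_hdr points at the L3 header, whose first byte carries the
	 * IP version in its high nibble; IP_VHL_V() extracts it (this is
	 * why _IP_VHL is defined at the top of this file).
	 */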
	ipv = IP_VHL_V(*pkt_hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}
done:
	DTRACE_SKYWALK2(mbuf__af, int, af, struct mbuf *, m);
	return af;
}

/*
 * if_output() callback called by dlil_output() to handle mbufs coming out
 * of the host networking stack.  The mbuf will get converted to a packet,
 * and enqueued to the classq of a Skywalk native interface.
 */
int
nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain)
{
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	struct __kern_channel_ring *currentkring = NULL;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *hwna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
	struct mbuf *m_head = m_chain, *m = NULL, *drop_list = NULL, *free_list = NULL;
	struct __kern_packet *pkt_chain_head, *pkt_chain_tail;
	struct netif_qset *__single qset = NULL;
	struct pktq pkt_q;
	bool qset_id_valid = false;
	boolean_t pkt_drop = FALSE;
	uint32_t n_pkts = 0, n_bytes = 0;
	errno_t error = 0;

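	/*
	 * pkt_mpriv_qsetid must be exactly 64 bits wide: it is forged
	 * into the void *pkt_priv pointer further down when the mbuf
	 * carries a valid qset id.
	 */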
	static_assert(sizeof(m_head->m_pkthdr.pkt_mpriv_qsetid) == sizeof(uint64_t));

	ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
	ASSERT(hostna->na_type == NA_NETIF_HOST);

	KPKTQ_INIT(&pkt_q);
	while (m_head) {
		struct __kern_channel_ring *kring;

		pkt_drop = FALSE;
		m = m_head;
		m_head = m_head->m_nextpkt;
		m->m_nextpkt = NULL;

		uint32_t sc_idx = MBUF_SCIDX(m_get_service_class(m));
		struct __kern_packet *kpkt;

		/*
		 * nx_netif_host_catch_tx() steers output packets here
		 * only for native interfaces; we must never get here
		 * for compat.
		 */

		ASSERT(sc_idx < KPKT_SC_MAX_CLASSES);
		kring = &hwna->na_tx_rings[hwna->na_kring_svc_lut[sc_idx]];
		if (currentkring != kring) {
			if (currentkring != NULL) {
				KDBG((SK_KTRACE_NETIF_HOST_ENQUEUE | DBG_FUNC_END), SK_KVA(currentkring),
				    error);
			}
			currentkring = kring;
			KDBG((SK_KTRACE_NETIF_HOST_ENQUEUE | DBG_FUNC_START), SK_KVA(currentkring));
		}
		if (__improbable(!NA_IS_ACTIVE(hwna) || !NA_IS_ACTIVE(hostna))) {
			STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
			SK_ERR("\"%s\" (%p) not in skywalk mode anymore",
			    hwna->na_name, SK_KVA(hwna));
			error = ENXIO;
			pkt_drop = TRUE;
			goto out;
		}
		/*
		 * Drop if the kring no longer accepts packets.
		 */
		if (__improbable(KR_DROP(&hostna->na_rx_rings[0]) || KR_DROP(kring))) {
			STATS_INC(nifs, NETIF_STATS_DROP_KRDROP_MODE);
			/* not a serious error, so no need to be chatty here */
			SK_DF(SK_VERB_NETIF,
			    "kr \"%s\" (%p) krflags 0x%x or %s in drop mode",
			    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
			    ifp->if_xname);
			error = ENXIO;
			pkt_drop = TRUE;
			goto out;
		}
		if (__improbable(((unsigned)m_pktlen(m) + ifp->if_tx_headroom) >
		    kring->ckr_max_pkt_len)) {     /* too long for us */
			STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
			SK_ERR("\"%s\" (%p) from_host, drop packet size %u > %u",
			    hwna->na_name, SK_KVA(hwna), m_pktlen(m),
			    kring->ckr_max_pkt_len);
			pkt_drop = TRUE;
			goto out;
		}
		/*
		 * Convert mbuf to packet and enqueue it.
		 */
		kpkt = nx_netif_mbuf_to_kpkt(hwna, m);
		if (kpkt == NULL) {
			error = ENOBUFS;
			pkt_drop = TRUE;
			goto out;
		}

		if ((m->m_pkthdr.pkt_flags & PKTF_SKIP_PKTAP) == 0 &&
		    pktap_total_tap_count != 0) {
			int af = get_af_from_mbuf(m);

			if (af != AF_UNSPEC) {
				nx_netif_pktap_output(ifp, af, kpkt);
			}
		}
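		/*
		 * Resolve the qset once, from the first packet that maps
		 * to one; every subsequent packet in this chain is tagged
		 * with the same qset index.
		 */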
		if (!qset_id_valid) {
			if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_QSET_ID_VALID) {
				kpkt->pkt_pflags |= PKT_F_PRIV_HAS_QSET_ID;
				kpkt->pkt_priv =
				    __unsafe_forge_single(void *, m->m_pkthdr.pkt_mpriv_qsetid);
			}

			qset = nx_netif_find_qset_with_pkt(ifp, kpkt);
			if (qset != NULL) {
				qset_id_valid = true;
			}
		}

		if (qset != NULL) {
			kpkt->pkt_qset_idx = qset->nqs_idx;
		}

		/*
		 * If the upper layers have set an expiry deadline for
		 * the mbuf, propagate it to the kernel packet.
		 */
		if (m->m_pkthdr.pkt_deadline) {
			kpkt->pkt_com_opt->__po_expire_ts = m->m_pkthdr.pkt_deadline;
			kpkt->pkt_pflags |= (PKT_F_OPT_EXPIRE_TS | PKT_F_OPT_EXP_ACTION);
		}

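		/*
		 * With chain enqueue enabled, packets are batched on
		 * pkt_q and handed to the classq in a single
		 * ifnet_enqueue_pkt_chain() call after the loop;
		 * otherwise each packet is enqueued individually here.
		 */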
		if (!netif_chain_enqueue_enabled(ifp)) {
			if (qset != NULL) {
				error = ifnet_enqueue_pkt(ifp,
				    qset->nqs_ifcq, kpkt,
				    false, &pkt_drop);
				nx_netif_qset_release(&qset);
			} else {
				/* callee consumes packet */
				error = ifnet_enqueue_pkt(ifp, ifp->if_snd, kpkt, false, &pkt_drop);
			}

			if (pkt_drop) {
				STATS_INC(nifs, NETIF_STATS_TX_DROP_ENQ_AQM);
			}
		} else {
			KPKTQ_ENQUEUE(&pkt_q, kpkt);
			n_pkts++;
			n_bytes += m->m_pkthdr.len;
		}
out:
		/* always free mbuf (even in the success case) */
		m->m_nextpkt = free_list;
		free_list = m;

		if (__improbable(pkt_drop)) {
			STATS_INC(nifs, NETIF_STATS_DROP);
		}

		if (__improbable(error)) {
			break;
		}
	}

	if (currentkring != NULL) {
		KDBG((SK_KTRACE_NETIF_HOST_ENQUEUE | DBG_FUNC_END), SK_KVA(currentkring),
		    error);
	}

	if (__probable(!KPKTQ_EMPTY(&pkt_q))) {
		pkt_chain_head = KPKTQ_FIRST(&pkt_q);
		pkt_chain_tail = KPKTQ_LAST(&pkt_q);
		if (qset != NULL) {
			error = ifnet_enqueue_pkt_chain(ifp, qset->nqs_ifcq,
			    pkt_chain_head, pkt_chain_tail, n_pkts, n_bytes, false, &pkt_drop);
			nx_netif_qset_release(&qset);
		} else {
			/* callee consumes packet */
			error = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_chain_head,
			    pkt_chain_tail, n_pkts, n_bytes, false, &pkt_drop);
		}
		if (pkt_drop) {
			STATS_ADD(nifs, NETIF_STATS_TX_DROP_ENQ_AQM, n_pkts);
			STATS_ADD(nifs, NETIF_STATS_DROP, n_pkts);
		}
	}

	if (error) {
		drop_list = m_head;
		while (m_head != NULL) {
			m_head = m_head->m_nextpkt;
			STATS_INC(nifs, NETIF_STATS_DROP);
		}
		m_freem_list(drop_list);
	}
	m_freem_list(free_list);

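	/* Kick the netif TX path so the enqueued packets get drained. */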
	netif_transmit(ifp, NETIF_XMIT_FLAG_HOST);

	return error;
}

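/*
 * Compute the link-layer header length as the offset of
 * m_pkthdr.pkt_hdr (which points at the L3 header) from the
 * start of the mbuf chain's data.
 */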
static inline int
get_l2_hlen(struct mbuf *m, uint8_t *l2len)
{
	/*
	 * -fbounds-safety: Although m_pkthdr.pkt_hdr is a void * without
	 * annotations, here we mark char *pkt_hdr as __single because we don't
	 * dereference this pointer, and we're mostly just using this pointer
	 * for comparisons.
	 */
	char *__single pkt_hdr;
	struct mbuf *m0;
	uint64_t len = 0;

	pkt_hdr = m->m_pkthdr.pkt_hdr;
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (pkt_hdr >= m_mtod_current(m0) &&
		    pkt_hdr < m_mtod_current(m0) + m0->m_len) {
			break;
		}
		len += m0->m_len;
	}
	if (m0 == NULL) {
		DTRACE_SKYWALK2(bad__pkthdr, struct mbuf *, m, char *, pkt_hdr);
		return EINVAL;
	}
	len += (pkt_hdr - m_mtod_current(m0));
	if (len > UINT8_MAX) {
		DTRACE_SKYWALK2(bad__l2len, struct mbuf *, m, uint64_t, len);
		return EINVAL;
	}
	*l2len = (uint8_t)len;
	return 0;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_mbuf_to_kpkt_log(struct __kern_packet *kpkt, uint32_t len,
    uint32_t poff)
{
	uint8_t *baddr;
	uint32_t pkt_len;

	MD_BUFLET_ADDR_ABS(kpkt, baddr);
	pkt_len = __packet_get_real_data_length(kpkt);
	SK_DF(SK_VERB_HOST | SK_VERB_TX, "mlen %u dplen %u"
	    " hr %u l2 %u poff %u", len, kpkt->pkt_length,
	    kpkt->pkt_headroom, kpkt->pkt_l2_len, poff);
	SK_DF(SK_VERB_HOST | SK_VERB_TX | SK_VERB_DUMP, "%s",
	    sk_dump("buf", baddr, pkt_len, 128));
}
#endif /* SK_LOG */

static inline struct __kern_packet *
nx_netif_mbuf_to_kpkt(struct nexus_adapter *na, struct mbuf *m)
{
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	uint16_t poff = na->na_ifp->if_tx_headroom;
	uint32_t len;
	struct kern_pbufpool *pp;
	struct __kern_packet *kpkt;
	kern_packet_t ph;
	boolean_t copysum;
	uint8_t l2hlen;
	int err;

	pp = skmem_arena_nexus(na->na_arena)->arn_tx_pp;
	ASSERT((pp != NULL) && (pp->pp_md_type == NEXUS_META_TYPE_PACKET) &&
	    (pp->pp_md_subtype == NEXUS_META_SUBTYPE_RAW));
	ASSERT(!PP_HAS_TRUNCATED_BUF(pp));

	len = m_pktlen(m);
	VERIFY((poff + len) <= (PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags));

	/* alloc packet */
	ph = pp_alloc_packet_by_size(pp, poff + len, SKMEM_NOSLEEP);
	if (__improbable(ph == 0)) {
		STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
		SK_DF(SK_VERB_MEM,
		    "%s(%d) pp \"%s\" (%p) has no more "
		    "packet for %s", sk_proc_name(current_proc()),
		    sk_proc_pid(current_proc()), pp->pp_name, SK_KVA(pp),
		    if_name(na->na_ifp));
		return NULL;
	}

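	/*
	 * When the stack left the checksum to us (CSUM_PARTIAL with
	 * valid csum data), the copy routine below folds the checksum
	 * computation into the copy itself; this is the combined
	 * copy-and-checksum behind the IFNET_CSUM_PARTIAL capability
	 * advertised in nx_netif_host_adjust_if_capabilities().
	 */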
	copysum = ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID |
	    CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL));

	STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
	if (copysum) {
		STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM);
	}

	kpkt = SK_PTR_ADDR_KPKT(ph);
	kpkt->pkt_link_flags = 0;
	nif->nif_pkt_copy_from_mbuf(NR_TX, ph, poff, m, 0, len,
	    copysum, m->m_pkthdr.csum_tx_start);

	kpkt->pkt_headroom = (uint8_t)poff;
	if ((err = get_l2_hlen(m, &l2hlen)) == 0) {
		kpkt->pkt_l2_len = l2hlen;
	} else {
		kpkt->pkt_l2_len = 0;
	}
	/* finalize the packet */
	METADATA_ADJUST_LEN(kpkt, 0, poff);
	err = __packet_finalize(ph);
	VERIFY(err == 0);

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_HOST) != 0) && kpkt != NULL) {
		nx_netif_mbuf_to_kpkt_log(kpkt, len, poff);
	}
#endif /* SK_LOG */

	return kpkt;
}
757