/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
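/*
 * Defining _IP_VHL exposes the combined version/header-length view of the
 * IP header in <netinet/ip.h>, which enables the IP_VHL_V() accessor used
 * by get_af_from_mbuf() below.
 */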
#define _IP_VHL
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <net/ethernet.h>
#include <net/pktap.h>
#include <sys/kdebug.h>
#include <sys/sdt.h>

#define DBG_FUNC_NX_NETIF_HOST_ENQUEUE \
	SKYWALKDBG_CODE(DBG_SKYWALK_NETIF, 2)

static void nx_netif_host_catch_tx(struct nexus_adapter *, boolean_t);
static inline struct __kern_packet *
nx_netif_mbuf_to_kpkt(struct nexus_adapter *, struct mbuf *);

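/*
 * Checksum-related capability bits that Skywalk toggles as a group when
 * taking over (and later restoring) the interface's advertised features.
 */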
#define SK_IFCAP_CSUM (IFCAP_HWCSUM|IFCAP_CSUM_PARTIAL|IFCAP_CSUM_ZERO_INVERT)

int
nx_netif_host_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
{
	struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
	struct nx_netif *nif = nifna->nifna_netif;
	struct ifnet *ifp = na->na_ifp;
	int error = 0;

	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name,
	    SK_KVA(na), na_activate_mode2str(mode));

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		ASSERT(SKYWALK_CAPABLE(ifp));
		/*
		 * Make Skywalk control the packet steering.  Don't
		 * intercept tx packets if this is a netif compat
		 * adapter attached to a flowswitch.
		 */
		nx_netif_host_catch_tx(na, TRUE);

		/* XXX: [email protected] - disable TSO and LRO for now */
		ifnet_lock_exclusive(ifp);
		nif->nif_hwassist = ifp->if_hwassist;
		nif->nif_capabilities = ifp->if_capabilities;
		nif->nif_capenable = ifp->if_capenable;
		ifp->if_hwassist &= ~(IFNET_CHECKSUMF | IFNET_TSOF);
		ifp->if_capabilities &= ~(SK_IFCAP_CSUM | IFCAP_TSO);
		ifp->if_capenable &= ~(SK_IFCAP_CSUM | IFCAP_TSO);

		/*
		 * Re-enable the capabilities which the Skywalk layer
		 * provides:
		 *
		 * Native driver: a copy from packet to mbuf always occurs
		 * for each inbound and outbound packet; we leverage the
		 * combined copy and checksum, and thus advertise the
		 * capabilities.  We also always enable 16KB jumbo mbuf
		 * support.
		 *
		 * Compat driver: inbound and outbound mbufs don't incur a
		 * copy, so leave the driver's advertised flags alone.
		 */
		if (NA_KERNEL_ONLY(na)) {
			if (na->na_type == NA_NETIF_HOST) { /* native */
				ifp->if_hwassist |= (IFNET_CSUM_TCP |
				    IFNET_CSUM_UDP | IFNET_CSUM_TCPIPV6 |
				    IFNET_CSUM_UDPIPV6 | IFNET_CSUM_PARTIAL |
				    IFNET_CSUM_ZERO_INVERT | IFNET_MULTIPAGES);
				ifp->if_capabilities |= SK_IFCAP_CSUM;
				ifp->if_capenable |= SK_IFCAP_CSUM;
				if (sk_fsw_tx_agg_tcp != 0) {
					ifp->if_hwassist |= IFNET_TSOF;
					ifp->if_capabilities |= IFCAP_TSO;
					ifp->if_capenable |= IFCAP_TSO;
				}
			} else { /* compat */
				ifp->if_hwassist |=
				    (nif->nif_hwassist &
				    (IFNET_CHECKSUMF | IFNET_TSOF));
				ifp->if_capabilities |=
				    (nif->nif_capabilities &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
				ifp->if_capenable |=
				    (nif->nif_capenable &
				    (SK_IFCAP_CSUM | IFCAP_TSO));
			}
		}
		ifnet_lock_done(ifp);

		atomic_bitset_32(&na->na_flags, NAF_ACTIVE);
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		ASSERT(SKYWALK_CAPABLE(ifp));
		break;

	case NA_ACTIVATE_MODE_OFF:
		/* Release packet steering control. */
		nx_netif_host_catch_tx(na, FALSE);

		/*
		 * Note that here we cannot assert SKYWALK_CAPABLE()
		 * as we're called in the destructor path.
		 */
		atomic_bitclear_32(&na->na_flags, NAF_ACTIVE);

		ifnet_lock_exclusive(ifp);
		/* Unset any capabilities previously set by Skywalk */
		ifp->if_hwassist &= ~(IFNET_CHECKSUMF | IFNET_MULTIPAGES);
		ifp->if_capabilities &= ~SK_IFCAP_CSUM;
		ifp->if_capenable &= ~SK_IFCAP_CSUM;
		if ((sk_fsw_tx_agg_tcp != 0) &&
		    (na->na_type == NA_NETIF_HOST)) {
			ifp->if_hwassist &= ~IFNET_TSOF;
			ifp->if_capabilities &= ~IFCAP_TSO;
			ifp->if_capenable &= ~IFCAP_TSO;
		}
		/* Restore driver original flags */
		ifp->if_hwassist |= (nif->nif_hwassist &
		    (IFNET_CHECKSUMF | IFNET_TSOF | IFNET_MULTIPAGES));
		ifp->if_capabilities |=
		    (nif->nif_capabilities & (SK_IFCAP_CSUM | IFCAP_TSO));
		ifp->if_capenable |=
		    (nif->nif_capenable & (SK_IFCAP_CSUM | IFCAP_TSO));
		ifnet_lock_done(ifp);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	return error;
}

/* na_krings_create callback for netif host adapters */
int
nx_netif_host_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
{
	int ret;

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	ret = na_rings_mem_setup(na, 0, FALSE, ch);
	if (ret == 0) {
		struct __kern_channel_ring *kring;
		uint32_t i;

		/* drop by default until fully bound */
		if (NA_KERNEL_ONLY(na)) {
			na_kr_drop(na, TRUE);
		}

		for (i = 0; i < na_get_nrings(na, NR_RX); i++) {
			kring = &NAKR(na, NR_RX)[i];
			/* initialize the nx_mbq for the sw rx ring */
			nx_mbq_safe_init(kring, &kring->ckr_rx_queue,
			    NX_MBQ_NO_LIMIT, &nexus_mbq_lock_group,
			    &nexus_lock_attr);
			SK_DF(SK_VERB_NETIF,
			    "na \"%s\" (0x%llx) initialized host kr \"%s\" "
			    "(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na),
			    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
			    CKRF_BITS);
		}
	}
	return ret;
}

/*
 * Destructor for netif host adapters; they also have an mbuf queue
 * on the rings connected to the host, so we need to purge them first.
 */
void
nx_netif_host_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
    boolean_t defunct)
{
	struct __kern_channel_ring *kring;
	uint32_t i;

	SK_LOCK_ASSERT_HELD();
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	if (NA_KERNEL_ONLY(na)) {
		na_kr_drop(na, TRUE);
	}

	for (i = 0; i < na_get_nrings(na, NR_RX); i++) {
		struct nx_mbq *q;

		kring = &NAKR(na, NR_RX)[i];
		q = &kring->ckr_rx_queue;
		SK_DF(SK_VERB_NETIF,
		    "na \"%s\" (0x%llx) destroy host kr \"%s\" (0x%llx) "
		    "krflags 0x%b with qlen %u", na->na_name, SK_KVA(na),
		    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
		    CKRF_BITS, nx_mbq_len(q));
		nx_mbq_purge(q);
		if (!defunct) {
			nx_mbq_safe_destroy(q);
		}
	}

	na_rings_mem_teardown(na, ch, defunct);
}

/* kring->ckr_na_sync callback for the host rx ring */
int
nx_netif_host_na_rxsync(struct __kern_channel_ring *kring,
    struct proc *p, uint32_t flags)
{
#pragma unused(kring, p, flags)
	return 0;
}

/*
 * kring->ckr_na_sync callback for the host tx ring.
 */
int
nx_netif_host_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
    uint32_t flags)
{
#pragma unused(kring, p, flags)
	return 0;
}

int
nx_netif_host_na_special(struct nexus_adapter *na, struct kern_channel *ch,
    struct chreq *chr, nxspec_cmd_t spec_cmd)
{
	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	return nx_netif_na_special_common(na, ch, chr, spec_cmd);
}

/*
 * Intercept the packet steering routine in the tx path,
 * so that we can decide which queue is used for an mbuf.
 * Second argument is TRUE to intercept, FALSE to restore.
 */
static void
nx_netif_host_catch_tx(struct nexus_adapter *na, boolean_t enable)
{
	struct ifnet *ifp = na->na_ifp;
	int err = 0;

	ASSERT(na->na_type == NA_NETIF_HOST ||
	    na->na_type == NA_NETIF_COMPAT_HOST);
	ASSERT(na->na_flags & NAF_HOST_ONLY);

	/*
	 * The common case is NA_KERNEL_ONLY, where the netif is plumbed
	 * below the flowswitch.  For TXSTART compat drivers and legacy:
	 * don't intercept the DLIL output handler, since in this model
	 * packets from both the BSD stack and the flowswitch are directly
	 * enqueued to the classq via ifnet_enqueue().
	 *
	 * Otherwise, it's the uncommon case where a user channel is
	 * opened directly to the netif.  Here we either intercept
	 * or restore the DLIL output handler.
	 */
	if (enable) {
		if (__improbable(!NA_KERNEL_ONLY(na))) {
			return;
		}
		/*
		 * For native drivers only, intercept if_output();
		 * for compat, leave it alone since we don't need
		 * to perform any mbuf-pkt conversion.
		 */
		if (na->na_type == NA_NETIF_HOST) {
			err = ifnet_set_output_handler(ifp,
			    sk_fsw_tx_agg_tcp ? netif_gso_dispatch :
			    nx_netif_host_output);
			VERIFY(err == 0);
		}
	} else {
		if (__improbable(!NA_KERNEL_ONLY(na))) {
			return;
		}
		/*
		 * Restore original if_output() for native drivers.
		 */
		if (na->na_type == NA_NETIF_HOST) {
			ifnet_reset_output_handler(ifp);
		}
	}
}

static int
get_af_from_mbuf(struct mbuf *m)
{
	uint8_t *pkt_hdr;
	uint8_t ipv;
	struct mbuf *m0;
	int af;

	pkt_hdr = m->m_pkthdr.pkt_hdr;
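	/*
	 * Walk the chain to verify that the saved packet-header pointer
	 * actually lies within one of the mbufs' data buffers.
	 */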
	for (m0 = m; m0 != NULL; m0 = m0->m_next) {
		if (pkt_hdr >= (uint8_t *)m0->m_data &&
		    pkt_hdr < (uint8_t *)m0->m_data + m0->m_len) {
			break;
		}
	}
	if (m0 == NULL) {
		DTRACE_SKYWALK1(bad__pkthdr, struct mbuf *, m);
		af = AF_UNSPEC;
		goto done;
	}
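	/*
	 * The first nibble of the IP header carries the version:
	 * 4 for IPv4, 6 for IPv6; anything else is unclassifiable.
	 */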
	ipv = IP_VHL_V(*pkt_hdr);
	if (ipv == 4) {
		af = AF_INET;
	} else if (ipv == 6) {
		af = AF_INET6;
	} else {
		af = AF_UNSPEC;
	}
done:
	DTRACE_SKYWALK2(mbuf__af, int, af, struct mbuf *, m);
	return af;
}

/*
 * if_output() callback called by dlil_output() to handle mbufs coming out
 * of the host networking stack. The mbuf will get converted to a packet,
 * and enqueued to the classq of a Skywalk native interface.
 */
int
nx_netif_host_output(struct ifnet *ifp, struct mbuf *m)
{
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	struct kern_nexus *nx = nif->nif_nx;
	struct nexus_adapter *hwna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
	struct nexus_adapter *hostna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
	struct __kern_channel_ring *kring;
	uint32_t sc_idx = MBUF_SCIDX(m_get_service_class(m));
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
	struct __kern_packet *kpkt;
	errno_t error = ENOBUFS;
	boolean_t pkt_drop = FALSE;

	/*
	 * nx_netif_host_catch_tx() only steers output packets here
	 * for native interfaces; we must never get here for compat.
	 */
	ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
	ASSERT(m->m_nextpkt == NULL);
	ASSERT(hostna->na_type == NA_NETIF_HOST);
	ASSERT(sc_idx < KPKT_SC_MAX_CLASSES);

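	/* Map the mbuf's service class to the matching hardware tx ring. */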
	kring = &hwna->na_tx_rings[hwna->na_kring_svc_lut[sc_idx]];
	KDBG((SK_KTRACE_NETIF_HOST_ENQUEUE | DBG_FUNC_START), SK_KVA(kring));
	if (__improbable(!NA_IS_ACTIVE(hwna) || !NA_IS_ACTIVE(hostna))) {
		STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
		SK_ERR("\"%s\" (0x%llx) not in skywalk mode anymore",
		    hwna->na_name, SK_KVA(hwna));
		error = ENXIO;
		pkt_drop = TRUE;
		goto done;
	}
	/*
	 * Drop if the kring no longer accepts packets.
	 */
	if (__improbable(KR_DROP(&hostna->na_rx_rings[0]) || KR_DROP(kring))) {
		STATS_INC(nifs, NETIF_STATS_DROP_KRDROP_MODE);
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_NETIF,
		    "kr \"%s\" (0x%llx) krflags 0x%b or %s in drop mode",
		    kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
		    CKRF_BITS, ifp->if_xname);
		error = ENXIO;
		pkt_drop = TRUE;
		goto done;
	}
	if (__improbable(((unsigned)m_pktlen(m) + ifp->if_tx_headroom) >
	    kring->ckr_max_pkt_len)) { /* too long for us */
		STATS_INC(nifs, NETIF_STATS_DROP_BADLEN);
		SK_ERR("\"%s\" (0x%llx) from_host, drop packet size %u > %u",
		    hwna->na_name, SK_KVA(hwna), m_pktlen(m),
		    kring->ckr_max_pkt_len);
		pkt_drop = TRUE;
		goto done;
	}
	/*
	 * Convert mbuf to packet and enqueue it.
	 */
	kpkt = nx_netif_mbuf_to_kpkt(hwna, m);
	if (__probable(kpkt != NULL)) {
		if ((m->m_pkthdr.pkt_flags & PKTF_SKIP_PKTAP) == 0 &&
		    pktap_total_tap_count != 0) {
			int af = get_af_from_mbuf(m);

			if (af != AF_UNSPEC) {
				nx_netif_pktap_output(ifp, af, kpkt);
			}
		}
		/* callee consumes packet */
		error = ifnet_enqueue_pkt(ifp, kpkt, false, &pkt_drop);
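		/* trigger the transmit path for host-originated traffic */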
		netif_transmit(ifp, NETIF_XMIT_FLAG_HOST);
		if (pkt_drop) {
			STATS_INC(nifs, NETIF_STATS_TX_DROP_ENQ_AQM);
		}
	} else {
		error = ENOBUFS;
		pkt_drop = TRUE;
	}
done:
	/* always free mbuf (even in the success case) */
	m_freem(m);
	if (__improbable(pkt_drop)) {
		STATS_INC(nifs, NETIF_STATS_DROP);
	}

	KDBG((SK_KTRACE_NETIF_HOST_ENQUEUE | DBG_FUNC_END), SK_KVA(kring),
	    error);

	return error;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_netif_mbuf_to_kpkt_log(struct __kern_packet *kpkt, uint32_t len,
    uint32_t poff)
{
	uint8_t *baddr;
	MD_BUFLET_ADDR_ABS(kpkt, baddr);
	SK_DF(SK_VERB_HOST | SK_VERB_TX, "mlen %u dplen %u"
	    " hr %u l2 %u poff %u", len, kpkt->pkt_length,
	    kpkt->pkt_headroom, kpkt->pkt_l2_len, poff);
	SK_DF(SK_VERB_HOST | SK_VERB_TX | SK_VERB_DUMP, "%s",
	    sk_dump("buf", baddr, kpkt->pkt_length, 128, NULL, 0));
}
#endif /* SK_LOG */

static inline struct __kern_packet *
nx_netif_mbuf_to_kpkt(struct nexus_adapter *na, struct mbuf *m)
{
	struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats;
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	uint16_t poff = na->na_ifp->if_tx_headroom;
	uint32_t len;
	struct kern_pbufpool *pp;
	struct __kern_packet *kpkt;
	kern_packet_t ph;
	boolean_t copysum;
	int err;

	pp = skmem_arena_nexus(na->na_arena)->arn_tx_pp;
	ASSERT((pp != NULL) && (pp->pp_md_type == NEXUS_META_TYPE_PACKET) &&
	    (pp->pp_md_subtype == NEXUS_META_SUBTYPE_RAW));
	ASSERT(!PP_HAS_TRUNCATED_BUF(pp));

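	/* payload plus tx headroom must fit in one multi-buflet packet */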
	len = m_pktlen(m);
	VERIFY((poff + len) <= (pp->pp_buflet_size * pp->pp_max_frags));

	/* alloc packet */
	ph = pp_alloc_packet_by_size(pp, poff + len, SKMEM_NOSLEEP);
	if (__improbable(ph == 0)) {
		STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT);
		SK_DF(SK_VERB_MEM,
		    "%s(%d) pp \"%s\" (0x%llx) has no more "
		    "packet for %s", sk_proc_name_address(current_proc()),
		    sk_proc_pid(current_proc()), pp->pp_name, SK_KVA(pp),
		    if_name(na->na_ifp));
		return NULL;
	}

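	/*
	 * If the stack requested partial checksum offload, let the copy
	 * routine compute the checksum while copying the payload.
	 */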
	copysum = ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID |
	    CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL));

	STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF);
	if (copysum) {
		STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM);
	}

	kpkt = SK_PTR_ADDR_KPKT(ph);
	kpkt->pkt_link_flags = 0;
	nif->nif_pkt_copy_from_mbuf(NR_TX, ph, poff, m, 0, len,
	    copysum, m->m_pkthdr.csum_tx_start);

	kpkt->pkt_headroom = (uint8_t)poff;
	kpkt->pkt_l2_len = 0;

	/* finalize the packet */
	METADATA_ADJUST_LEN(kpkt, 0, poff);
	err = __packet_finalize(ph);
	VERIFY(err == 0);

#if SK_LOG
	if (__improbable((sk_verbose & SK_VERB_HOST) != 0) && kpkt != NULL) {
		nx_netif_mbuf_to_kpkt_log(kpkt, len, poff);
	}
#endif /* SK_LOG */

	return kpkt;
}