1 /*
2 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <netinet/in_arp.h>
33 #include <netinet/ip6.h>
34 #include <netinet6/in6_var.h>
35 #include <netinet6/nd6.h>
36 #include <net/ethernet.h>
37 #include <net/route.h>
38 #include <sys/eventhandler.h>
39 #include <net/sockaddr_utils.h>
40 #include <kern/uipc_domain.h>
41
42 #define FSW_ETHER_LEN_PADDED 16
43 #define FSW_ETHER_PADDING (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
44 #define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED
45
46 static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
47 static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
48 struct __kern_packet *);
49 static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
50 struct __kern_packet *);
51 static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
52 struct __kern_packet *);
53
54 extern struct rtstat_64 rtstat;
55
56 int
fsw_ethernet_setup(struct nx_flowswitch * fsw,struct ifnet * ifp)57 fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
58 {
59 struct ifaddr *lladdr = ifp->if_lladdr;
60
61 if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
62 SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
63 return ENOTSUP;
64 }
65
66 ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
67 fsw->fsw_ctor = fsw_ethernet_ctor;
68 fsw->fsw_resolve = fsw_ethernet_resolve;
69 fsw->fsw_frame = fsw_ethernet_frame;
70 fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
71 fsw->fsw_demux = fsw_ethernet_demux;
72
73 return 0;
74 }
75
76 static void
fsw_ethernet_ctor(struct nx_flowswitch * fsw,struct flow_route * fr)77 fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
78 {
79 ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);
80
81 fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
82 bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
83 fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
84 htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));
85
86 /* const override */
87 static_assert(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
88 static_assert(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
89 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
90 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;
91
92 SK_DF(SK_VERB_FLOW_ROUTE,
93 "fr %p eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
94 SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
95 fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
96 fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
97 fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
98 }
99
/*
 * Resolve (or re-probe) the link-layer destination address of a flow
 * route on an Ethernet interface.
 *
 * On success the flow route carries a resolved Ethernet destination
 * (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO set).  Returns 0 on success,
 * EJUSTRETURN when the packet's mbuf was handed off to the ARP/ND hold
 * queue pending resolution, or another errno on failure.
 *
 * mbuf ownership: if `pkt' carries an attached mbuf (PKT_F_MBUF_DATA),
 * that mbuf may be temporarily detached for the lookup; it is either
 * re-attached at `done' or consumed by ARP/ND (EJUSTRETURN).  Otherwise
 * a transient mbuf copy of `pkt' may be allocated and is freed here
 * unless queued.  Caller always retains ownership of `pkt' itself.
 */
static int
fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	struct sockaddr *tgt_sa = NULL;
	struct mbuf *m = NULL;
	boolean_t reattach_mbuf = FALSE;	/* set iff we detached pkt's own mbuf */
	boolean_t probing;
	int err = 0;
	uint64_t pkt_mflags_restore; /* Save old mbuf flags to restore in error cases */

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_sa = SA(&fr->fr_faddr);
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_sa = SA(&fr->fr_gaddr);
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary
	 * (missing/downed route, or an explicit reconfigure request).
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flow_route_configure() may have changed them */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_sa = SA(&fr->fr_faddr);
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_sa = SA(&fr->fr_gaddr);
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor via a gateway: nowhere to send this flow */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_sa != NULL);
	ASSERT(tgt_rt != NULL);

	/*
	 * Attempt to convert kpkt to mbuf before acquiring the
	 * rt lock so that the lock won't be held if we need to do
	 * a blocking mbuf allocation.
	 */
	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
		/*
		 * We need to resolve; if caller passes in a kpkt,
		 * convert the kpkt within to mbuf.  Caller is then
		 * responsible for freeing kpkt.  In future, we could
		 * optimize this by having the ARP/ND lookup routines
		 * understand kpkt and perform the conversion only
		 * when it is needed.
		 */
		if (__probable(pkt != NULL)) {
			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
				/* borrow the packet's own mbuf; restore at done: */
				reattach_mbuf = TRUE;
				m = pkt->pkt_mbuf;
				pkt_mflags_restore = (pkt->pkt_pflags & PKT_F_MBUF_MASK);
				KPKT_CLEAR_MBUF_DATA(pkt);
			} else {
				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
			}
			if (m == NULL) {
				/* not a fatal error; move on */
				SK_ERR("failed to allocate mbuf while "
				    "resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
		} else {
			m = NULL;
		}
	}

	RT_LOCK(tgt_rt);

	/* the target route must point straight at an Ethernet neighbor */
	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
		rtstat.rts_badrtgwroute++;
		err = ENETUNREACH;
		RT_UNLOCK(tgt_rt);
		SK_ERR("bad gateway route %s on %s (err %d)",
		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		goto done;
	}

	/*
	 * If already resolved, grab the link-layer address and mark the
	 * flow route accordingly.  Given that we will use the cached
	 * link-layer info, there's no need to convert and enqueue the
	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
	 */
	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
		VERIFY(m == NULL);
		/* XXX Remove explicit __bidi_indexable once rdar://119193012 lands */
		struct sockaddr_dl *__bidi_indexable sdl =
		    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
		FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
		os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
		/* if we're not probing, then we're done */
		if (!(probing = (fr->fr_want_probe != 0))) {
			VERIFY(err == 0);
			RT_UNLOCK(tgt_rt);
			goto done;
		}
		/* consume the probe request before issuing it below */
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		probing = FALSE;
		os_atomic_andnot(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
	    sizeof(dst_s)), ifp->if_xname);

	/*
	 * Trigger ARP/NDP resolution or probing.
	 */
	switch (tgt_sa->sa_family) {
	case AF_INET: {
		struct sockaddr_dl sdl;

		RT_UNLOCK(tgt_rt);
		/*
		 * Note we pass NULL as "hint" parameter, as tgt_sa
		 * is already referring to the target address.
		 */
		SOCKADDR_ZERO(&sdl, sizeof(sdl));
		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
		    NULL, m);

		/*
		 * If we're resolving (not probing), and it's now resolved,
		 * grab the link-layer address and update the flow route.
		 * If we get EJUSTRETURN, the mbuf (if any) would have
		 * been added to the hold queue.  Any other return values
		 * including 0 means that we need to free it.
		 *
		 * If we're probing, we won't have any mbuf to deal with,
		 * and since we already have the cached llinfo we'll just
		 * return success even if we get EJUSTRETURN.
		 */
		if (!probing) {
			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "fast-resolve %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			}
			if (err == EJUSTRETURN && m != NULL) {
				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
				    "while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* ARP now owns the mbuf; don't touch it at done: */
				m = NULL;
			}
		} else {
			VERIFY(m == NULL);
			if (err == EJUSTRETURN) {
				err = 0;
			}
		}
		break;
	}

	case AF_INET6: {
		struct llinfo_nd6 *__single ln = tgt_rt->rt_llinfo;

		/*
		 * Check if the route is down.  RTF_LLINFO is set during
		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
		 * is deleted from the routing table.
		 */
		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("route unavailable for %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			RT_UNLOCK(tgt_rt);
			break;
		}

		/*
		 * If we're probing and IPv6 ND cache entry is STALE,
		 * use it anyway but also mark it for delayed probe
		 * and update the expiry.
		 */
		if (probing) {
			VERIFY(m == NULL);
			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
			if (ln->ln_state == ND6_LLINFO_STALE) {
				ln->ln_asked = 0;
				ND6_CACHE_STATE_TRANSITION(ln,
				    ND6_LLINFO_DELAY);
				ln_setexpire(ln, net_uptime() + nd6_delay);
				RT_UNLOCK(tgt_rt);

				/* rnh_lock must be taken without rt lock held */
				lck_mtx_lock(rnh_lock);
				nd6_sched_timeout(NULL, NULL);
				lck_mtx_unlock(rnh_lock);

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "NUD probe scheduled for %s on %s",
				    sk_sa_ntop(tgt_sa, dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			} else {
				RT_UNLOCK(tgt_rt);
			}
			VERIFY(err == 0);
			break;
		}

		/*
		 * If this is a permanent ND entry, we're done.
		 */
		if (ln->ln_expire == 0 &&
		    ln->ln_state == ND6_LLINFO_REACHABLE) {
			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
			    ETHER_ADDR_LEN) {
				err = EHOSTUNREACH;
				SK_ERR("invalid permanent route %s on %s"
				    "ln %p (err %d)",
				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
				    sizeof(dst_s)), ifp->if_xname,
				    SK_KVA(ln), err);
			} else {
				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
				    "permanent route %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* copy permanent address into the flow route */
				/*
				 * XXX Remove explicit __bidi_indexable once
				 * rdar://119193012 lands
				 */
				struct sockaddr_dl *__bidi_indexable sdl =
				    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
				VERIFY(err == 0);
			}
			RT_UNLOCK(tgt_rt);
			break;
		}

		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
		}

		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
			/*
			 * There is a neighbor cache entry, but no Ethernet
			 * address response yet.  Replace the held mbuf
			 * (if any) with the one we have (if any),
			 * else leave it alone.
			 *
			 * This code conforms to the rate-limiting rule
			 * described in Section 7.2.2 of RFC 4861, because
			 * the timer is set correctly after sending an
			 * NS below.
			 */
			if (m != NULL) {
				if (ln->ln_hold != NULL) {
					m_freem_list(ln->ln_hold);
				}
				/* ND hold queue takes ownership of the mbuf */
				ln->ln_hold = m;
				m = NULL;

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "packet queued while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
			VERIFY(ndi != NULL && ndi->initialized);
			ln->ln_asked++;
			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
			RT_UNLOCK(tgt_rt);

			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s"
			    "ln %p state %u", sk_sa_ntop(rt_key(tgt_rt),
			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
			    ln->ln_state);

			/* XXX Refactor this to use same src ip */
			nd6_ns_output(tgt_rt->rt_ifp, NULL,
			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL, 0);

			lck_mtx_lock(rnh_lock);
			nd6_sched_timeout(NULL, NULL);
			lck_mtx_unlock(rnh_lock);
			/* tell the caller the packet was queued for resolution */
			err = EJUSTRETURN;
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname);
			/*
			 * The neighbor cache entry has been resolved;
			 * copy the address into the flow route.
			 */
			/*
			 * XXX Remove explicit __bidi_indexable once
			 * rdar://119193012 lands
			 */
			struct sockaddr_dl *__bidi_indexable sdl =
			    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
			FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
			os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			RT_UNLOCK(tgt_rt);
			VERIFY(err == 0);
		}
		/*
		 * XXX Need to optimize for the NDP garbage
		 * collection.  It would be even better to unify
		 * BSD/SK NDP management through the completion
		 * of L2/L3 split.
		 */
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	RT_LOCK_ASSERT_NOTHELD(tgt_rt);

done:
	/* any mbuf still in hand is either re-attached to pkt or freed */
	if (m != NULL) {
		if (reattach_mbuf) {
			pkt->pkt_mbuf = m;
			pkt->pkt_pflags |= pkt_mflags_restore;
		} else {
			m_freem_list(m);
		}
		m = NULL;
	}

	if (__improbable(err != 0 && err != EJUSTRETURN)) {
		SK_ERR("route to %s on %s can't be resolved (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}

	FR_UNLOCK(fr);

	return err;
}
490
/*
 * Prepend the (padded) Ethernet header cached in the flow route onto
 * `pkt', adjusting headroom/length metadata.  If the packet also carries
 * an attached mbuf (PKT_F_MBUF_DATA), the same header is written into the
 * mbuf and its data pointer/lengths are fixed up in lockstep.
 *
 * Requires pkt_headroom >= FSW_ETHER_LEN_PADDED (asserted below); the
 * 16-byte padded copy lets us use a single sk_copy64_16 for the
 * 14-byte header.
 */
static void
fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
	/* in the event the source MAC address changed, update our copy */
	if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
		uint8_t old_shost[ETHER_ADDR_LEN];

		bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
		fsw_ethernet_ctor(fsw, fr);

		SK_ERR("fr %p source MAC address updated on %s, "
		    "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
		    SK_KVA(fr), if_name(fsw->fsw_ifp),
		    old_shost[0], old_shost[1],
		    old_shost[2], old_shost[3],
		    old_shost[4], old_shost[5],
		    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
		    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
		    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
	}

	static_assert(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);

	/* propagate link-layer multicast/broadcast classification */
	if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
		pkt->pkt_link_flags |= PKT_LINKF_MCAST;
	} else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
		pkt->pkt_link_flags |= PKT_LINKF_BCAST;
	}

	ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);

	/*
	 * Copy the padded header so that its last ETHER_HDR_LEN bytes
	 * (the real header) land immediately before the payload.
	 */
	char *pkt_buf;
	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
	sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
	    (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));

	pkt->pkt_headroom -= ETHER_HDR_LEN;
	pkt->pkt_l2_len = ETHER_HDR_LEN;

	if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
		/* frame and fix up mbuf */
		struct mbuf *m = pkt->pkt_mbuf;
		void *buf = m_mtod_current(m) - FSW_ETHER_LEN_PADDED;

		sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded, buf);
		/* mbuf must have exactly the headroom fsw reserved for framing */
		ASSERT((uintptr_t)m->m_data ==
		    (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
		m->m_data -= ETHER_HDR_LEN;
		m->m_len += ETHER_HDR_LEN;
		m_pktlen(m) += ETHER_HDR_LEN;
		ASSERT(m->m_len == m_pktlen(m));
		/* keep kpkt length in sync with the grown mbuf */
		pkt->pkt_length = m_pktlen(m);
	} else {
		METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
	}
}
548
549 static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch * fsw,struct __kern_packet * pkt)550 fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
551 {
552 #pragma unused(fsw)
553 const struct ether_header *eh;
554 sa_family_t af = AF_UNSPEC;
555 uint32_t bdlen, bdlim, bdoff;
556 uint8_t *baddr;
557
558 MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
559 baddr += pkt->pkt_headroom;
560 eh = (struct ether_header *)(void *)baddr;
561
562 if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
563 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
564 SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
565 return AF_UNSPEC;
566 }
567
568 if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
569 SK_ERR("ethernet header overrun 1st buflet");
570 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
571 return AF_UNSPEC;
572 }
573
574 if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
575 pkt->pkt_length -= ETHER_CRC_LEN;
576 pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
577 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
578 ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
579 m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
580 pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
581 }
582 }
583 pkt->pkt_l2_len = ETHER_HDR_LEN;
584 if ((eh->ether_dhost[0] & 1) == 0) {
585 /*
586 * When the driver is put into promiscuous mode we may receive
587 * unicast frames that are not intended for our interfaces.
588 * They are marked here as being promiscuous so the caller may
589 * dispose of them after passing the packets to any interface
590 * filters.
591 */
592 if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
593 pkt->pkt_pflags |= PKT_F_PROMISC;
594 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
595 return AF_UNSPEC;
596 }
597 }
598 uint16_t ether_type = ntohs(eh->ether_type);
599 switch (ether_type) {
600 case ETHERTYPE_IP:
601 af = AF_INET;
602 break;
603 case ETHERTYPE_IPV6:
604 af = AF_INET6;
605 break;
606 default:
607 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
608 break;
609 }
610
611 return af;
612 }
613