1 /*
2 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <netinet/in_arp.h>
33 #include <netinet/ip6.h>
34 #include <netinet6/in6_var.h>
35 #include <netinet6/nd6.h>
36 #include <net/ethernet.h>
37 #include <net/route.h>
38 #include <sys/eventhandler.h>
39
40 #define FSW_ETHER_LEN_PADDED 16
41 #define FSW_ETHER_PADDING (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
42 #define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED
43
44 static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
45 static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
46 struct __kern_packet *);
47 static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
48 struct __kern_packet *);
49 static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
50 struct __kern_packet *);
51
52 extern struct rtstat rtstat;
53
54 int
fsw_ethernet_setup(struct nx_flowswitch * fsw,struct ifnet * ifp)55 fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
56 {
57 struct ifaddr *lladdr = ifp->if_lladdr;
58
59 if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
60 SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
61 return ENOTSUP;
62 }
63
64 ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
65 fsw->fsw_ctor = fsw_ethernet_ctor;
66 fsw->fsw_resolve = fsw_ethernet_resolve;
67 fsw->fsw_frame = fsw_ethernet_frame;
68 fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
69 fsw->fsw_demux = fsw_ethernet_demux;
70
71 return 0;
72 }
73
74 static void
fsw_ethernet_ctor(struct nx_flowswitch * fsw,struct flow_route * fr)75 fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
76 {
77 ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);
78
79 fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
80 bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
81 fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
82 htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));
83
84 /* const override */
85 _CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
86 _CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
87 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
88 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;
89
90 SK_DF(SK_VERB_FLOW_ROUTE,
91 "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
92 SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
93 fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
94 fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
95 fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
96 }
97
/*
 * Resolve (or probe) the link-layer address of a flow route's target.
 *
 * The target is either the final destination (on-link) or the gateway
 * (off-link).  On success the resolved Ethernet destination is cached
 * in the flow route and FLOWRTF_RESOLVED|FLOWRTF_HAS_LLINFO are set.
 *
 * Returns:
 *   0           — resolved (or probe scheduled using cached llinfo)
 *   EJUSTRETURN — resolution in progress; if a packet was supplied it
 *                 has been handed to ARP/ND (queued on the hold queue)
 *   other errno — resolution failed; FLOWRTF_RESOLVED is cleared
 *
 * Caller retains ownership of 'pkt'; any mbuf we detached from it is
 * reattached (or freed) before returning, except when queued by ARP/ND.
 */
static int
fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	struct sockaddr *tgt_sa = NULL;
	struct mbuf *m = NULL;
	boolean_t reattach_mbuf = FALSE;
	boolean_t probing;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_sa = SA(&fr->fr_faddr);
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_sa = SA(&fr->fr_gaddr);
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary: no cached
	 * route, route went down, or a reconfigure was requested.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flags may have changed under reconfigure */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_sa = SA(&fr->fr_faddr);
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_sa = SA(&fr->fr_gaddr);
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor gateway: nothing we can resolve against */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_sa != NULL);
	ASSERT(tgt_rt != NULL);

	/*
	 * Attempt to convert kpkt to mbuf before acquiring the
	 * rt lock so that the lock won't be held if we need to do
	 * a blocking mbuf allocation.
	 */
	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
		/*
		 * We need to resolve; if caller passes in a kpkt,
		 * convert the kpkt within to mbuf.  Caller is then
		 * responsible for freeing kpkt.  In future, we could
		 * optimize this by having the ARP/ND lookup routines
		 * understand kpkt and perform the conversion only
		 * when it is needed.
		 */
		if (__probable(pkt != NULL)) {
			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
				/* detach the mbuf already attached to the kpkt */
				reattach_mbuf = TRUE;
				m = pkt->pkt_mbuf;
				KPKT_CLEAR_MBUF_DATA(pkt);
			} else {
				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
			}
			if (m == NULL) {
				/* not a fatal error; move on */
				SK_ERR("failed to allocate mbuf while "
				    "resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
		} else {
			m = NULL;
		}
	}

	RT_LOCK(tgt_rt);

	/* the target route must point directly at an Ethernet neighbor */
	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
		rtstat.rts_badrtgwroute++;
		err = ENETUNREACH;
		RT_UNLOCK(tgt_rt);
		SK_ERR("bad gateway route %s on %s (err %d)",
		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		goto done;
	}

	/*
	 * If already resolved, grab the link-layer address and mark the
	 * flow route accordingly.  Given that we will use the cached
	 * link-layer info, there's no need to convert and enqueue the
	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
	 */
	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
		VERIFY(m == NULL);
		FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
		os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
		/* if we're not probing, then we're done */
		if (!(probing = (fr->fr_want_probe != 0))) {
			VERIFY(err == 0);
			RT_UNLOCK(tgt_rt);
			goto done;
		}
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		/* no usable llinfo yet; full resolution required */
		probing = FALSE;
		os_atomic_andnot(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
	    sizeof(dst_s)), ifp->if_xname);

	/*
	 * Trigger ARP/NDP resolution or probing.
	 */
	switch (tgt_sa->sa_family) {
	case AF_INET: {
		struct sockaddr_dl sdl;

		/* arp_lookup_ip takes its own locks; drop ours first */
		RT_UNLOCK(tgt_rt);
		/*
		 * Note we pass NULL as "hint" parameter, as tgt_sa
		 * is already referring to the target address.
		 */
		bzero(&sdl, sizeof(sdl));
		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
		    NULL, m);

		/*
		 * If we're resolving (not probing), and it's now resolved,
		 * grab the link-layer address and update the flow route.
		 * If we get EJUSTRETURN, the mbuf (if any) would have
		 * been added to the hold queue.  Any other return values
		 * including 0 means that we need to free it.
		 *
		 * If we're probing, we won't have any mbuf to deal with,
		 * and since we already have the cached llinfo we'll just
		 * return success even if we get EJUSTRETURN.
		 */
		if (!probing) {
			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "fast-resolve %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			}
			if (err == EJUSTRETURN && m != NULL) {
				/* ARP took ownership of the mbuf */
				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
				    "while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				m = NULL;
			}
		} else {
			VERIFY(m == NULL);
			if (err == EJUSTRETURN) {
				err = 0;
			}
		}
		break;
	}

	case AF_INET6: {
		struct llinfo_nd6 *ln = tgt_rt->rt_llinfo;

		/*
		 * Check if the route is down.  RTF_LLINFO is set during
		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
		 * is deleted from the routing table.
		 */
		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("route unavailable for %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			RT_UNLOCK(tgt_rt);
			break;
		}

		/*
		 * If we're probing and IPv6 ND cache entry is STALE,
		 * use it anyway but also mark it for delayed probe
		 * and update the expiry.
		 */
		if (probing) {
			VERIFY(m == NULL);
			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
			if (ln->ln_state == ND6_LLINFO_STALE) {
				ln->ln_asked = 0;
				ND6_CACHE_STATE_TRANSITION(ln,
				    ND6_LLINFO_DELAY);
				ln_setexpire(ln, net_uptime() + nd6_delay);
				RT_UNLOCK(tgt_rt);

				/* kick the ND timer to run the NUD probe */
				lck_mtx_lock(rnh_lock);
				nd6_sched_timeout(NULL, NULL);
				lck_mtx_unlock(rnh_lock);

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "NUD probe scheduled for %s on %s",
				    sk_sa_ntop(tgt_sa, dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			} else {
				RT_UNLOCK(tgt_rt);
			}
			VERIFY(err == 0);
			break;
		}

		/*
		 * If this is a permanent ND entry, we're done.
		 */
		if (ln->ln_expire == 0 &&
		    ln->ln_state == ND6_LLINFO_REACHABLE) {
			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
			    ETHER_ADDR_LEN) {
				err = EHOSTUNREACH;
				SK_ERR("invalid permanent route %s on %s"
				    "ln 0x%llx (err %d)",
				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
				    sizeof(dst_s)), ifp->if_xname,
				    SK_KVA(ln), err);
			} else {
				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
				    "permanent route %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* copy permanent address into the flow route */
				FLOWRT_UPD_ETH_DST(fr,
				    LLADDR(SDL(tgt_rt->rt_gateway)));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
				VERIFY(err == 0);
			}
			RT_UNLOCK(tgt_rt);
			break;
		}

		/* fresh cache entry: begin resolution */
		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
		}

		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
			/*
			 * There is a neighbor cache entry, but no Ethernet
			 * address response yet.  Replace the held mbuf
			 * (if any) with the one we have (if any),
			 * else leave it alone.
			 *
			 * This code conforms to the rate-limiting rule
			 * described in Section 7.2.2 of RFC 4861, because
			 * the timer is set correctly after sending an
			 * NS below.
			 */
			if (m != NULL) {
				if (ln->ln_hold != NULL) {
					m_freem_list(ln->ln_hold);
				}
				ln->ln_hold = m;
				m = NULL;

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "packet queued while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
			VERIFY(ndi != NULL && ndi->initialized);
			ln->ln_asked++;
			/* ndi->retrans is in msec; expiry is in seconds */
			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
			RT_UNLOCK(tgt_rt);

			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s"
			    "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt),
			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
			    ln->ln_state);

			/* XXX Refactor this to use same src ip */
			nd6_ns_output(tgt_rt->rt_ifp, NULL,
			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL);

			lck_mtx_lock(rnh_lock);
			nd6_sched_timeout(NULL, NULL);
			lck_mtx_unlock(rnh_lock);
			err = EJUSTRETURN;
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname);
			/*
			 * The neighbor cache entry has been resolved;
			 * copy the address into the flow route.
			 */
			FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
			os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			RT_UNLOCK(tgt_rt);
			VERIFY(err == 0);
		}
		/*
		 * XXX Need to optimize for the NDP garbage
		 * collection.  It would be even better to unify
		 * BSD/SK NDP management through the completion
		 * of L2/L3 split.
		 */
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	RT_LOCK_ASSERT_NOTHELD(tgt_rt);

done:
	/*
	 * If we still own an mbuf here, either give it back to the kpkt
	 * it came from, or free it; ARP/ND did not take ownership.
	 */
	if (m != NULL) {
		if (reattach_mbuf) {
			pkt->pkt_mbuf = m;
			pkt->pkt_pflags |= PKT_F_MBUF_DATA;
		} else {
			m_freem_list(m);
		}
		m = NULL;
	}

	if (__improbable(err != 0 && err != EJUSTRETURN)) {
		SK_ERR("route to %s on %s can't be resolved (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}

	FR_UNLOCK(fr);

	return err;
}
472
473 static void
fsw_ethernet_frame(struct nx_flowswitch * fsw,struct flow_route * fr,struct __kern_packet * pkt)474 fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
475 struct __kern_packet *pkt)
476 {
477 /* in the event the source MAC address changed, update our copy */
478 if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
479 uint8_t old_shost[ETHER_ADDR_LEN];
480
481 bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
482 fsw_ethernet_ctor(fsw, fr);
483
484 SK_ERR("fr 0x%llx source MAC address updated on %s, "
485 "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
486 SK_KVA(fr), fsw->fsw_ifp,
487 old_shost[0], old_shost[1],
488 old_shost[2], old_shost[3],
489 old_shost[4], old_shost[5],
490 fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
491 fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
492 fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
493 }
494
495 _CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);
496
497 if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
498 pkt->pkt_link_flags |= PKT_LINKF_MCAST;
499 } else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
500 pkt->pkt_link_flags |= PKT_LINKF_BCAST;
501 }
502
503 ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);
504
505 char *pkt_buf;
506 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
507 sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
508 (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));
509
510 pkt->pkt_headroom -= ETHER_HDR_LEN;
511 pkt->pkt_l2_len = ETHER_HDR_LEN;
512
513 if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
514 /* frame and fix up mbuf */
515 struct mbuf *m = pkt->pkt_mbuf;
516 sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
517 (uint64_t *)(void *)(m->m_data - FSW_ETHER_LEN_PADDED));
518 ASSERT((uintptr_t)m->m_data ==
519 (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
520 m->m_data -= ETHER_HDR_LEN;
521 m->m_len += ETHER_HDR_LEN;
522 m_pktlen(m) += ETHER_HDR_LEN;
523 ASSERT(m->m_len == m_pktlen(m));
524 pkt->pkt_length = m_pktlen(m);
525 } else {
526 METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
527 }
528 }
529
530 static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch * fsw,struct __kern_packet * pkt)531 fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
532 {
533 #pragma unused(fsw)
534 const struct ether_header *eh;
535 sa_family_t af = AF_UNSPEC;
536 uint32_t bdlen, bdlim, bdoff;
537 uint8_t *baddr;
538
539 MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
540 baddr += pkt->pkt_headroom;
541 eh = (struct ether_header *)(void *)baddr;
542
543 if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
544 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
545 SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
546 return AF_UNSPEC;
547 }
548
549 if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
550 SK_ERR("ethernet header overrun 1st buflet");
551 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
552 return AF_UNSPEC;
553 }
554
555 if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
556 pkt->pkt_length -= ETHER_CRC_LEN;
557 pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
558 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
559 ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
560 m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
561 pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
562 }
563 }
564 pkt->pkt_l2_len = ETHER_HDR_LEN;
565 if ((eh->ether_dhost[0] & 1) == 0) {
566 /*
567 * When the driver is put into promiscuous mode we may receive
568 * unicast frames that are not intended for our interfaces.
569 * They are marked here as being promiscuous so the caller may
570 * dispose of them after passing the packets to any interface
571 * filters.
572 */
573 if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
574 pkt->pkt_pflags |= PKT_F_PROMISC;
575 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
576 return AF_UNSPEC;
577 }
578 }
579 uint16_t ether_type = ntohs(eh->ether_type);
580 switch (ether_type) {
581 case ETHERTYPE_IP:
582 af = AF_INET;
583 break;
584 case ETHERTYPE_IPV6:
585 af = AF_INET6;
586 break;
587 default:
588 STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
589 break;
590 }
591
592 return af;
593 }
594