1 /*
2 * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <netinet/in_arp.h>
33 #include <netinet/ip6.h>
34 #include <netinet6/in6_var.h>
35 #include <netinet6/nd6.h>
36 #include <net/ethernet.h>
37 #include <net/route.h>
38 #include <sys/eventhandler.h>
39 #include <net/sockaddr_utils.h>
40
41 #define FSW_ETHER_LEN_PADDED 16
42 #define FSW_ETHER_PADDING (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
43 #define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED
44
45 static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
46 static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
47 struct __kern_packet *);
48 static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
49 struct __kern_packet *);
50 static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
51 struct __kern_packet *);
52
53 extern struct rtstat_64 rtstat;
54
55 int
fsw_ethernet_setup(struct nx_flowswitch * fsw,struct ifnet * ifp)56 fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
57 {
58 struct ifaddr *lladdr = ifp->if_lladdr;
59
60 if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
61 SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
62 return ENOTSUP;
63 }
64
65 ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
66 fsw->fsw_ctor = fsw_ethernet_ctor;
67 fsw->fsw_resolve = fsw_ethernet_resolve;
68 fsw->fsw_frame = fsw_ethernet_frame;
69 fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
70 fsw->fsw_demux = fsw_ethernet_demux;
71
72 return 0;
73 }
74
75 static void
fsw_ethernet_ctor(struct nx_flowswitch * fsw,struct flow_route * fr)76 fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
77 {
78 ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);
79
80 fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
81 bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
82 fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
83 htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));
84
85 /* const override */
86 _CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
87 _CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
88 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
89 *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;
90
91 SK_DF(SK_VERB_FLOW_ROUTE,
92 "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
93 SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
94 fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
95 fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
96 fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
97 }
98
/*
 * Resolve the link-layer (Ethernet) destination address for a flow route.
 *
 * Returns 0 when fr carries usable link-layer info (FLOWRTF_RESOLVED set),
 * EJUSTRETURN when the packet was converted to an mbuf and queued pending
 * ARP/ND completion, or another errno on failure.  The caller retains
 * ownership of @pkt; if its attached mbuf was borrowed here it is either
 * handed to the resolver (EJUSTRETURN) or reattached before returning.
 */
static int
fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	struct sockaddr *tgt_sa = NULL;
	struct mbuf *m = NULL;
	boolean_t reattach_mbuf = FALSE;
	boolean_t probing;
	int err = 0;
	/*
	 * Saved mbuf-related pkt flags; only valid when reattach_mbuf is
	 * TRUE, and restored if the mbuf is given back to pkt at "done".
	 */
	uint64_t pkt_mflags_restore;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_sa = SA(&fr->fr_faddr);
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_sa = SA(&fr->fr_gaddr);
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary: no cached
	 * route, the cached route went down, or a reconfigure was asked for.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flow_route_configure may have changed them */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_sa = SA(&fr->fr_faddr);
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_sa = SA(&fr->fr_gaddr);
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* a flow route must be either on-link or via a gateway */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_sa != NULL);
	ASSERT(tgt_rt != NULL);

	/*
	 * Attempt to convert kpkt to mbuf before acquiring the
	 * rt lock so that the lock won't be held if we need to do
	 * a blocking mbuf allocation.
	 */
	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
		/*
		 * We need to resolve; if caller passes in a kpkt,
		 * convert the kpkt within to mbuf.  Caller is then
		 * responsible for freeing kpkt.  In future, we could
		 * optimize this by having the ARP/ND lookup routines
		 * understand kpkt and perform the conversion only
		 * when it is needed.
		 */
		if (__probable(pkt != NULL)) {
			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
				/* borrow the already-attached mbuf */
				reattach_mbuf = TRUE;
				m = pkt->pkt_mbuf;
				pkt_mflags_restore = (pkt->pkt_pflags & PKT_F_MBUF_MASK);
				KPKT_CLEAR_MBUF_DATA(pkt);
			} else {
				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
			}
			if (m == NULL) {
				/* not a fatal error; move on */
				SK_ERR("failed to allocate mbuf while "
				    "resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
		} else {
			m = NULL;
		}
	}

	RT_LOCK(tgt_rt);

	/* the target route must be a direct host route with an Ethernet lladdr */
	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
		rtstat.rts_badrtgwroute++;
		err = ENETUNREACH;
		RT_UNLOCK(tgt_rt);
		SK_ERR("bad gateway route %s on %s (err %d)",
		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		goto done;
	}

	/*
	 * If already resolved, grab the link-layer address and mark the
	 * flow route accordingly.  Given that we will use the cached
	 * link-layer info, there's no need to convert and enqueue the
	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
	 */
	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
		/* we only borrowed/converted an mbuf when llinfo was missing */
		VERIFY(m == NULL);
		/* XXX Remove explicit __bidi_indexable once rdar://119193012 lands */
		struct sockaddr_dl *__bidi_indexable sdl =
		    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
		FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
		os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
		/* if we're not probing, then we're done */
		if (!(probing = (fr->fr_want_probe != 0))) {
			VERIFY(err == 0);
			RT_UNLOCK(tgt_rt);
			goto done;
		}
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		probing = FALSE;
		os_atomic_andnot(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
	    sizeof(dst_s)), ifp->if_xname);

	/*
	 * Trigger ARP/NDP resolution or probing.
	 */
	switch (tgt_sa->sa_family) {
	case AF_INET: {
		struct sockaddr_dl sdl;

		RT_UNLOCK(tgt_rt);
		/*
		 * Note we pass NULL as "hint" parameter, as tgt_sa
		 * is already referring to the target address.
		 */
		SOCKADDR_ZERO(&sdl, sizeof(sdl));
		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
		    NULL, m);

		/*
		 * If we're resolving (not probing), and it's now resolved,
		 * grab the link-layer address and update the flow route.
		 * If we get EJUSTRETURN, the mbuf (if any) would have
		 * been added to the hold queue.  Any other return values
		 * including 0 means that we need to free it.
		 *
		 * If we're probing, we won't have any mbuf to deal with,
		 * and since we already have the cached llinfo we'll just
		 * return success even if we get EJUSTRETURN.
		 */
		if (!probing) {
			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "fast-resolve %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			}
			if (err == EJUSTRETURN && m != NULL) {
				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
				    "while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* ARP layer owns the mbuf now */
				m = NULL;
			}
		} else {
			VERIFY(m == NULL);
			if (err == EJUSTRETURN) {
				err = 0;
			}
		}
		break;
	}

	case AF_INET6: {
		struct llinfo_nd6 *__single ln = tgt_rt->rt_llinfo;

		/*
		 * Check if the route is down.  RTF_LLINFO is set during
		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
		 * is deleted from the routing table.
		 */
		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("route unavailable for %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			RT_UNLOCK(tgt_rt);
			break;
		}

		/*
		 * If we're probing and IPv6 ND cache entry is STALE,
		 * use it anyway but also mark it for delayed probe
		 * and update the expiry.
		 */
		if (probing) {
			VERIFY(m == NULL);
			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
			if (ln->ln_state == ND6_LLINFO_STALE) {
				ln->ln_asked = 0;
				ND6_CACHE_STATE_TRANSITION(ln,
				    ND6_LLINFO_DELAY);
				ln_setexpire(ln, net_uptime() + nd6_delay);
				RT_UNLOCK(tgt_rt);

				/* nd6 timeout scheduling requires rnh_lock */
				lck_mtx_lock(rnh_lock);
				nd6_sched_timeout(NULL, NULL);
				lck_mtx_unlock(rnh_lock);

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "NUD probe scheduled for %s on %s",
				    sk_sa_ntop(tgt_sa, dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			} else {
				RT_UNLOCK(tgt_rt);
			}
			VERIFY(err == 0);
			break;
		}

		/*
		 * If this is a permanent ND entry, we're done.
		 */
		if (ln->ln_expire == 0 &&
		    ln->ln_state == ND6_LLINFO_REACHABLE) {
			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
			    ETHER_ADDR_LEN) {
				err = EHOSTUNREACH;
				SK_ERR("invalid permanent route %s on %s"
				    "ln 0x%llx (err %d)",
				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
				    sizeof(dst_s)), ifp->if_xname,
				    SK_KVA(ln), err);
			} else {
				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
				    "permanent route %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* copy permanent address into the flow route */
				/*
				 * XXX Remove explicit __bidi_indexable once
				 * rdar://119193012 lands
				 */
				struct sockaddr_dl *__bidi_indexable sdl =
				    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
				VERIFY(err == 0);
			}
			RT_UNLOCK(tgt_rt);
			break;
		}

		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
		}

		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
			/*
			 * There is a neighbor cache entry, but no Ethernet
			 * address response yet.  Replace the held mbuf
			 * (if any) with the one we have (if any),
			 * else leave it alone.
			 *
			 * This code conforms to the rate-limiting rule
			 * described in Section 7.2.2 of RFC 4861, because
			 * the timer is set correctly after sending an
			 * NS below.
			 */
			if (m != NULL) {
				if (ln->ln_hold != NULL) {
					m_freem_list(ln->ln_hold);
				}
				ln->ln_hold = m;
				/* ND layer owns the mbuf now */
				m = NULL;

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "packet queued while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
			VERIFY(ndi != NULL && ndi->initialized);
			ln->ln_asked++;
			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
			RT_UNLOCK(tgt_rt);

			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s"
			    "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt),
			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
			    ln->ln_state);

			/* XXX Refactor this to use same src ip */
			nd6_ns_output(tgt_rt->rt_ifp, NULL,
			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL, 0);

			lck_mtx_lock(rnh_lock);
			nd6_sched_timeout(NULL, NULL);
			lck_mtx_unlock(rnh_lock);
			/* tell caller the packet is queued pending ND */
			err = EJUSTRETURN;
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname);
			/*
			 * The neighbor cache entry has been resolved;
			 * copy the address into the flow route.
			 */
			/*
			 * XXX Remove explicit __bidi_indexable once
			 * rdar://119193012 lands
			 */
			struct sockaddr_dl *__bidi_indexable sdl =
			    (struct sockaddr_dl *__bidi_indexable) SDL(tgt_rt->rt_gateway);
			FLOWRT_UPD_ETH_DST(fr, LLADDR(sdl));
			os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			RT_UNLOCK(tgt_rt);
			VERIFY(err == 0);
		}
		/*
		 * XXX Need to optimize for the NDP garbage
		 * collection.  It would be even better to unify
		 * BSD/SK NDP management through the completion
		 * of L2/L3 split.
		 */
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	RT_LOCK_ASSERT_NOTHELD(tgt_rt);

done:
	/*
	 * If the mbuf was not consumed by ARP/ND, either give it back
	 * to the caller's packet (if we borrowed it) or free it.
	 */
	if (m != NULL) {
		if (reattach_mbuf) {
			pkt->pkt_mbuf = m;
			pkt->pkt_pflags |= pkt_mflags_restore;
		} else {
			m_freem_list(m);
		}
		m = NULL;
	}

	if (__improbable(err != 0 && err != EJUSTRETURN)) {
		SK_ERR("route to %s on %s can't be resolved (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}

	FR_UNLOCK(fr);

	return err;
}
489
490 static void
fsw_ethernet_frame(struct nx_flowswitch * fsw,struct flow_route * fr,struct __kern_packet * pkt)491 fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
492 struct __kern_packet *pkt)
493 {
494 /* in the event the source MAC address changed, update our copy */
495 if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
496 uint8_t old_shost[ETHER_ADDR_LEN];
497
498 bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
499 fsw_ethernet_ctor(fsw, fr);
500
501 SK_ERR("fr 0x%llx source MAC address updated on %s, "
502 "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
503 SK_KVA(fr), fsw->fsw_ifp,
504 old_shost[0], old_shost[1],
505 old_shost[2], old_shost[3],
506 old_shost[4], old_shost[5],
507 fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
508 fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
509 fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
510 }
511
512 _CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);
513
514 if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
515 pkt->pkt_link_flags |= PKT_LINKF_MCAST;
516 } else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
517 pkt->pkt_link_flags |= PKT_LINKF_BCAST;
518 }
519
520 ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);
521
522 char *pkt_buf;
523 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
524 sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
525 (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));
526
527 pkt->pkt_headroom -= ETHER_HDR_LEN;
528 pkt->pkt_l2_len = ETHER_HDR_LEN;
529
530 if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
531 /* frame and fix up mbuf */
532 struct mbuf *m = pkt->pkt_mbuf;
533 void *buf = m_mtod_current(m) - FSW_ETHER_LEN_PADDED;
534
535 sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded, buf);
536 ASSERT((uintptr_t)m->m_data ==
537 (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
538 m->m_data -= ETHER_HDR_LEN;
539 m->m_len += ETHER_HDR_LEN;
540 m_pktlen(m) += ETHER_HDR_LEN;
541 ASSERT(m->m_len == m_pktlen(m));
542 pkt->pkt_length = m_pktlen(m);
543 } else {
544 METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
545 }
546 }
547
/*
 * Demultiplex an inbound Ethernet frame: validate that the header fits,
 * strip a trailing FCS if present, flag promiscuously-received unicast
 * frames, and return the network-layer address family (AF_INET/AF_INET6)
 * or AF_UNSPEC when the frame cannot be classified.
 */
static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
{
/* NOTE(review): fsw is referenced below (stats, IF_LLADDR); pragma looks stale */
#pragma unused(fsw)
	const struct ether_header *eh;
	sa_family_t af = AF_UNSPEC;
	uint32_t bdlen, bdlim, bdoff;
	uint8_t *baddr;

	MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
	baddr += pkt->pkt_headroom;
	eh = (struct ether_header *)(void *)baddr;

	/* packet too short to contain an Ethernet header */
	if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
		SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
		return AF_UNSPEC;
	}

	/* header must be fully contained within the first buflet */
	if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
		SK_ERR("ethernet header overrun 1st buflet");
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
		return AF_UNSPEC;
	}

	/* strip the trailing frame check sequence, if the driver left it on */
	if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
		pkt->pkt_length -= ETHER_CRC_LEN;
		pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
			ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
			m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
			pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
		}
	}
	pkt->pkt_l2_len = ETHER_HDR_LEN;
	/* low bit of the first dest byte clear => unicast destination */
	if ((eh->ether_dhost[0] & 1) == 0) {
		/*
		 * When the driver is put into promiscuous mode we may receive
		 * unicast frames that are not intended for our interfaces.
		 * They are marked here as being promiscuous so the caller may
		 * dispose of them after passing the packets to any interface
		 * filters.
		 */
		if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
			pkt->pkt_pflags |= PKT_F_PROMISC;
			STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
			return AF_UNSPEC;
		}
	}
	/* classify by EtherType; anything other than IPv4/IPv6 is unspecified */
	uint16_t ether_type = ntohs(eh->ether_type);
	switch (ether_type) {
	case ETHERTYPE_IP:
		af = AF_INET;
		break;
	case ETHERTYPE_IPV6:
		af = AF_INET6;
		break;
	default:
		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
		break;
	}

	return af;
}
612