xref: /xnu-8792.61.2/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <netinet/in_arp.h>
33 #include <netinet/ip6.h>
34 #include <netinet6/in6_var.h>
35 #include <netinet6/nd6.h>
36 #include <net/ethernet.h>
37 #include <net/route.h>
38 #include <sys/eventhandler.h>
39 
40 #define FSW_ETHER_LEN_PADDED     16
41 #define FSW_ETHER_PADDING        (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
42 #define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED
43 
44 static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
45 static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
46     struct __kern_packet *);
47 static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
48     struct __kern_packet *);
49 static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
50     struct __kern_packet *);
51 
52 extern struct rtstat rtstat;
53 
54 int
fsw_ethernet_setup(struct nx_flowswitch * fsw,struct ifnet * ifp)55 fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
56 {
57 	struct ifaddr *lladdr = ifp->if_lladdr;
58 
59 	if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
60 	    SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
61 		return ENOTSUP;
62 	}
63 
64 	ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
65 	fsw->fsw_ctor = fsw_ethernet_ctor;
66 	fsw->fsw_resolve = fsw_ethernet_resolve;
67 	fsw->fsw_frame = fsw_ethernet_frame;
68 	fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
69 	fsw->fsw_demux = fsw_ethernet_demux;
70 
71 	return 0;
72 }
73 
74 static void
fsw_ethernet_ctor(struct nx_flowswitch * fsw,struct flow_route * fr)75 fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
76 {
77 	ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);
78 
79 	fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
80 	bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
81 	fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
82 	    htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));
83 
84 	/* const override */
85 	_CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
86 	_CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
87 	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
88 	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;
89 
90 	SK_DF(SK_VERB_FLOW_ROUTE,
91 	    "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
92 	    SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
93 	    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
94 	    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
95 	    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
96 }
97 
/*
 * Resolve the link-layer (Ethernet) destination of a flow route via
 * ARP (IPv4) or ND (IPv6).  Returns 0 when fr's cached Ethernet header
 * is usable, EJUSTRETURN when the packet was queued pending resolution,
 * or another errno on failure.  If pkt carried an attached mbuf that we
 * detached for the lookup, it is reattached before returning; otherwise
 * the caller remains responsible for freeing pkt.
 */
98 static int
fsw_ethernet_resolve(struct nx_flowswitch * fsw,struct flow_route * fr,struct __kern_packet * pkt)99 fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
100     struct __kern_packet *pkt)
101 {
102 #if SK_LOG
103 	char dst_s[MAX_IPv6_STR_LEN];
104 #endif /* SK_LOG */
105 	struct ifnet *ifp = fsw->fsw_ifp;
106 	struct rtentry *tgt_rt = NULL;
107 	struct sockaddr *tgt_sa = NULL;
108 	struct mbuf *m = NULL;
109 	boolean_t reattach_mbuf = FALSE;
110 	boolean_t probing;
111 	int err = 0;
112 
113 	ASSERT(fr != NULL);
114 	ASSERT(ifp != NULL);
115 
116 	FR_LOCK(fr);
117 	/*
118 	 * If the destination is on-link, we use the final destination
119 	 * address as target.  If it's off-link, we use the gateway
120 	 * address instead.  Point tgt_rt to the destination or
121 	 * gateway route accordingly.
122 	 */
123 	if (fr->fr_flags & FLOWRTF_ONLINK) {
124 		tgt_sa = SA(&fr->fr_faddr);
125 		tgt_rt = fr->fr_rt_dst;
126 	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
127 		tgt_sa = SA(&fr->fr_gaddr);
128 		tgt_rt = fr->fr_rt_gw;
129 	}
130 
131 	/*
132 	 * Perform another routing table lookup if necessary.
133 	 */
134 	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
135 	    fr->fr_want_configure) {
136 		if (fr->fr_want_configure == 0) {
137 			atomic_add_32(&fr->fr_want_configure, 1);
138 		}
139 		err = flow_route_configure(fr, ifp, NULL);
140 		if (err != 0) {
141 			SK_ERR("failed to configure route to %s on %s (err %d)",
142 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
143 			    sizeof(dst_s)), ifp->if_xname, err);
144 			goto done;
145 		}
146 
147 		/* refresh pointers */
148 		if (fr->fr_flags & FLOWRTF_ONLINK) {
149 			tgt_sa = SA(&fr->fr_faddr);
150 			tgt_rt = fr->fr_rt_dst;
151 		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
152 			tgt_sa = SA(&fr->fr_gaddr);
153 			tgt_rt = fr->fr_rt_gw;
154 		}
155 	}
156 
157 	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
158 		err = EHOSTUNREACH;
159 		SK_ERR("invalid route for %s on %s (err %d)",
160 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
161 		    sizeof(dst_s)), ifp->if_xname, err);
162 		goto done;
163 	}
164 
165 	ASSERT(tgt_sa != NULL);
166 	ASSERT(tgt_rt != NULL);
167 
168 	/*
169 	 * Attempt to convert kpkt to mbuf before acquiring the
170 	 * rt lock so that the lock won't be held if we need to do
171 	 * a blocking mbuf allocation.
172 	 */
173 	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
174 		/*
175 		 * We need to resolve; if caller passes in a kpkt,
176 		 * convert the kpkt within to mbuf.  Caller is then
177 		 * responsible for freeing kpkt.  In future, we could
178 		 * optimize this by having the ARP/ND lookup routines
179 		 * understand kpkt and perform the conversion only
180 		 * when it is needed.
181 		 */
182 		if (__probable(pkt != NULL)) {
183 			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
184 				reattach_mbuf = TRUE;
185 				m = pkt->pkt_mbuf;
186 				KPKT_CLEAR_MBUF_DATA(pkt);
187 			} else {
188 				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
189 			}
190 			if (m == NULL) {
191 				/* not a fatal error; move on */
192 				SK_ERR("failed to allocate mbuf while "
193 				    "resolving %s on %s",
194 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
195 				    sizeof(dst_s)), ifp->if_xname);
196 			}
197 		} else {
198 			m = NULL;
199 		}
200 	}
201 
202 	RT_LOCK(tgt_rt);
203 
204 	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
205 	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
206 	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
207 		rtstat.rts_badrtgwroute++;
208 		err = ENETUNREACH;
209 		RT_UNLOCK(tgt_rt);
210 		SK_ERR("bad gateway route %s on %s (err %d)",
211 		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
212 		    ifp->if_xname, err);
213 		goto done;
214 	}
215 
216 	/*
217 	 * If already resolved, grab the link-layer address and mark the
218 	 * flow route accordingly.  Given that we will use the cached
219 	 * link-layer info, there's no need to convert and enqueue the
220 	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
221 	 */
222 	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
223 	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
224 		VERIFY(m == NULL);
225 		FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
226 		atomic_bitset_32(&fr->fr_flags,
227 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO));
228 		/* if we're not probing, then we're done */
229 		if (!(probing = (fr->fr_want_probe != 0))) {
230 			VERIFY(err == 0);
231 			RT_UNLOCK(tgt_rt);
232 			goto done;
233 		}
234 		atomic_set_32(&fr->fr_want_probe, 0);
235 	} else {
236 		probing = FALSE;
237 		atomic_bitclear_32(&fr->fr_flags,
238 		    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO));
239 	}
240 
241 	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
242 	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
243 	    sizeof(dst_s)), ifp->if_xname);
244 
245 	/*
246 	 * Trigger ARP/NDP resolution or probing.
247 	 */
248 	switch (tgt_sa->sa_family) {
249 	case AF_INET: {
250 		struct sockaddr_dl sdl;
251 
252 		RT_UNLOCK(tgt_rt);
253 		/*
254 		 * Note we pass NULL as "hint" parameter, as tgt_sa
255 		 * is already referring to the target address.
256 		 */
257 		bzero(&sdl, sizeof(sdl));
258 		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
259 		    NULL, m);
260 
261 		/*
262 		 * If we're resolving (not probing), and it's now resolved,
263 		 * grab the link-layer address and update the flow route.
264 		 * If we get EJUSTRETURN, the mbuf (if any) would have
265 		 * been added to the hold queue.  Any other return values
266 		 * including 0 means that we need to free it.
267 		 *
268 		 * If we're probing, we won't have any mbuf to deal with,
269 		 * and since we already have the cached llinfo we'll just
270 		 * return success even if we get EJUSTRETURN.
271 		 */
272 		if (!probing) {
273 			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
274 				SK_DF(SK_VERB_FLOW_ROUTE,
275 				    "fast-resolve %s on %s",
276 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
277 				    sizeof(dst_s)), ifp->if_xname);
278 				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
279 				atomic_bitset_32(&fr->fr_flags,
280 				    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO));
281 			}
282 			if (err == EJUSTRETURN && m != NULL) {
283 				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
284 				    "while resolving %s on %s",
285 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
286 				    sizeof(dst_s)), ifp->if_xname);
287 				m = NULL;
288 			}
289 		} else {
290 			VERIFY(m == NULL);
291 			if (err == EJUSTRETURN) {
292 				err = 0;
293 			}
294 		}
295 		break;
296 	}
297 
298 	case AF_INET6: {
299 		struct llinfo_nd6 *ln = tgt_rt->rt_llinfo;
300 
301 		/*
302 		 * Check if the route is down.  RTF_LLINFO is set during
303 		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
304 		 * is deleted from the routing table.
305 		 */
306 		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
307 		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
308 			err = EHOSTUNREACH;
309 			SK_ERR("route unavailable for %s on %s (err %d)",
310 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
311 			    sizeof(dst_s)), ifp->if_xname, err);
312 			RT_UNLOCK(tgt_rt);
313 			break;
314 		}
315 
316 		/*
317 		 * If we're probing and IPv6 ND cache entry is STALE,
318 		 * use it anyway but also mark it for delayed probe
319 		 * and update the expiry.
320 		 */
321 		if (probing) {
322 			VERIFY(m == NULL);
323 			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
324 			if (ln->ln_state == ND6_LLINFO_STALE) {
325 				ln->ln_asked = 0;
326 				ND6_CACHE_STATE_TRANSITION(ln,
327 				    ND6_LLINFO_DELAY);
328 				ln_setexpire(ln, net_uptime() + nd6_delay);
329 				RT_UNLOCK(tgt_rt);
330 
331 				lck_mtx_lock(rnh_lock);
332 				nd6_sched_timeout(NULL, NULL);
333 				lck_mtx_unlock(rnh_lock);
334 
335 				SK_DF(SK_VERB_FLOW_ROUTE,
336 				    "NUD probe scheduled for %s on %s",
337 				    sk_sa_ntop(tgt_sa, dst_s,
338 				    sizeof(dst_s)), ifp->if_xname);
339 			} else {
340 				RT_UNLOCK(tgt_rt);
341 			}
342 			VERIFY(err == 0);
343 			break;
344 		}
345 
346 		/*
347 		 * If this is a permanent ND entry, we're done.
348 		 */
349 		if (ln->ln_expire == 0 &&
350 		    ln->ln_state == ND6_LLINFO_REACHABLE) {
351 			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
352 			    ETHER_ADDR_LEN) {
353 				err = EHOSTUNREACH;
354 				SK_ERR("invalid permanent route %s on %s"
355 				    "ln 0x%llx (err %d)",
356 				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
357 				    sizeof(dst_s)), ifp->if_xname,
358 				    SK_KVA(ln), err);
359 			} else {
360 				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
361 				    "permanent route %s on %s",
362 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
363 				    sizeof(dst_s)), ifp->if_xname);
364 				/* copy permanent address into the flow route */
365 				FLOWRT_UPD_ETH_DST(fr,
366 				    LLADDR(SDL(tgt_rt->rt_gateway)));
367 				atomic_bitset_32(&fr->fr_flags,
368 				    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO));
369 				VERIFY(err == 0);
370 			}
371 			RT_UNLOCK(tgt_rt);
372 			break;
373 		}
374 
375 		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
376 			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
377 		}
378 
379 		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
380 		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
381 			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
382 			/*
383 			 * There is a neighbor cache entry, but no Ethernet
384 			 * address response yet.  Replace the held mbuf
385 			 * (if any) with the one we have (if any),
386 			 * else leave it alone.
387 			 *
388 			 * This code conforms to the rate-limiting rule
389 			 * described in Section 7.2.2 of RFC 4861, because
390 			 * the timer is set correctly after sending an
391 			 * NS below.
392 			 */
393 			if (m != NULL) {
394 				if (ln->ln_hold != NULL) {
395 					m_freem_list(ln->ln_hold);
396 				}
397 				ln->ln_hold = m;
398 				m = NULL;
399 
400 				SK_DF(SK_VERB_FLOW_ROUTE,
401 				    "packet queued while resolving %s on %s",
402 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
403 				    sizeof(dst_s)), ifp->if_xname);
404 			}
405 			VERIFY(ndi != NULL && ndi->initialized);
406 			ln->ln_asked++;
407 			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
408 			RT_UNLOCK(tgt_rt);
409 
410 			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s"
411 			    "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt),
412 			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
413 			    ln->ln_state);
414 
415 			/* XXX Refactor this to use same src ip */
416 			nd6_ns_output(tgt_rt->rt_ifp, NULL,
417 			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL);
418 
419 			lck_mtx_lock(rnh_lock);
420 			nd6_sched_timeout(NULL, NULL);
421 			lck_mtx_unlock(rnh_lock);
422 			err = EJUSTRETURN;
423 		} else {
424 			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
425 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
426 			    sizeof(dst_s)), ifp->if_xname);
427 			/*
428 			 * The neighbor cache entry has been resolved;
429 			 * copy the address into the flow route.
430 			 */
431 			FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
432 			atomic_bitset_32(&fr->fr_flags,
433 			    (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO));
434 			RT_UNLOCK(tgt_rt);
435 			VERIFY(err == 0);
436 		}
437 		/*
438 		 * XXX Need to optimize for the NDP garbage
439 		 * collection.  It would be even better to unify
440 		 * BSD/SK NDP management through the completion
441 		 * of L2/L3 split.
442 		 */
443 		break;
444 	}
445 
446 	default:
447 		VERIFY(0);
448 		/* NOTREACHED */
449 		__builtin_unreachable();
450 	}
451 	RT_LOCK_ASSERT_NOTHELD(tgt_rt);
452 
453 done:
	/* reattach the mbuf we detached earlier, or free a converted copy */
454 	if (m != NULL) {
455 		if (reattach_mbuf) {
456 			pkt->pkt_mbuf = m;
457 			pkt->pkt_pflags |= PKT_F_MBUF_DATA;
458 		} else {
459 			m_freem_list(m);
460 		}
461 		m = NULL;
462 	}
463 
464 	if (__improbable(err != 0 && err != EJUSTRETURN)) {
465 		SK_ERR("route to %s on %s can't be resolved (err %d)",
466 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
467 		    ifp->if_xname, err);
468 		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
469 		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
470 		flow_route_cleanup(fr);
471 	}
472 
473 	FR_UNLOCK(fr);
474 
475 	return err;
476 }
477 
478 static void
fsw_ethernet_frame(struct nx_flowswitch * fsw,struct flow_route * fr,struct __kern_packet * pkt)479 fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
480     struct __kern_packet *pkt)
481 {
482 	/* in the event the source MAC address changed, update our copy */
483 	if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
484 		uint8_t old_shost[ETHER_ADDR_LEN];
485 
486 		bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
487 		fsw_ethernet_ctor(fsw, fr);
488 
489 		SK_ERR("fr 0x%llx source MAC address updated on %s, "
490 		    "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
491 		    SK_KVA(fr), fsw->fsw_ifp,
492 		    old_shost[0], old_shost[1],
493 		    old_shost[2], old_shost[3],
494 		    old_shost[4], old_shost[5],
495 		    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
496 		    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
497 		    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
498 	}
499 
500 	_CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);
501 
502 	if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
503 		pkt->pkt_link_flags |= PKT_LINKF_MCAST;
504 	} else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
505 		pkt->pkt_link_flags |= PKT_LINKF_BCAST;
506 	}
507 
508 	ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);
509 
510 	char *pkt_buf;
511 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
512 	sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
513 	    (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));
514 
515 	pkt->pkt_headroom -= ETHER_HDR_LEN;
516 	pkt->pkt_l2_len = ETHER_HDR_LEN;
517 
518 	if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
519 		/* frame and fix up mbuf */
520 		struct mbuf *m = pkt->pkt_mbuf;
521 		sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
522 		    (uint64_t *)(void *)(m->m_data - FSW_ETHER_LEN_PADDED));
523 		ASSERT((uintptr_t)m->m_data ==
524 		    (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
525 		m->m_data -= ETHER_HDR_LEN;
526 		m->m_len += ETHER_HDR_LEN;
527 		m_pktlen(m) += ETHER_HDR_LEN;
528 		ASSERT(m->m_len == m_pktlen(m));
529 		pkt->pkt_length = m_pktlen(m);
530 	} else {
531 		METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
532 	}
533 }
534 
535 static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch * fsw,struct __kern_packet * pkt)536 fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
537 {
538 #pragma unused(fsw)
539 	const struct ether_header *eh;
540 	sa_family_t af = AF_UNSPEC;
541 	uint16_t bdlim, bdlen, bdoff;
542 	uint8_t *baddr;
543 
544 	MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
545 	baddr += pkt->pkt_headroom;
546 	eh = (struct ether_header *)(void *)baddr;
547 
548 	if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
549 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
550 		SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
551 		return AF_UNSPEC;
552 	}
553 
554 	if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
555 		SK_ERR("ethernet header overrun 1st buflet");
556 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
557 		return AF_UNSPEC;
558 	}
559 
560 	if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
561 		pkt->pkt_length -= ETHER_CRC_LEN;
562 		pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
563 		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
564 			ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
565 			m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
566 			pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
567 		}
568 	}
569 	pkt->pkt_l2_len = ETHER_HDR_LEN;
570 	if ((eh->ether_dhost[0] & 1) == 0) {
571 		/*
572 		 * When the driver is put into promiscuous mode we may receive
573 		 * unicast frames that are not intended for our interfaces.
574 		 * They are marked here as being promiscuous so the caller may
575 		 * dispose of them after passing the packets to any interface
576 		 * filters.
577 		 */
578 		if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
579 			pkt->pkt_pflags |= PKT_F_PROMISC;
580 			STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
581 			return AF_UNSPEC;
582 		}
583 	}
584 	uint16_t ether_type = ntohs(eh->ether_type);
585 	switch (ether_type) {
586 	case ETHERTYPE_IP:
587 		af = AF_INET;
588 		break;
589 	case ETHERTYPE_IPV6:
590 		af = AF_INET6;
591 		break;
592 	default:
593 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
594 		break;
595 	}
596 
597 	return af;
598 }
599