xref: /xnu-10063.121.3/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <netinet/in_arp.h>
33 #include <netinet/ip6.h>
34 #include <netinet6/in6_var.h>
35 #include <netinet6/nd6.h>
36 #include <net/ethernet.h>
37 #include <net/route.h>
38 #include <sys/eventhandler.h>
39 #include <net/sockaddr_utils.h>
40 
41 #define FSW_ETHER_LEN_PADDED     16
42 #define FSW_ETHER_PADDING        (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN)
43 #define FSW_ETHER_FRAME_HEADROOM FSW_ETHER_LEN_PADDED
44 
45 static void fsw_ethernet_ctor(struct nx_flowswitch *, struct flow_route *);
46 static int fsw_ethernet_resolve(struct nx_flowswitch *, struct flow_route *,
47     struct __kern_packet *);
48 static void fsw_ethernet_frame(struct nx_flowswitch *, struct flow_route *,
49     struct __kern_packet *);
50 static sa_family_t fsw_ethernet_demux(struct nx_flowswitch *,
51     struct __kern_packet *);
52 
53 extern struct rtstat rtstat;
54 
55 int
fsw_ethernet_setup(struct nx_flowswitch * fsw,struct ifnet * ifp)56 fsw_ethernet_setup(struct nx_flowswitch *fsw, struct ifnet *ifp)
57 {
58 	struct ifaddr *lladdr = ifp->if_lladdr;
59 
60 	if (SDL(lladdr->ifa_addr)->sdl_alen != ETHER_ADDR_LEN ||
61 	    SDL(lladdr->ifa_addr)->sdl_type != IFT_ETHER) {
62 		return ENOTSUP;
63 	}
64 
65 	ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost, ETHER_ADDR_LEN);
66 	fsw->fsw_ctor = fsw_ethernet_ctor;
67 	fsw->fsw_resolve = fsw_ethernet_resolve;
68 	fsw->fsw_frame = fsw_ethernet_frame;
69 	fsw->fsw_frame_headroom = FSW_ETHER_FRAME_HEADROOM;
70 	fsw->fsw_demux = fsw_ethernet_demux;
71 
72 	return 0;
73 }
74 
75 static void
fsw_ethernet_ctor(struct nx_flowswitch * fsw,struct flow_route * fr)76 fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr)
77 {
78 	ASSERT(fr->fr_af == AF_INET || fr->fr_af == AF_INET6);
79 
80 	fr->fr_llhdr.flh_gencnt = fsw->fsw_src_lla_gencnt;
81 	bcopy(fsw->fsw_ether_shost, fr->fr_eth.ether_shost, ETHER_ADDR_LEN);
82 	fr->fr_eth.ether_type = ((fr->fr_af == AF_INET) ?
83 	    htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6));
84 
85 	/* const override */
86 	_CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t));
87 	_CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t));
88 	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2;
89 	*(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN;
90 
91 	SK_DF(SK_VERB_FLOW_ROUTE,
92 	    "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x",
93 	    SK_KVA(fr), ntohs(fr->fr_eth.ether_type),
94 	    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
95 	    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
96 	    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
97 }
98 
/*
 * Resolve (or probe) the Ethernet destination address for flow route
 * "fr" on the flowswitch's interface, consulting the ARP (IPv4) or
 * ND6 (IPv6) caches and triggering resolution when needed.
 *
 * Returns 0 when the flow route holds usable link-layer info,
 * EJUSTRETURN when resolution is in progress (any mbuf derived from
 * "pkt" has been handed to the ARP/ND hold queue), or another errno
 * on failure.  Caller remains responsible for freeing "pkt" itself.
 *
 * Locking: fr is held (FR_LOCK) for the duration; the target route
 * lock is taken and must be dropped before calling out to ARP/ND
 * routines or acquiring rnh_lock.
 */
static int
fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	struct sockaddr *tgt_sa = NULL;
	struct mbuf *m = NULL;
	boolean_t reattach_mbuf = FALSE;
	boolean_t probing;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_sa = SA(&fr->fr_faddr);
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_sa = SA(&fr->fr_gaddr);
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flow_route_configure may have
		 * replaced fr_rt_dst/fr_rt_gw and changed fr_flags */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_sa = SA(&fr->fr_faddr);
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_sa = SA(&fr->fr_gaddr);
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor via a gateway: nothing we can resolve */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_sa != NULL);
	ASSERT(tgt_rt != NULL);

	/*
	 * Attempt to convert kpkt to mbuf before acquiring the
	 * rt lock so that the lock won't be held if we need to do
	 * blocked a mbuf allocation.
	 */
	if (!(fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
		/*
		 * We need to resolve; if caller passes in a kpkt,
		 * convert the kpkt within to mbuf.  Caller is then
		 * reponsible for freeing kpkt.  In future, we could
		 * optimize this by having the ARP/ND lookup routines
		 * understand kpkt and perform the conversion only
		 * when it is needed.
		 */
		if (__probable(pkt != NULL)) {
			if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
				/* borrow the attached mbuf; put it back
				 * at "done" unless ARP/ND consumed it */
				reattach_mbuf = TRUE;
				m = pkt->pkt_mbuf;
				KPKT_CLEAR_MBUF_DATA(pkt);
			} else {
				m = fsw_classq_kpkt_to_mbuf(fsw, pkt);
			}
			if (m == NULL) {
				/* not a fatal error; move on */
				SK_ERR("failed to allocate mbuf while "
				    "resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
		} else {
			m = NULL;
		}
	}

	RT_LOCK(tgt_rt);

	/* the target must be a direct host route whose gateway is an
	 * Ethernet link-layer address, else the route is unusable */
	if (__improbable(!IS_DIRECT_HOSTROUTE(tgt_rt) ||
	    tgt_rt->rt_gateway->sa_family != AF_LINK ||
	    SDL(tgt_rt->rt_gateway)->sdl_type != IFT_ETHER)) {
		rtstat.rts_badrtgwroute++;
		err = ENETUNREACH;
		RT_UNLOCK(tgt_rt);
		SK_ERR("bad gateway route %s on %s (err %d)",
		    sk_sa_ntop(tgt_sa, dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		goto done;
	}

	/*
	 * If already resolved, grab the link-layer address and mark the
	 * flow route accordingly.  Given that we will use the cached
	 * link-layer info, there's no need to convert and enqueue the
	 * packet to ARP/ND (i.e. no need to return EJUSTRETURN).
	 */
	if (__probable((fr->fr_flags & FLOWRTF_HAS_LLINFO) &&
	    SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) {
		VERIFY(m == NULL);
		FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
		os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
		/* if we're not probing, then we're done */
		if (!(probing = (fr->fr_want_probe != 0))) {
			VERIFY(err == 0);
			RT_UNLOCK(tgt_rt);
			goto done;
		}
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		probing = FALSE;
		os_atomic_andnot(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s %s on %s", (probing ?
	    "probing" : "resolving"), sk_sa_ntop(tgt_sa, dst_s,
	    sizeof(dst_s)), ifp->if_xname);

	/*
	 * Trigger ARP/NDP resolution or probing.
	 */
	switch (tgt_sa->sa_family) {
	case AF_INET: {
		struct sockaddr_dl sdl;

		RT_UNLOCK(tgt_rt);
		/*
		 * Note we pass NULL as "hint" parameter, as tgt_sa
		 * is already refererring to the target address.
		 */
		SOCKADDR_ZERO(&sdl, sizeof(sdl));
		err = arp_lookup_ip(ifp, SIN(tgt_sa), &sdl, sizeof(sdl),
		    NULL, m);

		/*
		 * If we're resolving (not probing), and it's now resolved,
		 * grab the link-layer address and update the flow route.
		 * If we get EJUSTRETURN, the mbuf (if any) would have
		 * been added to the hold queue.  Any other return values
		 * including 0 means that we need to free it.
		 *
		 * If we're probing, we won't have any mbuf to deal with,
		 * and since we already have the cached llinfo we'll just
		 * return success even if we get EJUSTRETURN.
		 */
		if (!probing) {
			if (err == 0 && sdl.sdl_alen == ETHER_ADDR_LEN) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "fast-resolve %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				FLOWRT_UPD_ETH_DST(fr, LLADDR(&sdl));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			}
			if (err == EJUSTRETURN && m != NULL) {
				/* ARP now owns the mbuf; forget it so the
				 * "done" path neither frees nor reattaches */
				SK_DF(SK_VERB_FLOW_ROUTE, "packet queued "
				    "while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				m = NULL;
			}
		} else {
			VERIFY(m == NULL);
			if (err == EJUSTRETURN) {
				err = 0;
			}
		}
		break;
	}

	case AF_INET6: {
		struct llinfo_nd6 *ln = tgt_rt->rt_llinfo;

		/*
		 * Check if the route is down.  RTF_LLINFO is set during
		 * RTM_{ADD,RESOLVE}, and is never cleared until the route
		 * is deleted from the routing table.
		 */
		if ((tgt_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
		    (RTF_UP | RTF_LLINFO) || ln == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("route unavailable for %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			RT_UNLOCK(tgt_rt);
			break;
		}

		/*
		 * If we're probing and IPv6 ND cache entry is STALE,
		 * use it anyway but also mark it for delayed probe
		 * and update the expiry.
		 */
		if (probing) {
			VERIFY(m == NULL);
			VERIFY(ln->ln_state > ND6_LLINFO_INCOMPLETE);
			if (ln->ln_state == ND6_LLINFO_STALE) {
				ln->ln_asked = 0;
				ND6_CACHE_STATE_TRANSITION(ln,
				    ND6_LLINFO_DELAY);
				ln_setexpire(ln, net_uptime() + nd6_delay);
				/* rt lock must be dropped before rnh_lock */
				RT_UNLOCK(tgt_rt);

				lck_mtx_lock(rnh_lock);
				nd6_sched_timeout(NULL, NULL);
				lck_mtx_unlock(rnh_lock);

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "NUD probe scheduled for %s on %s",
				    sk_sa_ntop(tgt_sa, dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			} else {
				RT_UNLOCK(tgt_rt);
			}
			VERIFY(err == 0);
			break;
		}

		/*
		 * If this is a permanent ND entry, we're done.
		 */
		if (ln->ln_expire == 0 &&
		    ln->ln_state == ND6_LLINFO_REACHABLE) {
			if (SDL(tgt_rt->rt_gateway)->sdl_alen !=
			    ETHER_ADDR_LEN) {
				err = EHOSTUNREACH;
				SK_ERR("invalid permanent route %s on %s"
				    "ln 0x%llx (err %d)",
				    sk_sa_ntop(rt_key(tgt_rt), dst_s,
				    sizeof(dst_s)), ifp->if_xname,
				    SK_KVA(ln), err);
			} else {
				SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve "
				    "permanent route %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
				/* copy permanent address into the flow route */
				FLOWRT_UPD_ETH_DST(fr,
				    LLADDR(SDL(tgt_rt->rt_gateway)));
				os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
				VERIFY(err == 0);
			}
			RT_UNLOCK(tgt_rt);
			break;
		}

		if (ln->ln_state == ND6_LLINFO_NOSTATE) {
			ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_INCOMPLETE);
		}

		if (ln->ln_state == ND6_LLINFO_INCOMPLETE && (!ln->ln_asked ||
		    !(fr->fr_flags & FLOWRTF_HAS_LLINFO))) {
			struct nd_ifinfo *ndi = ND_IFINFO(tgt_rt->rt_ifp);
			/*
			 * There is a neighbor cache entry, but no Ethernet
			 * address response yet.  Replace the held mbuf
			 * (if any) with this the one we have (if any),
			 * else leave it alone.
			 *
			 * This code conforms to the rate-limiting rule
			 * described in Section 7.2.2 of RFC 4861, because
			 * the timer is set correctly after sending an
			 * NS below.
			 */
			if (m != NULL) {
				if (ln->ln_hold != NULL) {
					m_freem_list(ln->ln_hold);
				}
				/* ND hold queue takes ownership of m */
				ln->ln_hold = m;
				m = NULL;

				SK_DF(SK_VERB_FLOW_ROUTE,
				    "packet queued while resolving %s on %s",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname);
			}
			VERIFY(ndi != NULL && ndi->initialized);
			ln->ln_asked++;
			/* ndi->retrans is in msec; ln expiry is in sec */
			ln_setexpire(ln, net_uptime() + ndi->retrans / 1000);
			RT_UNLOCK(tgt_rt);

			SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s"
			    "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt),
			    dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln),
			    ln->ln_state);

			/* XXX Refactor this to use same src ip */
			nd6_ns_output(tgt_rt->rt_ifp, NULL,
			    &SIN6(rt_key(tgt_rt))->sin6_addr, NULL, NULL);

			lck_mtx_lock(rnh_lock);
			nd6_sched_timeout(NULL, NULL);
			lck_mtx_unlock(rnh_lock);
			err = EJUSTRETURN;
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE, "fast-resolve %s on %s",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname);
			/*
			 * The neighbor cache entry has been resolved;
			 * copy the address into the flow route.
			 */
			FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(tgt_rt->rt_gateway)));
			os_atomic_or(&fr->fr_flags, (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO), relaxed);
			RT_UNLOCK(tgt_rt);
			VERIFY(err == 0);
		}
		/*
		 * XXX Need to optimize for the NDP garbage
		 * collection.  It would be even better to unify
		 * BSD/SK NDP management through the completion
		 * of L2/L3 split.
		 */
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	RT_LOCK_ASSERT_NOTHELD(tgt_rt);

done:
	/*
	 * If we still own an mbuf here, either hand it back to the kpkt
	 * it was borrowed from, or free it (the conversion copy case).
	 */
	if (m != NULL) {
		if (reattach_mbuf) {
			pkt->pkt_mbuf = m;
			pkt->pkt_pflags |= PKT_F_MBUF_DATA;
		} else {
			m_freem_list(m);
		}
		m = NULL;
	}

	if (__improbable(err != 0 && err != EJUSTRETURN)) {
		SK_ERR("route to %s on %s can't be resolved (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname, err);
		/* keep FLOWRTF_HAS_LLINFO as llinfo is still useful */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}

	FR_UNLOCK(fr);

	return err;
}
473 
474 static void
fsw_ethernet_frame(struct nx_flowswitch * fsw,struct flow_route * fr,struct __kern_packet * pkt)475 fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr,
476     struct __kern_packet *pkt)
477 {
478 	/* in the event the source MAC address changed, update our copy */
479 	if (__improbable(fr->fr_llhdr.flh_gencnt != fsw->fsw_src_lla_gencnt)) {
480 		uint8_t old_shost[ETHER_ADDR_LEN];
481 
482 		bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN);
483 		fsw_ethernet_ctor(fsw, fr);
484 
485 		SK_ERR("fr 0x%llx source MAC address updated on %s, "
486 		    "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x",
487 		    SK_KVA(fr), fsw->fsw_ifp,
488 		    old_shost[0], old_shost[1],
489 		    old_shost[2], old_shost[3],
490 		    old_shost[4], old_shost[5],
491 		    fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1],
492 		    fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3],
493 		    fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]);
494 	}
495 
496 	_CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED);
497 
498 	if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) {
499 		pkt->pkt_link_flags |= PKT_LINKF_MCAST;
500 	} else if ((fr->fr_flags & FLOWRTF_DST_LL_BCAST) != 0) {
501 		pkt->pkt_link_flags |= PKT_LINKF_BCAST;
502 	}
503 
504 	ASSERT(pkt->pkt_headroom >= FSW_ETHER_LEN_PADDED);
505 
506 	char *pkt_buf;
507 	MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
508 	sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
509 	    (uint64_t *)(void *)(pkt_buf + pkt->pkt_headroom - FSW_ETHER_LEN_PADDED));
510 
511 	pkt->pkt_headroom -= ETHER_HDR_LEN;
512 	pkt->pkt_l2_len = ETHER_HDR_LEN;
513 
514 	if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) {
515 		/* frame and fix up mbuf */
516 		struct mbuf *m = pkt->pkt_mbuf;
517 		sk_copy64_16((uint64_t *)(void *)&fr->fr_eth_padded,
518 		    (uint64_t *)(void *)(m->m_data - FSW_ETHER_LEN_PADDED));
519 		ASSERT((uintptr_t)m->m_data ==
520 		    (uintptr_t)mbuf_datastart(m) + FSW_ETHER_FRAME_HEADROOM);
521 		m->m_data -= ETHER_HDR_LEN;
522 		m->m_len += ETHER_HDR_LEN;
523 		m_pktlen(m) += ETHER_HDR_LEN;
524 		ASSERT(m->m_len == m_pktlen(m));
525 		pkt->pkt_length = m_pktlen(m);
526 	} else {
527 		METADATA_ADJUST_LEN(pkt, ETHER_HDR_LEN, pkt->pkt_headroom);
528 	}
529 }
530 
531 static sa_family_t
fsw_ethernet_demux(struct nx_flowswitch * fsw,struct __kern_packet * pkt)532 fsw_ethernet_demux(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
533 {
534 #pragma unused(fsw)
535 	const struct ether_header *eh;
536 	sa_family_t af = AF_UNSPEC;
537 	uint32_t bdlen, bdlim, bdoff;
538 	uint8_t *baddr;
539 
540 	MD_BUFLET_ADDR_ABS_DLEN(pkt, baddr, bdlen, bdlim, bdoff);
541 	baddr += pkt->pkt_headroom;
542 	eh = (struct ether_header *)(void *)baddr;
543 
544 	if (__improbable(sizeof(*eh) > pkt->pkt_length)) {
545 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
546 		SK_ERR("unrecognized pkt, len %u", pkt->pkt_length);
547 		return AF_UNSPEC;
548 	}
549 
550 	if (__improbable(pkt->pkt_headroom + sizeof(*eh) > bdlim)) {
551 		SK_ERR("ethernet header overrun 1st buflet");
552 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_ERR);
553 		return AF_UNSPEC;
554 	}
555 
556 	if (__improbable((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0)) {
557 		pkt->pkt_length -= ETHER_CRC_LEN;
558 		pkt->pkt_link_flags &= ~PKT_LINKF_ETHFCS;
559 		if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
560 			ASSERT((pkt->pkt_mbuf->m_flags & M_HASFCS) != 0);
561 			m_adj(pkt->pkt_mbuf, -ETHER_CRC_LEN);
562 			pkt->pkt_mbuf->m_flags &= ~M_HASFCS;
563 		}
564 	}
565 	pkt->pkt_l2_len = ETHER_HDR_LEN;
566 	if ((eh->ether_dhost[0] & 1) == 0) {
567 		/*
568 		 * When the driver is put into promiscuous mode we may receive
569 		 * unicast frames that are not intended for our interfaces.
570 		 * They are marked here as being promiscuous so the caller may
571 		 * dispose of them after passing the packets to any interface
572 		 * filters.
573 		 */
574 		if (_ether_cmp(eh->ether_dhost, IF_LLADDR(fsw->fsw_ifp))) {
575 			pkt->pkt_pflags |= PKT_F_PROMISC;
576 			STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_PROMISC);
577 			return AF_UNSPEC;
578 		}
579 	}
580 	uint16_t ether_type = ntohs(eh->ether_type);
581 	switch (ether_type) {
582 	case ETHERTYPE_IP:
583 		af = AF_INET;
584 		break;
585 	case ETHERTYPE_IPV6:
586 		af = AF_INET6;
587 		break;
588 	default:
589 		STATS_INC(&fsw->fsw_stats, FSW_STATS_RX_DEMUX_UNSPEC);
590 		break;
591 	}
592 
593 	return af;
594 }
595