xref: /xnu-8020.121.3/bsd/skywalk/nexus/flowswitch/flow/flow_route.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941) !
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Flow Routes.
31  *
32  * Each (non-listener) flow entry is always associated with a flow route
33  * object.  Multiple flow entries sharing the same remote address will use
34  * the same flow route for that address.  The flow route object contains
35  * the route information for the remote node.  It gets allocated when a
36  * flow entry requests to connect, and is garbage-collected when it's no
37  * longer referred to after its expiration time has passed.
38  *
39  * A flow route also contains the default local address that's used to
40  * reach the remote node.  This may not necessarily be the same local
41  * address used by the flow entry, if it has explicitly bound the entry
42  * to another local address.  But for the majority of cases, having the
43  * local address be present in the flow route allows us to avoid doing
44  * source address selection each time a connect request happens.
45  *
46  * When the remote node is reachable via a gateway, the gateway address
47  * portion of the flow route contains its IP address and the flow route
48  * is marked with FLOWRTF_GATEWAY.  We use this to optimize the gateway
49  * route lookup, since otherwise we'd have to perform an extra lookup
50  * each time we need to resolve the route.
51  *
52  * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53  * is set, and the gateway address isn't used.  The target address used
54  * for resolution will be the remote address itself.
55  *
56  * On links with link-layer information, we store the resolved address
57  * of the target node (which may be the gateway's) in the flow route,
58  * and mark the flow route with FLOWRTF_HAS_LLINFO.
59  *
60  * Each flow route also registers itself to receive route events when
61  * the underlying rtentry is updated or deleted.
62  */
63 
64 #include <skywalk/os_skywalk_private.h>
65 
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75 
76 extern struct rtstat rtstat;
77 
78 static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
79 static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
80 
81 static int fr_cmp(const struct flow_route *, const struct flow_route *);
82 static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
83 static struct flow_route *fr_alloc(boolean_t);
84 static void fr_free(struct flow_route *);
85 static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
86     uint32_t *, boolean_t, boolean_t);
87 static void flow_route_ev_callback(struct eventhandler_entry_arg,
88     struct sockaddr *, int, struct sockaddr *, int);
89 
90 RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
91 RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
92 
93 #define FR_ZONE_NAME    "flow.route"
94 
95 static unsigned int flow_route_size;            /* size of flow_route */
96 struct skmem_cache *flow_route_cache;           /* cache for flow_route */
97 
98 static int __flow_route_inited = 0;
99 
100 #define FLOW_ROUTE_EXPIRE       600     /* seconds */
101 static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
102 
103 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
104     CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
105 
106 void
flow_route_init(void)107 flow_route_init(void)
108 {
109 	ASSERT(!__flow_route_inited);
110 
111 	flow_route_size = sizeof(struct flow_route);
112 	flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
113 	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
114 
115 	__flow_route_inited = 1;
116 }
117 
118 void
flow_route_fini(void)119 flow_route_fini(void)
120 {
121 	if (__flow_route_inited) {
122 		skmem_cache_destroy(flow_route_cache);
123 		flow_route_cache = NULL;
124 
125 		__flow_route_inited = 0;
126 	}
127 }
128 
129 struct flow_route_bucket *
flow_route_buckets_alloc(size_t frb_cnt,size_t * frb_sz,size_t * tot_sz)130 flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
131 {
132 	uint32_t cache_sz = skmem_cpu_cache_line_size();
133 	struct flow_route_bucket *frb;
134 	void *frb_buf, **frb_pbuf;
135 	size_t frb_tot_sz;
136 
137 	/* each bucket is CPU cache-aligned */
138 	*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
139 
140 	/* total size includes extra for alignment requirements */
141 	*tot_sz = frb_tot_sz = (sizeof(void *) + (frb_cnt * (*frb_sz)) + cache_sz);
142 	frb_buf = sk_alloc(frb_tot_sz, Z_WAITOK, skmem_tag_fsw_frb_hash);
143 	if (__improbable(frb_buf == NULL)) {
144 		return NULL;
145 	}
146 
147 	/*
148 	 * In case we didn't get a cache-aligned memory, round it up
149 	 * accordingly.  This is needed in order to get the rest of
150 	 * the structure members aligned properly.  It also means that
151 	 * the memory span gets shifted due to the round up, but it
152 	 * is okay since we've allocated extra space for this.
153 	 */
154 	frb = (struct flow_route_bucket *)
155 	    P2ROUNDUP((intptr_t)frb_buf + sizeof(void *), cache_sz);
156 	frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
157 	ASSERT((intptr_t)frb_pbuf >= (intptr_t)frb_buf);
158 	ASSERT(((intptr_t)frb + (frb_cnt * (*frb_sz))) <=
159 	    ((intptr_t)frb_buf + frb_tot_sz));
160 	*frb_pbuf = frb_buf;
161 
162 	SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
163 	    "(total %zu bytes, frb_buf 0x%llx) ALLOC", SK_KVA(frb), frb_cnt,
164 	    *frb_sz, frb_tot_sz, SK_KVA(frb_buf));
165 
166 	return frb;
167 }
168 
169 void
flow_route_buckets_free(struct flow_route_bucket * frb,size_t tot_sz)170 flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
171 {
172 	void *frb_buf, **frb_pbuf;
173 
174 	/* get the original address that we stuffed in earlier and free it */
175 	frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
176 	frb_buf = *frb_pbuf;
177 	SK_DF(SK_VERB_MEM, "frb 0x%llx (frb_buf 0x%llx) FREE",
178 	    SK_KVA(frb), SK_KVA(frb_buf));
179 	sk_free(frb_buf, tot_sz);
180 }
181 
182 void
flow_route_bucket_init(struct flow_route_bucket * frb)183 flow_route_bucket_init(struct flow_route_bucket *frb)
184 {
185 	ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
186 	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
187 	    &flow_route_lock_attr);
188 	RB_INIT(&frb->frb_head);
189 }
190 
191 void
flow_route_bucket_destroy(struct flow_route_bucket * frb)192 flow_route_bucket_destroy(struct flow_route_bucket *frb)
193 {
194 	ASSERT(RB_EMPTY(&frb->frb_head));
195 	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
196 }
197 
198 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)199 flow_route_find_by_addr(struct flow_route_bucket *frb,
200     union sockaddr_in_4_6 *dst)
201 {
202 	struct flow_route *fr;
203 	struct flow_route find;
204 
205 	FRB_LOCK_ASSERT_HELD(frb);
206 
207 	switch (SA(dst)->sa_family) {
208 	case AF_INET:
209 		find.fr_af = AF_INET;
210 		find.fr_addr_len = sizeof(struct in_addr);
211 		find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
212 		break;
213 
214 	case AF_INET6:
215 		find.fr_af = AF_INET6;
216 		find.fr_addr_len = sizeof(struct in6_addr);
217 		find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
218 		break;
219 
220 	default:
221 		VERIFY(0);
222 		/* NOTREACHED */
223 		__builtin_unreachable();
224 	}
225 
226 	fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
227 	if (fr != NULL) {
228 		flow_route_retain(fr);  /* for the caller */
229 	}
230 	return fr;
231 }
232 
233 struct flow_route_id_bucket *
flow_route_id_buckets_alloc(size_t frib_cnt,size_t * frib_sz,size_t * tot_sz)234 flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
235 {
236 	uint32_t cache_sz = skmem_cpu_cache_line_size();
237 	struct flow_route_id_bucket *frib;
238 	void *frib_buf, **frib_pbuf;
239 	size_t frib_tot_sz;
240 
241 	/* each bucket is CPU cache-aligned */
242 	*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
243 
244 	/* total size includes extra for alignment requirements */
245 	*tot_sz = frib_tot_sz = (sizeof(void *) + (frib_cnt * (*frib_sz)) + cache_sz);
246 	frib_buf = sk_alloc(frib_tot_sz, Z_WAITOK, skmem_tag_fsw_frib_hash);
247 	if (__improbable(frib_buf == NULL)) {
248 		return NULL;
249 	}
250 
251 	/*
252 	 * In case we didn't get a cache-aligned memory, round it up
253 	 * accordingly.  This is needed in order to get the rest of
254 	 * the structure members aligned properly.  It also means that
255 	 * the memory span gets shifted due to the round up, but it
256 	 * is okay since we've allocated extra space for this.
257 	 */
258 	frib = (struct flow_route_id_bucket *)
259 	    P2ROUNDUP((intptr_t)frib_buf + sizeof(void *), cache_sz);
260 	frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
261 	ASSERT((intptr_t)frib_pbuf >= (intptr_t)frib_buf);
262 	ASSERT(((intptr_t)frib + (frib_cnt * (*frib_sz))) <=
263 	    ((intptr_t)frib_buf + frib_tot_sz));
264 	*frib_pbuf = frib_buf;
265 
266 	SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
267 	    "(total %zu bytes, frib_buf 0x%llx) ALLOC", SK_KVA(frib), frib_cnt,
268 	    *frib_sz, frib_tot_sz, SK_KVA(frib_buf));
269 
270 	return frib;
271 }
272 
273 void
flow_route_id_buckets_free(struct flow_route_id_bucket * frib,size_t tot_sz)274 flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
275 {
276 	void *frib_buf, **frib_pbuf;
277 
278 	/* get the original address that we stuffed in earlier and free it */
279 	frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
280 	frib_buf = *frib_pbuf;
281 	SK_DF(SK_VERB_MEM, "frib 0x%llx (frib_buf 0x%llx) FREE", SK_KVA(frib),
282 	    SK_KVA(frib_buf));
283 	sk_free(frib_buf, tot_sz);
284 }
285 
286 void
flow_route_id_bucket_init(struct flow_route_id_bucket * frib)287 flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
288 {
289 	ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
290 	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
291 	    &flow_route_lock_attr);
292 	RB_INIT(&frib->frib_head);
293 }
294 
295 void
flow_route_id_bucket_destroy(struct flow_route_id_bucket * frib)296 flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
297 {
298 	ASSERT(RB_EMPTY(&frib->frib_head));
299 	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
300 }
301 
302 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)303 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
304 {
305 	struct flow_route *fr;
306 	struct flow_route find;
307 
308 	FRIB_LOCK_ASSERT_HELD(frib);
309 
310 	uuid_copy(find.fr_uuid, id);
311 	fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
312 	if (fr != NULL) {
313 		flow_route_retain(fr);  /* for the caller */
314 	}
315 	return fr;
316 }
317 
318 static struct flow_route *
fr_alloc(boolean_t cansleep)319 fr_alloc(boolean_t cansleep)
320 {
321 	struct flow_route *fr;
322 
323 	if ((fr = skmem_cache_alloc(flow_route_cache,
324 	    (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) {
325 		bzero(fr, flow_route_size);
326 		lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
327 		    &flow_route_lock_attr);
328 		lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
329 		    &flow_route_lock_attr);
330 		uuid_generate_random(fr->fr_uuid);
331 
332 		SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
333 	}
334 
335 	return fr;
336 }
337 
338 static void
fr_free(struct flow_route * fr)339 fr_free(struct flow_route *fr)
340 {
341 	SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));
342 
343 	VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
344 	VERIFY(fr->fr_usecnt == 0);
345 
346 	FR_LOCK(fr);
347 	/* callee frees route entry */
348 	flow_route_cleanup(fr);
349 	VERIFY(fr->fr_rt_dst == NULL);
350 	VERIFY(fr->fr_rt_gw == NULL);
351 	VERIFY(fr->fr_rt_evhdlr_tag == NULL);
352 	FR_UNLOCK(fr);
353 
354 	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
355 	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
356 
357 	skmem_cache_free(flow_route_cache, fr);
358 }
359 
360 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)361 fr_cmp(const struct flow_route *a, const struct flow_route *b)
362 {
363 	int d;
364 
365 	if ((d = (a->fr_af - b->fr_af)) != 0) {
366 		return d;
367 	}
368 	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
369 	    b->fr_addr_len)) != 0) {
370 		return d;
371 	}
372 
373 	return 0;
374 }
375 
376 static inline int
fr_id_cmp(const struct flow_route * a,const struct flow_route * b)377 fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
378 {
379 	return uuid_compare(a->fr_uuid, b->fr_uuid);
380 }
381 
382 static inline int
fr_use_stable_address(struct nx_flow_req * req)383 fr_use_stable_address(struct nx_flow_req *req)
384 {
385 	int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
386 	if (req != NULL &&
387 	    (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
388 		use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
389 	}
390 	return use_stable_address;
391 }
392 
393 int
flow_route_configure(struct flow_route * fr,struct ifnet * ifp,struct nx_flow_req * req)394 flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
395 {
396 #if SK_LOG
397 	char old_s[MAX_IPv6_STR_LEN];   /* src */
398 	char src_s[MAX_IPv6_STR_LEN];   /* src */
399 	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
400 #endif /* SK_LOG */
401 	struct rtentry *rt = NULL, *gwrt = NULL;
402 	int err = 0;
403 
404 	FR_LOCK_ASSERT_HELD(fr);
405 
406 	/*
407 	 * If there is a route entry for the final destination, see if
408 	 * it's no longer valid and perform another routing table lookup.
409 	 * A non-NULL fr_rt_dst is always associated with a route event
410 	 * registration, and the route reference is held there.
411 	 */
412 	rt = fr->fr_rt_dst;
413 	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
414 		struct eventhandler_entry_arg ee_arg;
415 
416 		/* callee frees route entry */
417 		flow_route_cleanup(fr);
418 
419 		/* lookup destination route */
420 		ASSERT(err == 0);
421 		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
422 		if (rt == NULL) {
423 			err = EHOSTUNREACH;
424 			SK_ERR("no route to %s on %s (err %d)",
425 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
426 			    sizeof(dst_s)), ifp->if_xname, err);
427 		} else {
428 			/*
429 			 * If route points to another interface and the
430 			 * route's gateway isn't link-layer, reject it.
431 			 * We make an exception otherwise, since local
432 			 * interface addresses resolve this way.
433 			 */
434 			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
435 			    (rt->rt_gateway == NULL ||
436 			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
437 				err = EHOSTUNREACH;
438 				SK_ERR("route to %s on %s != %s (err %d)",
439 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
440 				    sizeof(dst_s)), rt->rt_ifp->if_xname,
441 				    ifp->if_xname, err);
442 			}
443 		}
444 
445 		if (err != 0) {
446 			goto done;
447 		}
448 
449 		ASSERT(fr->fr_mgr != NULL);
450 		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
451 		ASSERT(!uuid_is_null(fr->fr_uuid));
452 		ASSERT(!uuid_is_null(fr->fr_nx_uuid));
453 
454 		bzero(&ee_arg, sizeof(ee_arg));
455 		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
456 		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);
457 
458 		/*
459 		 * Register for changes on destination route; this covers both
460 		 * cases where the destination is on-link, or if it is off-link
461 		 * and is using a gateway route.  This also transfers the refcnt
462 		 * of the route entry to the event handler, released later when
463 		 * it is deregistered.
464 		 */
465 		ASSERT(fr->fr_rt_dst == NULL);
466 		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
467 		fr->fr_rt_dst = rt;             /* move reference to fr */
468 		fr->fr_rt_evhdlr_tag =
469 		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
470 		    flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
471 		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
472 		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_DELETED);
473 
474 		/*
475 		 * Lookup gateway route (if any); returns locked gwrt
476 		 * with a reference bumped up.
477 		 */
478 		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
479 		if (err != 0) {
480 			/*
481 			 * Reference held by fr_rt_dst will be taken
482 			 * care of by flow_route_cleanup() below, so
483 			 * make sure we don't do an extra rtfree().
484 			 */
485 			rt = NULL;
486 			ASSERT(gwrt == NULL);
487 			SK_ERR("no gw route to %s on %s (err %d)",
488 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
489 			    sizeof(dst_s)), ifp->if_xname, err);
490 			goto done;
491 		}
492 
493 		/* if RTF_GATEWAY isn't set, gwrt == rt */
494 		ASSERT(gwrt != NULL);
495 		RT_LOCK_ASSERT_HELD(gwrt);
496 
497 		/*
498 		 * Must have been cleared via cleanup, and that we're
499 		 * single-threaded here for fr by virtue of fr_lock.
500 		 */
501 		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));
502 
503 		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
504 		    (rt->rt_gateway->sa_family == AF_INET ||
505 		    rt->rt_gateway->sa_family == AF_INET6)) {
506 			struct sockaddr_storage ss;
507 
508 			ASSERT(fr->fr_rt_gw == NULL);
509 			/* locked via route_to_gwroute() above */
510 			fr->fr_rt_gw = gwrt;    /* move reference to fr */
511 			RT_ADDREF_LOCKED(gwrt); /* for this routine */
512 			/*
513 			 * Destination is off-link and is reachable
514 			 * thru an IP gateway route.  Save the IP
515 			 * address of the gateway in fr_gaddr.
516 			 */
517 			(void) sa_copy(rt->rt_gateway, &ss, NULL);
518 			_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
519 			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
520 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_GATEWAY);
521 		} else if (IS_DIRECT_HOSTROUTE(rt)) {
522 			/*
523 			 * Destination is on-link.
524 			 */
525 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_ONLINK);
526 		}
527 		RT_UNLOCK(gwrt);
528 	}
529 	RT_ADDREF(rt);          /* for this routine */
530 
531 	/* see if we need to re-select default source address */
532 	int use_stable_address = fr_use_stable_address(req);
533 	if (fr->fr_want_configure ||
534 	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
535 	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
536 		union sockaddr_in_4_6 old = fr->fr_laddr;
537 		if (use_stable_address) {
538 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
539 		} else {
540 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
541 		}
542 		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
543 		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
544 			SK_ERR("no usable src address to reach %s on %s "
545 			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
546 			    sizeof(dst_s)), ifp->if_xname, err);
547 			goto done;
548 		}
549 		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
550 			SK_ERR("src address is now %s (was %s) to reach %s "
551 			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
552 			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
553 			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
554 			    dst_s, sizeof(dst_s)), ifp->if_xname);
555 		}
556 	}
557 	ASSERT(err == 0);
558 
559 done:
560 	if (__probable(err == 0)) {
561 		atomic_set_32(&fr->fr_want_configure, 0);
562 	} else {
563 		/* callee frees route entry */
564 		flow_route_cleanup(fr);
565 	}
566 
567 	if (gwrt != NULL) {
568 		ASSERT(rt != NULL);
569 		if (gwrt == rt) {
570 			RT_REMREF(gwrt);
571 		} else {
572 			rtfree(gwrt);
573 		}
574 		gwrt = NULL;
575 	}
576 
577 	if (rt != NULL) {
578 		rtfree(rt);
579 		rt = NULL;
580 	}
581 
582 	return err;
583 }
584 
585 int
flow_route_find(struct kern_nexus * nx,struct flow_mgr * fm,struct ifnet * ifp,struct nx_flow_req * req,flow_route_ctor_fn_t fr_ctor,flow_route_resolve_fn_t fr_resolve,void * arg,struct flow_route ** frp)586 flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
587     struct ifnet *ifp, struct nx_flow_req *req,
588     flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
589     void *arg, struct flow_route **frp)
590 {
591 #if SK_LOG
592 	char src_s[MAX_IPv6_STR_LEN];   /* dst */
593 	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
594 	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
595 #endif /* SK_LOG */
596 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
597 	struct flow_route_bucket *frb;
598 	struct flow_route_id_bucket *frib;
599 	struct flow_route *fr = NULL;
600 	int err = 0;
601 
602 	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
603 
604 	ASSERT(frp != NULL);
605 	*frp = NULL;
606 
607 	frb = flow_mgr_get_frb_by_addr(fm, daddr);
608 
609 	int use_stable_address = fr_use_stable_address(req);
610 
611 	/* see if there is a cached flow route (as reader) */
612 	FRB_RLOCK(frb);
613 	fr = flow_route_find_by_addr(frb, daddr);
614 	if (fr != NULL) {
615 		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
616 		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
617 		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
618 			atomic_add_32(&fr->fr_want_configure, 1);
619 			FR_LOCK(fr);
620 			err = flow_route_configure(fr, ifp, req);
621 			if (err != 0) {
622 				SK_ERR("fr 0x%llx error re-configuring dst %s "
623 				    "on %s (err %d) [R]", SK_KVA(fr),
624 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
625 				    sizeof(dst_s)), ifp->if_xname, err);
626 			}
627 			FR_UNLOCK(fr);
628 		}
629 		if (err == 0) {
630 			SK_DF(SK_VERB_FLOW_ROUTE,
631 			    "fr 0x%llx found for dst %s " "on %s [R,%u]",
632 			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
633 			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
634 		}
635 		FRB_RUNLOCK(frb);       /* reader */
636 		goto done;
637 	}
638 
639 	/*
640 	 * Flow route doesn't exist; become a writer and prepare to
641 	 * allocate one.  We could be racing with other threads here,
642 	 * so check first if there is now a cached flow route that
643 	 * got created by the winning thread.
644 	 */
645 	if (!FRB_RLOCKTOWLOCK(frb)) {
646 		FRB_WLOCK(frb);
647 	}
648 
649 	fr = flow_route_find_by_addr(frb, daddr);
650 	if (fr != NULL) {
651 		if (__improbable(fr->fr_want_configure) ||
652 		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
653 			FR_LOCK(fr);
654 			err = flow_route_configure(fr, ifp, req);
655 			if (err != 0) {
656 				SK_ERR("fr 0x%llx error re-configuring dst %s "
657 				    "on %s (err %d) [W]", SK_KVA(fr),
658 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
659 				    sizeof(dst_s)), ifp->if_xname, err);
660 			}
661 			FR_UNLOCK(fr);
662 		}
663 		if (err == 0) {
664 			SK_DF(SK_VERB_FLOW_ROUTE,
665 			    "fr 0x%llx found for dst %s on %s [W,%u]",
666 			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
667 			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
668 		}
669 		FRB_WUNLOCK(frb);       /* writer */
670 		goto done;
671 	}
672 
673 	/* allocate one */
674 	fr = fr_alloc(TRUE);
675 	fr->fr_faddr = *daddr;          /* remote address */
676 
677 	switch (SA(&fr->fr_faddr)->sa_family) {
678 	case AF_INET:
679 		SIN(&fr->fr_faddr)->sin_port = 0;
680 		fr->fr_addr_len = sizeof(struct in_addr);
681 		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
682 		break;
683 
684 	case AF_INET6:
685 		SIN6(&fr->fr_faddr)->sin6_port = 0;
686 		fr->fr_addr_len = sizeof(struct in6_addr);
687 		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
688 		break;
689 
690 	default:
691 		VERIFY(0);
692 		/* NOTREACHED */
693 		__builtin_unreachable();
694 	}
695 
696 	ASSERT(!uuid_is_null(fr->fr_uuid));
697 	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
698 	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;
699 
700 	/* force configure newly-created flow route */
701 	atomic_add_32(&fr->fr_want_configure, 1);
702 
703 	FR_LOCK(fr);
704 	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
705 		SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
706 		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
707 		    sizeof(dst_s)), ifp->if_xname, err);
708 		FR_UNLOCK(fr);
709 		FRB_WUNLOCK(frb);       /* writer */
710 		/* not yet in tree, so free immediately */
711 		fr_free(fr);
712 		fr = NULL;
713 		goto done;
714 	}
715 
716 	/* execute nexus-specific constructor */
717 	fr_ctor(arg, fr);
718 	FR_UNLOCK(fr);
719 
720 	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
721 	FRIB_WLOCK(frib);
722 
723 	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
724 	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;
725 
726 	FRB_WLOCK_ASSERT_HELD(frb);
727 	FRIB_WLOCK_ASSERT_HELD(frib);
728 
729 	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
730 	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);
731 
732 	atomic_bitset_32(&fr->fr_flags, FLOWRTF_ATTACHED);
733 
734 #if DEBUG
735 	/* sanity checks for comparator routines */
736 	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
737 	flow_route_release(fr);
738 	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
739 	flow_route_release(fr);
740 #endif /* DEBUG */
741 
742 	/* for the trees */
743 	_CASSERT(FLOW_ROUTE_MINREF == 2);
744 	flow_route_retain(fr);
745 	flow_route_retain(fr);
746 	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);
747 
748 	/* for the caller */
749 	flow_route_retain(fr);
750 
751 	FRIB_WUNLOCK(frib);     /* writer */
752 	FRB_WUNLOCK(frb);       /* writer */
753 
754 	/* execute nexus-specific resolver */
755 	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
756 	    (err = fr_resolve(arg, fr, NULL)) != 0) {
757 		if (fr->fr_flags & FLOWRTF_GATEWAY) {
758 			SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
759 			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
760 			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
761 			    sizeof(dst_s)), ifp->if_xname, err);
762 		} else {
763 			SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
764 			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
765 			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
766 			    sizeof(dst_s)), ifp->if_xname, err);
767 		}
768 		if (err == EJUSTRETURN) {
769 			err = 0;
770 		} else {
771 			goto done;
772 		}
773 	}
774 	ASSERT(err == 0);
775 
776 #if SK_LOG
777 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
778 		SK_DF(SK_VERB_FLOW_ROUTE,
779 		    "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
780 		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
781 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
782 		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
783 		    ifp->if_xname);
784 	} else {
785 		SK_DF(SK_VERB_FLOW_ROUTE,
786 		    "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
787 		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
788 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
789 		    ifp->if_xname);
790 	}
791 #endif /* SK_LOG */
792 
793 done:
794 	if (err == 0) {
795 		ASSERT(fr != NULL);
796 		*frp = fr;
797 	} else if (fr != NULL) {
798 		/* can't directly call fr_free() if it's in the tree */
799 		flow_route_release(fr);
800 		fr = NULL;
801 	}
802 
803 	return err;
804 }
805 
806 void
flow_route_retain(struct flow_route * fr)807 flow_route_retain(struct flow_route *fr)
808 {
809 	lck_spin_lock(&fr->fr_reflock);
810 	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
811 		fr->fr_expire = 0;
812 	}
813 	lck_spin_unlock(&fr->fr_reflock);
814 }
815 
816 void
flow_route_release(struct flow_route * fr)817 flow_route_release(struct flow_route *fr)
818 {
819 	bool should_free = false;
820 
821 	lck_spin_lock(&fr->fr_reflock);
822 	VERIFY(fr->fr_usecnt > 0);
823 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
824 		if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
825 			fr->fr_expire = _net_uptime + flow_route_expire;
826 		}
827 	} else {
828 		/*
829 		 * fr is no longer in lookup tree, so there shouldn't be
830 		 * further usecnt, if we reach 0 usecnt, then this is the very
831 		 * last reference and is safe to unlock and call fr_free.
832 		 */
833 		if (--(fr->fr_usecnt) == 0) {
834 			should_free = true;
835 		}
836 	}
837 	lck_spin_unlock(&fr->fr_reflock);
838 
839 	if (should_free) {
840 		fr_free(fr);
841 	}
842 }
843 
844 static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket * frb,uint32_t * resid,boolean_t all,boolean_t early_expire)845 flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
846     boolean_t all, boolean_t early_expire)
847 {
848 #if SK_LOG
849 	char ss[MAX_IPv6_STR_LEN];      /* dst */
850 	char ds[MAX_IPv6_STR_LEN];      /* dst */
851 	char gs[MAX_IPv6_STR_LEN];      /* gw */
852 #endif /* SK_LOG */
853 	struct flow_route *fr, *tfr;
854 	uint64_t now = net_uptime();
855 	uint32_t i = 0, tot = 0;
856 
857 	FRB_WLOCK_ASSERT_HELD(frb);
858 
859 	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
860 		struct flow_route_id_bucket *frib =
861 		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);
862 
863 		++tot;
864 		/*
865 		 * We're not holding fr_lock here, since this is a
866 		 * best-effort check.  If there's a race and we miss
867 		 * it now, we'll come back again shortly.
868 		 */
869 		lck_spin_lock(&fr->fr_reflock);
870 		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
871 		    (fr->fr_expire > now && !early_expire &&
872 		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
873 			lck_spin_unlock(&fr->fr_reflock);
874 			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
875 			    "refcnt %u expire %llu", SK_KVA(fr),
876 			    fr->fr_usecnt, fr->fr_expire);
877 			continue;
878 		}
879 		lck_spin_unlock(&fr->fr_reflock);
880 
881 		/*
882 		 * If "all" is set, flow entries must be gone by now, as
883 		 * we must be called by flow_route_bucket_purge_all().
884 		 * It also means that the caller has acquired writer lock
885 		 * on all flow {route,route_id} buckets, and fr_usecnt
886 		 * must be at its minimum value now.
887 		 */
888 		if (!all) {
889 			FRIB_WLOCK(frib);
890 		}
891 		FRIB_WLOCK_ASSERT_HELD(frib);
892 
893 		_CASSERT(FLOW_ROUTE_MINREF == 2);
894 		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
895 
896 		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
897 		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);
898 
899 		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_ATTACHED);
900 
901 #if SK_LOG
902 		if (fr->fr_flags & FLOWRTF_GATEWAY) {
903 			SK_DF(SK_VERB_FLOW_ROUTE,
904 			    "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
905 			    SK_KVA(fr),
906 			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
907 			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
908 			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
909 			    (int64_t)(fr->fr_expire - now));
910 		} else {
911 			SK_DF(SK_VERB_FLOW_ROUTE,
912 			    "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
913 			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
914 			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
915 			    (int64_t)(fr->fr_expire - now));
916 		}
917 #endif /* SK_LOG */
918 
919 		/* for the trees */
920 		flow_route_release(fr);
921 		flow_route_release(fr);
922 		++i;
923 
924 		if (!all) {
925 			FRIB_WUNLOCK(frib);
926 		}
927 	}
928 
929 	if (resid != NULL) {
930 		*resid = (tot - i);
931 	}
932 
933 	return i;
934 }
935 
936 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)937 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
938 {
939 	(void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
940 }
941 
942 static uint32_t
flow_route_bucket_prune(struct flow_route_bucket * frb,struct ifnet * ifp,uint32_t * resid)943 flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
944     uint32_t *resid)
945 {
946 	uint64_t now = net_uptime();
947 	struct flow_route *fr;
948 	uint32_t i = 0, tot = 0;
949 	boolean_t ifdown = !(ifp->if_flags & IFF_UP);
950 
951 	FRB_RLOCK(frb);
952 	RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
953 		++tot;
954 		/* loose check; do this without holding fr_reflock */
955 		if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
956 		    (fr->fr_expire > now && !ifdown &&
957 		    !(fr->fr_flags & FLOWRTF_DELETED))) {
958 			continue;
959 		}
960 		++i;
961 	}
962 
963 	/*
964 	 * If there's nothing to prune or there's a writer, we're done.
965 	 * Note that if we failed to upgrade to writer, the lock would
966 	 * have been released automatically.
967 	 */
968 	if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
969 		if (i == 0) {
970 			FRB_RUNLOCK(frb);
971 		}
972 		if (resid != NULL) {
973 			*resid = (tot - i);
974 		}
975 		return 0;
976 	}
977 
978 	SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
979 	    i, ifp->if_xname);
980 
981 	/* purge idle ones */
982 	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
983 	FRB_WUNLOCK(frb);
984 
985 	return i;
986 }
987 
988 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)989 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
990     uint32_t *tot_resid)
991 {
992 	uint32_t pruned = 0;
993 	uint32_t resid;
994 	uint32_t i;
995 
996 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
997 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
998 		pruned += flow_route_bucket_prune(frb, ifp, &resid);
999 		if (tot_resid != NULL) {
1000 			*tot_resid += resid;
1001 		}
1002 	}
1003 
1004 	return pruned;
1005 }
1006 
1007 /*
1008  * This runs in the context of eventhandler invocation routine which loops
1009  * through all the registered callbacks.  Care must be taken to not call
1010  * any primitives here that would lead to routing changes in the same context
1011  * as it would lead to deadlock in eventhandler code.
1012  */
1013 static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,struct sockaddr * dst,int route_ev,struct sockaddr * gw_addr,int flags)1014 flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
1015     struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags)
1016 {
1017 #pragma unused(dst, flags)
1018 #if SK_LOG
1019 	char dst_s[MAX_IPv6_STR_LEN];
1020 #endif /* SK_LOG */
1021 	struct flow_route_id_bucket *frib = NULL;
1022 	struct flow_route *fr = NULL;
1023 	struct flow_mgr *fm;
1024 
1025 	VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
1026 	VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
1027 
1028 	/*
1029 	 * Upon success, callee will hold flow manager lock as reader,
1030 	 * and we'll need to unlock it below.  Otherwise there's no
1031 	 * need to unlock here and just return.
1032 	 */
1033 	fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
1034 	if (fm == NULL) {
1035 		SK_ERR("Event %s for dst %s ignored; flow manager not found",
1036 		    route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
1037 		    sizeof(dst_s)));
1038 		return;
1039 	}
1040 
1041 	SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
1042 	    sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
1043 
1044 	do {
1045 		frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
1046 
1047 		FRIB_RLOCK(frib);
1048 		/* callee returns a reference that we need to release below */
1049 		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1050 		if (fr == NULL) {
1051 			SK_ERR("%s: dst %s flow route not found", fm->fm_name,
1052 			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1053 			break;
1054 		}
1055 
1056 		/*
1057 		 * Grab fr_lock to prevent flow route configuration or
1058 		 * resolver from using stale info while we are updating.
1059 		 */
1060 		FR_LOCK(fr);
1061 
1062 		switch (route_ev) {
1063 		case ROUTE_ENTRY_REFRESH:
1064 			/*
1065 			 * This is the case where the route entry has been
1066 			 * updated (for example through RTM_CHANGE).  Some
1067 			 * of it may not warrant a lookup again and some of
1068 			 * it may.  For now, mark flow to perform a look-up
1069 			 * again as the gateway may have changed.
1070 			 */
1071 			atomic_add_32(&fr->fr_want_configure, 1);
1072 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1073 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
1074 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1075 			    sizeof(dst_s)));
1076 			break;
1077 
1078 		case ROUTE_ENTRY_DELETED:
1079 			/*
1080 			 * NOTE: flow_route_cleanup() should not be called
1081 			 * to de-register eventhandler in the context of
1082 			 * eventhandler callback to avoid deadlock in
1083 			 * eventhandler code.  Instead, just mark the flow
1084 			 * route un-resolved.  When it is being used again
1085 			 * or being deleted the old eventhandler must be
1086 			 * de-registered.
1087 			 */
1088 			atomic_add_32(&fr->fr_want_configure, 1);
1089 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1090 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_DELETED);
1091 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
1092 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1093 			    sizeof(dst_s)));
1094 			break;
1095 
1096 		case ROUTE_LLENTRY_STALE:
1097 			/*
1098 			 * When the route entry is deemed unreliable or old
1099 			 * enough to trigger a route lookup again.  Don't
1100 			 * reconfigure the flow route, but simply attempt
1101 			 * to resolve it next time to trigger a probe.
1102 			 */
1103 			atomic_add_32(&fr->fr_want_probe, 1);
1104 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1105 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
1106 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1107 			    sizeof(dst_s)));
1108 			break;
1109 
1110 		case ROUTE_LLENTRY_CHANGED:
1111 			/*
1112 			 * When the link-layer info has changed; replace
1113 			 * cached llinfo in the flow route (treat this
1114 			 * as ROUTE_LLENTRY_RESOLVED).
1115 			 */
1116 			OS_FALLTHROUGH;
1117 
1118 		case ROUTE_LLENTRY_RESOLVED:
1119 			/*
1120 			 * SDL address length may be 0 for cellular.
1121 			 * If Ethernet, copy into flow route and mark
1122 			 * it as cached.  In all cases, mark the flow
1123 			 * route as resolved.
1124 			 */
1125 			ASSERT(SDL(gw_addr)->sdl_family == AF_LINK);
1126 			if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) {
1127 				FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr)));
1128 				SK_DF(SK_VERB_FLOW_ROUTE,
1129 				    "%s: dst %s llentry %s", fm->fm_name,
1130 				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
1131 				    (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
1132 				    "resolved" : "changed"));
1133 				atomic_bitset_32(&fr->fr_flags,
1134 				    FLOWRTF_HAS_LLINFO);
1135 			} else {
1136 				atomic_bitclear_32(&fr->fr_flags,
1137 				    FLOWRTF_HAS_LLINFO);
1138 			}
1139 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1140 #if SK_LOG
1141 			if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
1142 			    0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
1143 				SK_DF(SK_VERB_FLOW_ROUTE,
1144 				    "%s: fr 0x%llx eth_type 0x%x "
1145 				    "eth_src %x:%x:%x:%x:%x:%x "
1146 				    "eth_dst %x:%x:%x:%x:%x:%x [%s])",
1147 				    fm->fm_name, SK_KVA(fr),
1148 				    ntohs(fr->fr_eth.ether_type),
1149 				    fr->fr_eth.ether_shost[0],
1150 				    fr->fr_eth.ether_shost[1],
1151 				    fr->fr_eth.ether_shost[2],
1152 				    fr->fr_eth.ether_shost[3],
1153 				    fr->fr_eth.ether_shost[4],
1154 				    fr->fr_eth.ether_shost[5],
1155 				    fr->fr_eth.ether_dhost[0],
1156 				    fr->fr_eth.ether_dhost[1],
1157 				    fr->fr_eth.ether_dhost[2],
1158 				    fr->fr_eth.ether_dhost[3],
1159 				    fr->fr_eth.ether_dhost[4],
1160 				    fr->fr_eth.ether_dhost[5],
1161 				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1162 			}
1163 #endif /* SK_LOG */
1164 			break;
1165 
1166 		case ROUTE_LLENTRY_DELETED:
1167 			/*
1168 			 * If the route entry points to a router and an
1169 			 * RTM_DELETE has been issued on it; force the
1170 			 * flow route to be reconfigured.
1171 			 */
1172 			atomic_add_32(&fr->fr_want_configure, 1);
1173 			atomic_bitclear_32(&fr->fr_flags,
1174 			    (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED));
1175 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
1176 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1177 			    sizeof(dst_s)));
1178 			break;
1179 
1180 		case ROUTE_LLENTRY_PROBED:
1181 			/*
1182 			 * When the resolver has begun probing the target;
1183 			 * nothing to do here.
1184 			 */
1185 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
1186 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1187 			    sizeof(dst_s)));
1188 			break;
1189 
1190 		case ROUTE_LLENTRY_UNREACH:
1191 			/*
1192 			 * When the route entry is marked with RTF_REJECT
1193 			 * or the probes have timed out, reconfigure.
1194 			 */
1195 			atomic_add_32(&fr->fr_want_configure, 1);
1196 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1197 			SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
1198 			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1199 			break;
1200 
1201 		default:
1202 			break;
1203 		}
1204 	} while (0);
1205 
1206 	if (fr != NULL) {
1207 		flow_route_release(fr);
1208 		FR_UNLOCK(fr);
1209 	}
1210 
1211 	if (frib != NULL) {
1212 		FRIB_UNLOCK(frib);
1213 	}
1214 
1215 	if (fm != NULL) {
1216 		flow_mgr_unlock();
1217 	}
1218 }
1219 
1220 int
flow_route_select_laddr(union sockaddr_in_4_6 * src,union sockaddr_in_4_6 * dst,struct ifnet * ifp,struct rtentry * rt,uint32_t * ipaddr_gencnt,int use_stable_address)1221 flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
1222     struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
1223     int use_stable_address)
1224 {
1225 #if SK_LOG
1226 	char src_s[MAX_IPv6_STR_LEN];   /* src */
1227 	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
1228 #endif /* SK_LOG */
1229 	sa_family_t af = SA(dst)->sa_family;
1230 	struct ifnet *src_ifp = NULL;
1231 	struct ifaddr *ifa = NULL;
1232 	int err = 0;
1233 
1234 	/* see comments in flow_route_configure() regarding loopback */
1235 	ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);
1236 
1237 	switch (af) {
1238 	case AF_INET: {
1239 		ifnet_lock_shared(ifp);
1240 		if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
1241 			err = EHOSTUNREACH;
1242 			SK_ERR("route to %s has src address marked detaching "
1243 			    "(err %d)", inet_ntop(AF_INET,
1244 			    &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
1245 			ifnet_lock_done(ifp);
1246 			break;
1247 		}
1248 		SIN(src)->sin_len = sizeof(struct sockaddr_in);
1249 		SIN(src)->sin_family = AF_INET;
1250 		SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
1251 		ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
1252 		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1253 		ifnet_lock_done(ifp);
1254 		break;
1255 	}
1256 
1257 	case AF_INET6: {
1258 		struct in6_addr src_storage, *in6;
1259 
1260 		uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
1261 		if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
1262 		    ifp, 0, &src_storage, &src_ifp, &err, &ifa)) == NULL) {
1263 			if (err == 0) {
1264 				err = EADDRNOTAVAIL;
1265 			}
1266 			VERIFY(src_ifp == NULL);
1267 			SK_ERR("src address to dst %s on %s not available "
1268 			    "(err %d)", inet_ntop(AF_INET6,
1269 			    &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
1270 			    ifp->if_xname, err);
1271 			break;
1272 		}
1273 
1274 		VERIFY(src_ifp != NULL);
1275 		VERIFY(ifa != NULL);
1276 
1277 		if (__improbable(src_ifp != ifp)) {
1278 			if (err == 0) {
1279 				err = ENETUNREACH;
1280 			}
1281 			SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
1282 			    inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
1283 			    dst_s, sizeof(dst_s)),
1284 			    inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
1285 			    src_s, sizeof(src_s)),
1286 			    src_ifp->if_xname, ifp->if_xname, err);
1287 			break;
1288 		}
1289 
1290 		ifnet_lock_shared(ifp);
1291 		if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1292 			err = EHOSTUNREACH;
1293 			SK_ERR("IPv6 address selected is marked to be "
1294 			    "detached (err %d)", err);
1295 			ifnet_lock_done(ifp);
1296 			break;
1297 		}
1298 
1299 		/* clear embedded scope if link-local src */
1300 		if (IN6_IS_SCOPE_EMBED(in6)) {
1301 			if (in6_embedded_scope) {
1302 				SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
1303 				in6->s6_addr16[1] = 0;
1304 			} else {
1305 				SIN6(src)->sin6_scope_id = src_ifp->if_index;
1306 			}
1307 		}
1308 		SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
1309 		SIN6(src)->sin6_family = AF_INET6;
1310 		SIN6(src)->sin6_addr = *in6;
1311 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
1312 		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1313 		ifnet_lock_done(ifp);
1314 		break;
1315 	}
1316 
1317 	default:
1318 		VERIFY(0);
1319 		/* NOTREACHED */
1320 		__builtin_unreachable();
1321 	}
1322 
1323 	if (ifa != NULL) {
1324 		IFA_REMREF(ifa);
1325 	}
1326 
1327 	if (src_ifp != NULL) {
1328 		ifnet_release(src_ifp);
1329 	}
1330 
1331 #if SK_LOG
1332 	if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
1333 		SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
1334 		    sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
1335 		    sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
1336 		    ifp->if_xname);
1337 	}
1338 #endif /* SK_LOG */
1339 
1340 	return err;
1341 }
1342 
1343 void
flow_route_cleanup(struct flow_route * fr)1344 flow_route_cleanup(struct flow_route *fr)
1345 {
1346 #if SK_LOG
1347 	char ss[MAX_IPv6_STR_LEN];      /* dst */
1348 	char ds[MAX_IPv6_STR_LEN];      /* dst */
1349 	char gs[MAX_IPv6_STR_LEN];      /* gw */
1350 #endif /* SK_LOG */
1351 
1352 	FR_LOCK_ASSERT_HELD(fr);
1353 
1354 	if (fr->fr_rt_evhdlr_tag != NULL) {
1355 		ASSERT(fr->fr_rt_dst != NULL);
1356 		route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
1357 		    ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
1358 		fr->fr_rt_evhdlr_tag = NULL;
1359 		fr->fr_rt_dst = NULL;
1360 	}
1361 	ASSERT(fr->fr_rt_dst == NULL);
1362 	if (fr->fr_rt_gw != NULL) {
1363 		rtfree(fr->fr_rt_gw);
1364 		fr->fr_rt_gw = NULL;
1365 	}
1366 
1367 #if SK_LOG
1368 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
1369 		SK_DF(SK_VERB_FLOW_ROUTE,
1370 		    "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
1371 		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1372 		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
1373 		    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
1374 	} else if (fr->fr_flags & FLOWRTF_ONLINK) {
1375 		SK_DF(SK_VERB_FLOW_ROUTE,
1376 		    "clean fr 0x%llx %s -> %s", SK_KVA(fr),
1377 		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1378 		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
1379 	}
1380 #endif /* SK_LOG */
1381 
1382 	atomic_bitclear_32(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK));
1383 }
1384 
1385 static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr * src_ip0,uint8_t ip_v,struct ifnet * ifp,uint32_t * gencnt)1386 _flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
1387     struct ifnet *ifp, uint32_t *gencnt)
1388 {
1389 	boolean_t address_found = TRUE;
1390 	struct ifaddr *ifa = NULL;
1391 	struct flow_ip_addr src_ip = {};
1392 	uint32_t scope = ifp->if_index;
1393 
1394 	VERIFY(gencnt != NULL);
1395 	VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
1396 
1397 	if (ip_v == IPVERSION) {
1398 		memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
1399 
1400 		ifa = (struct ifaddr *)ifa_foraddr_scoped(
1401 			src_ip._v4.s_addr, scope);
1402 	} else {
1403 		memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
1404 
1405 		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
1406 			src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
1407 		}
1408 		ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
1409 		    scope);
1410 	}
1411 
1412 	if (__improbable(ifa == NULL)) {
1413 		address_found = FALSE;
1414 		goto done;
1415 	}
1416 
1417 	ifnet_lock_shared(ifp);
1418 	if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1419 		address_found = FALSE;
1420 		ifnet_lock_done(ifp);
1421 		goto done;
1422 	}
1423 
1424 	if (ip_v == IPV6_VERSION) {
1425 		struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
1426 
1427 		/*
1428 		 * Fail if IPv6 address is not ready or if the address
1429 		 * is reserved * for CLAT46.
1430 		 */
1431 		if (__improbable(ia6->ia6_flags &
1432 		    (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
1433 			address_found = FALSE;
1434 			ifnet_lock_done(ifp);
1435 			goto done;
1436 		}
1437 	} else {
1438 		/*
1439 		 * If interface has CLAT46 enabled, fail IPv4 bind.
1440 		 * Since this implies network is NAT64/DNS64, Internet
1441 		 * effectively becomes reachable over IPv6.  If on
1442 		 * system IPv4 to IPv6 translation is required, that
1443 		 * should be handled solely through bump in the API.
1444 		 * The in kernel translation is only done for apps
1445 		 * directly using low level networking APIs.
1446 		 */
1447 		if (__improbable(IS_INTF_CLAT46(ifp))) {
1448 			address_found = FALSE;
1449 			ifnet_lock_done(ifp);
1450 			goto done;
1451 		}
1452 	}
1453 
1454 	*gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1455 	ifnet_lock_done(ifp);
1456 done:
1457 	if (ifa != NULL) {
1458 		IFA_REMREF(ifa);
1459 	}
1460 
1461 	return address_found;
1462 }
1463 
1464 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1465 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1466     uint32_t *gencnt)
1467 {
1468 	VERIFY(saddr->sa.sa_family == AF_INET ||
1469 	    saddr->sa.sa_family == AF_INET6);
1470 
1471 	struct flow_ip_addr *ipa;
1472 	uint8_t ipv;
1473 	if (saddr->sa.sa_family == AF_INET) {
1474 		ipv = IPVERSION;
1475 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1476 	} else {
1477 		ipv = IPV6_VERSION;
1478 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1479 	}
1480 
1481 	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1482 }
1483 
1484 boolean_t
flow_route_key_validate(struct flow_key * fk,struct ifnet * ifp,uint32_t * gencnt)1485 flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1486     uint32_t *gencnt)
1487 {
1488 	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
1489 	           gencnt);
1490 }
1491