xref: /xnu-10002.81.5/bsd/skywalk/nexus/flowswitch/flow/flow_route.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Flow Routes.
31  *
32  * Each (non-listener) flow entry is always associated with a flow route
33  * object.  Multiple flow entries sharing the same remote address will use
34  * the same flow route for that address.  The flow route object contains
35  * the route information for the remote node.  It gets allocated when a
36  * flow entry requests to connect, and is garbage-collected when it's no
37  * longer referred to after its expiration time has passed.
38  *
39  * A flow route also contains the default local address that's used to
40  * reach the remote node.  This may not necessarily be the same local
41  * address used by the flow entry, if it has explicitly bound the entry
42  * to another local address.  But for the majority of cases, having the
43  * local address be present in the flow route allows us to avoid doing
44  * source address selection each time a connect request happens.
45  *
46  * When the remote node is reachable via a gateway, the gateway address
47  * portion of the flow route contains its IP address and the flow route
48  * is marked with FLOWRTF_GATEWAY.  We use this to optimize the gateway
49  * route lookup, since otherwise we'd have to perform an extra lookup
50  * each time we need to resolve the route.
51  *
52  * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53  * is set, and the gateway address isn't used.  The target address used
54  * for resolution will be the remote address itself.
55  *
56  * On links with link-layer information, we store the resolved address
57  * of the target node (which may be the gateway's) in the flow route,
58  * and mark the flow route with FLOWRTF_HAS_LLINFO.
59  *
60  * Each flow route also registers itself to receive route events when
61  * the underlying rtentry is updated or deleted.
62  */
63 
64 #include <skywalk/os_skywalk_private.h>
65 
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75 
76 extern struct rtstat rtstat;
77 
78 static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
79 static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
80 
81 static int fr_cmp(const struct flow_route *, const struct flow_route *);
82 static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
83 static struct flow_route *fr_alloc(boolean_t);
84 static void fr_free(struct flow_route *);
85 static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
86     uint32_t *, boolean_t, boolean_t);
87 static void flow_route_ev_callback(struct eventhandler_entry_arg,
88     struct sockaddr *, int, struct sockaddr *, int);
89 
90 RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
91 RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
92 
93 KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT);
94 KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT);
95 
96 #define FR_ZONE_NAME    "flow.route"
97 
98 static unsigned int flow_route_size;            /* size of flow_route */
99 struct skmem_cache *flow_route_cache;           /* cache for flow_route */
100 
101 static int __flow_route_inited = 0;
102 
103 #define FLOW_ROUTE_EXPIRE       600     /* seconds */
104 static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
105 
106 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
107     CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
108 
109 void
flow_route_init(void)110 flow_route_init(void)
111 {
112 	ASSERT(!__flow_route_inited);
113 
114 	flow_route_size = sizeof(struct flow_route);
115 	flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
116 	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
117 
118 	__flow_route_inited = 1;
119 }
120 
121 void
flow_route_fini(void)122 flow_route_fini(void)
123 {
124 	if (__flow_route_inited) {
125 		skmem_cache_destroy(flow_route_cache);
126 		flow_route_cache = NULL;
127 
128 		__flow_route_inited = 0;
129 	}
130 }
131 
132 struct flow_route_bucket *
flow_route_buckets_alloc(size_t frb_cnt,size_t * frb_sz,size_t * tot_sz)133 flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
134 {
135 	uint32_t cache_sz = skmem_cpu_cache_line_size();
136 	struct flow_route_bucket *frb;
137 	size_t frb_tot_sz;
138 
139 	/* each bucket is CPU cache-aligned */
140 	*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
141 	*tot_sz = frb_tot_sz = frb_cnt * (*frb_sz);
142 	frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK,
143 	    skmem_tag_fsw_frb_hash);
144 	if (__improbable(frb == NULL)) {
145 		return NULL;
146 	}
147 
148 #if !KASAN_CLASSIC
149 	/*
150 	 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
151 	 * size alignment if the requested size is a multiple of a cacheline
152 	 * size (this is true for any size that is a power of two from 16 to
153 	 * PAGE_SIZE).
154 	 *
155 	 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
156 	 * not respect this.
157 	 */
158 	ASSERT(IS_P2ALIGNED(frb, cache_sz));
159 #endif
160 
161 	SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
162 	    "(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt,
163 	    *frb_sz, frb_tot_sz);
164 
165 	return frb;
166 }
167 
/*
 * Release the bucket array obtained from flow_route_buckets_alloc();
 * tot_sz must be the *tot_sz value returned by that call.
 */
void
flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
{
	SK_DF(SK_VERB_MEM, "frb 0x%llx FREE", SK_KVA(frb));
	sk_free_type_hash(KT_SK_FRB, tot_sz, frb);
}
174 
/*
 * Initialize one flow route bucket: its reader/writer lock and an
 * empty RB tree of routes keyed by remote address.
 */
void
flow_route_bucket_init(struct flow_route_bucket *frb)
{
#if !KASAN_CLASSIC
	/* alignment is an optimization only; KASAN_CLASSIC may break it */
	ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
	RB_INIT(&frb->frb_head);
}
185 
/*
 * Destroy a flow route bucket; the caller must have purged all
 * routes from it first (tree must be empty).
 */
void
flow_route_bucket_destroy(struct flow_route_bucket *frb)
{
	ASSERT(RB_EMPTY(&frb->frb_head));
	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
}
192 
193 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)194 flow_route_find_by_addr(struct flow_route_bucket *frb,
195     union sockaddr_in_4_6 *dst)
196 {
197 	struct flow_route *fr;
198 	struct flow_route find;
199 
200 	FRB_LOCK_ASSERT_HELD(frb);
201 
202 	switch (SA(dst)->sa_family) {
203 	case AF_INET:
204 		find.fr_af = AF_INET;
205 		find.fr_addr_len = sizeof(struct in_addr);
206 		find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
207 		break;
208 
209 	case AF_INET6:
210 		find.fr_af = AF_INET6;
211 		find.fr_addr_len = sizeof(struct in6_addr);
212 		find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
213 		break;
214 
215 	default:
216 		VERIFY(0);
217 		/* NOTREACHED */
218 		__builtin_unreachable();
219 	}
220 
221 	fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
222 	if (fr != NULL) {
223 		flow_route_retain(fr);  /* for the caller */
224 	}
225 	return fr;
226 }
227 
/*
 * Allocate the array of flow route ID buckets (UUID-keyed lookup),
 * each padded to a CPU cache line.  On success fills in *frib_sz
 * (per-bucket stride) and *tot_sz (total bytes, needed later by
 * flow_route_id_buckets_free()); returns NULL on failure.
 */
struct flow_route_id_bucket *
flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
{
	uint32_t cache_sz = skmem_cpu_cache_line_size();
	struct flow_route_id_bucket *frib;
	size_t frib_tot_sz;

	/* each bucket is CPU cache-aligned */
	*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
	*tot_sz = frib_tot_sz = frib_cnt * (*frib_sz);
	frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK,
	    skmem_tag_fsw_frib_hash);
	if (__improbable(frib == NULL)) {
		return NULL;
	}

#if !KASAN_CLASSIC
	/*
	 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
	 * size alignment if the requested size is a multiple of a cacheline
	 * size (this is true for any size that is a power of two from 16 to
	 * PAGE_SIZE).
	 *
	 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
	 * not respect this.
	 */
	ASSERT(IS_P2ALIGNED(frib, cache_sz));
#endif /* !KASAN_CLASSIC */

	SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
	    "(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt,
	    *frib_sz, frib_tot_sz);

	return frib;
}
264 
/*
 * Release the ID bucket array obtained from
 * flow_route_id_buckets_alloc(); tot_sz must be the *tot_sz value
 * returned by that call.
 */
void
flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
{
	SK_DF(SK_VERB_MEM, "frib 0x%llx FREE", SK_KVA(frib));
	sk_free_type_hash(KT_SK_FRIB, tot_sz, frib);
}
271 
/*
 * Initialize one flow route ID bucket: its reader/writer lock and
 * an empty RB tree of routes keyed by UUID.
 */
void
flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
{
#if !KASAN_CLASSIC
	/* alignment is an optimization only; KASAN_CLASSIC may break it */
	ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
	RB_INIT(&frib->frib_head);
}
282 
/*
 * Destroy a flow route ID bucket; the caller must have purged all
 * routes from it first (tree must be empty).
 */
void
flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
{
	ASSERT(RB_EMPTY(&frib->frib_head));
	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
}
289 
290 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)291 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
292 {
293 	struct flow_route *fr;
294 	struct flow_route find;
295 
296 	FRIB_LOCK_ASSERT_HELD(frib);
297 
298 	uuid_copy(find.fr_uuid, id);
299 	fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
300 	if (fr != NULL) {
301 		flow_route_retain(fr);  /* for the caller */
302 	}
303 	return fr;
304 }
305 
306 static struct flow_route *
fr_alloc(boolean_t cansleep)307 fr_alloc(boolean_t cansleep)
308 {
309 	struct flow_route *fr;
310 
311 	if ((fr = skmem_cache_alloc(flow_route_cache,
312 	    (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) {
313 		bzero(fr, flow_route_size);
314 		lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
315 		    &flow_route_lock_attr);
316 		lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
317 		    &flow_route_lock_attr);
318 		uuid_generate_random(fr->fr_uuid);
319 
320 		SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
321 	}
322 
323 	return fr;
324 }
325 
/*
 * Final destructor for a flow route.  Must only run after the route
 * has been detached from both lookup trees and its last reference
 * dropped; releases route-entry state, destroys locks, and returns
 * the object to the cache.
 */
static void
fr_free(struct flow_route *fr)
{
	SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));

	VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
	VERIFY(fr->fr_usecnt == 0);

	FR_LOCK(fr);
	/* callee frees route entry */
	flow_route_cleanup(fr);
	VERIFY(fr->fr_rt_dst == NULL);
	VERIFY(fr->fr_rt_gw == NULL);
	VERIFY(fr->fr_rt_evhdlr_tag == NULL);
	FR_UNLOCK(fr);

	/* locks must be destroyed only after the final unlock above */
	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);

	skmem_cache_free(flow_route_cache, fr);
}
347 
348 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)349 fr_cmp(const struct flow_route *a, const struct flow_route *b)
350 {
351 	int d;
352 
353 	if ((d = (a->fr_af - b->fr_af)) != 0) {
354 		return d;
355 	}
356 	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
357 	    b->fr_addr_len)) != 0) {
358 		return d;
359 	}
360 
361 	return 0;
362 }
363 
/*
 * RB-tree comparator for the UUID-keyed flow route tree.
 */
static inline int
fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
{
	return uuid_compare(a->fr_uuid, b->fr_uuid);
}
369 
370 static inline int
fr_use_stable_address(struct nx_flow_req * req)371 fr_use_stable_address(struct nx_flow_req *req)
372 {
373 	int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
374 	if (req != NULL &&
375 	    (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
376 		use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
377 	}
378 	return use_stable_address;
379 }
380 
/*
 * (Re)configure a flow route on ifp: look up (or refresh) the route
 * entry for the remote address, register for route events on it,
 * capture the gateway IP (off-link case) or mark on-link, and select
 * the default local address.  Called with fr_lock held.  Returns 0
 * on success; on failure all partially-configured route state is
 * torn down via flow_route_cleanup().
 */
int
flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
{
#if SK_LOG
	char old_s[MAX_IPv6_STR_LEN];   /* previous src */
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	struct rtentry *rt = NULL, *gwrt = NULL;
	int err = 0;

	FR_LOCK_ASSERT_HELD(fr);

	/*
	 * If there is a route entry for the final destination, see if
	 * it's no longer valid and perform another routing table lookup.
	 * A non-NULL fr_rt_dst is always associated with a route event
	 * registration, and the route reference is held there.
	 */
	rt = fr->fr_rt_dst;
	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
		struct eventhandler_entry_arg ee_arg;

		/* callee frees route entry */
		flow_route_cleanup(fr);

		/* lookup destination route */
		ASSERT(err == 0);
		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
		if (rt == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("no route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			/*
			 * If route points to another interface and the
			 * route's gateway isn't link-layer, reject it.
			 * We make an exception otherwise, since local
			 * interface addresses resolve this way.
			 */
			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
			    (rt->rt_gateway == NULL ||
			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
				err = EHOSTUNREACH;
				SK_ERR("route to %s on %s != %s (err %d)",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), rt->rt_ifp->if_xname,
				    ifp->if_xname, err);
			}
		}

		if (err != 0) {
			goto done;
		}

		ASSERT(fr->fr_mgr != NULL);
		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
		ASSERT(!uuid_is_null(fr->fr_uuid));
		ASSERT(!uuid_is_null(fr->fr_nx_uuid));

		bzero(&ee_arg, sizeof(ee_arg));
		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);

		/*
		 * Register for changes on destination route; this covers both
		 * cases where the destination is on-link, or if it is off-link
		 * and is using a gateway route.  This also transfers the refcnt
		 * of the route entry to the event handler, released later when
		 * it is deregistered.
		 */
		ASSERT(fr->fr_rt_dst == NULL);
		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
		fr->fr_rt_dst = rt;             /* move reference to fr */
		fr->fr_rt_evhdlr_tag =
		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
		    flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed);

		/*
		 * Lookup gateway route (if any); returns locked gwrt
		 * with a reference bumped up.
		 */
		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
		if (err != 0) {
			/*
			 * Reference held by fr_rt_dst will be taken
			 * care of by flow_route_cleanup() below, so
			 * make sure we don't do an extra rtfree().
			 */
			rt = NULL;
			ASSERT(gwrt == NULL);
			SK_ERR("no gw route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* if RTF_GATEWAY isn't set, gwrt == rt */
		ASSERT(gwrt != NULL);
		RT_LOCK_ASSERT_HELD(gwrt);

		/*
		 * Must have been cleared via cleanup, and that we're
		 * single-threaded here for fr by virtue of fr_lock.
		 */
		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));

		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
		    (rt->rt_gateway->sa_family == AF_INET ||
		    rt->rt_gateway->sa_family == AF_INET6)) {
			struct sockaddr_storage ss;

			ASSERT(fr->fr_rt_gw == NULL);
			/* locked via route_to_gwroute() above */
			fr->fr_rt_gw = gwrt;    /* move reference to fr */
			RT_ADDREF_LOCKED(gwrt); /* for this routine */
			/*
			 * Destination is off-link and is reachable
			 * thru an IP gateway route.  Save the IP
			 * address of the gateway in fr_gaddr.
			 */
			(void) sa_copy(rt->rt_gateway, &ss, NULL);
			_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
			os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed);
		} else if (IS_DIRECT_HOSTROUTE(rt)) {
			/*
			 * Destination is on-link.
			 */
			os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed);
		}
		RT_UNLOCK(gwrt);
	}
	RT_ADDREF(rt);          /* for this routine */

	/* see if we need to re-select default source address */
	int use_stable_address = fr_use_stable_address(req);
	if (fr->fr_want_configure ||
	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
		union sockaddr_in_4_6 old = fr->fr_laddr;
		/* record the stable/temporary choice before selecting */
		if (use_stable_address) {
			os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		} else {
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		}
		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
			SK_ERR("no usable src address to reach %s on %s "
			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}
		/* log source address changes for debugging */
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
			SK_ERR("src address is now %s (was %s) to reach %s "
			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
			    dst_s, sizeof(dst_s)), ifp->if_xname);
		}
	}
	ASSERT(err == 0);

done:
	if (__probable(err == 0)) {
		os_atomic_store(&fr->fr_want_configure, 0, release);
	} else {
		/* callee frees route entry */
		flow_route_cleanup(fr);
	}

	/* drop this routine's references; gwrt may alias rt (on-link) */
	if (gwrt != NULL) {
		ASSERT(rt != NULL);
		if (gwrt == rt) {
			RT_REMREF(gwrt);
		} else {
			rtfree(gwrt);
		}
		gwrt = NULL;
	}

	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return err;
}
572 
/*
 * Find (or create) the flow route for the remote address in req,
 * returning it with a caller reference in *frp.  Fast path: a read
 * lock on the address bucket and a tree lookup, re-configuring the
 * route only if it is stale.  Slow path: upgrade to writer, re-check
 * (another thread may have won the race), then allocate, configure,
 * insert into both the address and UUID trees, and kick off
 * nexus-specific resolution via fr_resolve.  Returns 0 or an errno.
 */
int
flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
    struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *arg, struct flow_route **frp)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
#endif /* SK_LOG */
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	struct flow_route_bucket *frb;
	struct flow_route_id_bucket *frib;
	struct flow_route *fr = NULL;
	int err = 0;

	ASSERT(fr_ctor != NULL && fr_resolve != NULL);

	ASSERT(frp != NULL);
	*frp = NULL;

	frb = flow_mgr_get_frb_by_addr(fm, daddr);

	int use_stable_address = fr_use_stable_address(req);

	/* see if there is a cached flow route (as reader) */
	FRB_RLOCK(frb);
	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* reconfigure if stale or if the stable-addr choice changed */
		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [R]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s " "on %s [R,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_RUNLOCK(frb);       /* reader */
		goto done;
	}

	/*
	 * Flow route doesn't exist; become a writer and prepare to
	 * allocate one.  We could be racing with other threads here,
	 * so check first if there is now a cached flow route that
	 * got created by the winning thread.
	 */
	if (!FRB_RLOCKTOWLOCK(frb)) {
		FRB_WLOCK(frb);
	}

	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* lost the race; another thread created it — reuse theirs */
		if (__improbable(fr->fr_want_configure) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [W]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s on %s [W,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_WUNLOCK(frb);       /* writer */
		goto done;
	}

	/*
	 * allocate one
	 * NOTE(review): fr_alloc(TRUE) sleeps; a NULL return is not
	 * handled here — presumably SKMEM_SLEEP cannot fail; confirm.
	 */
	fr = fr_alloc(TRUE);
	fr->fr_faddr = *daddr;          /* remote address */

	/* port is irrelevant for routing; key on the bare IP address */
	switch (SA(&fr->fr_faddr)->sa_family) {
	case AF_INET:
		SIN(&fr->fr_faddr)->sin_port = 0;
		fr->fr_addr_len = sizeof(struct in_addr);
		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
		break;

	case AF_INET6:
		SIN6(&fr->fr_faddr)->sin6_port = 0;
		fr->fr_addr_len = sizeof(struct in6_addr);
		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(!uuid_is_null(fr->fr_uuid));
	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;

	/* force configure newly-created flow route */
	os_atomic_inc(&fr->fr_want_configure, relaxed);

	FR_LOCK(fr);
	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
		SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		FR_UNLOCK(fr);
		FRB_WUNLOCK(frb);       /* writer */
		/* not yet in tree, so free immediately */
		fr_free(fr);
		fr = NULL;
		goto done;
	}

	/* execute nexus-specific constructor */
	fr_ctor(arg, fr);
	FR_UNLOCK(fr);

	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
	FRIB_WLOCK(frib);

	/* const-cast: back-pointers are set exactly once, here */
	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;

	FRB_WLOCK_ASSERT_HELD(frb);
	FRIB_WLOCK_ASSERT_HELD(frib);

	/* insert into both lookup trees (by address and by UUID) */
	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);

	os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if DEBUG
	/* sanity checks for comparator routines */
	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
	flow_route_release(fr);
	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
	flow_route_release(fr);
#endif /* DEBUG */

	/* for the trees */
	_CASSERT(FLOW_ROUTE_MINREF == 2);
	flow_route_retain(fr);
	flow_route_retain(fr);
	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);

	/* for the caller */
	flow_route_retain(fr);

	FRIB_WUNLOCK(frib);     /* writer */
	FRB_WUNLOCK(frb);       /* writer */

	/* execute nexus-specific resolver; EJUSTRETURN means "pending" */
	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
	    (err = fr_resolve(arg, fr, NULL)) != 0) {
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		}
		if (err == EJUSTRETURN) {
			err = 0;
		} else {
			goto done;
		}
	}
	ASSERT(err == 0);

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
		    ifp->if_xname);
	} else {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

done:
	if (err == 0) {
		ASSERT(fr != NULL);
		*frp = fr;
	} else if (fr != NULL) {
		/* can't directly call fr_free() if it's in the tree */
		flow_route_release(fr);
		fr = NULL;
	}

	return err;
}
793 
794 void
flow_route_retain(struct flow_route * fr)795 flow_route_retain(struct flow_route *fr)
796 {
797 	lck_spin_lock(&fr->fr_reflock);
798 	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
799 		fr->fr_expire = 0;
800 	}
801 	lck_spin_unlock(&fr->fr_reflock);
802 }
803 
/*
 * Drop a reference on a flow route.  While the route is still
 * attached to the lookup trees, dropping to the minimum refcount
 * (the two tree references) arms the idle-expiry timestamp consumed
 * by the pruner.  Once detached, the final reference frees the
 * object (outside the spinlock).
 */
void
flow_route_release(struct flow_route *fr)
{
	bool should_free = false;

	lck_spin_lock(&fr->fr_reflock);
	VERIFY(fr->fr_usecnt > 0);
	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		/* last active user gone; start the expiration clock */
		if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
			fr->fr_expire = _net_uptime + flow_route_expire;
		}
	} else {
		/*
		 * fr is no longer in lookup tree, so there shouldn't be
		 * further usecnt, if we reach 0 usecnt, then this is the very
		 * last reference and is safe to unlock and call fr_free.
		 */
		if (--(fr->fr_usecnt) == 0) {
			should_free = true;
		}
	}
	lck_spin_unlock(&fr->fr_reflock);

	if (should_free) {
		fr_free(fr);
	}
}
831 
/*
 * Walk an address bucket (caller holds its writer lock) and detach
 * eligible flow routes from both lookup trees, dropping the two tree
 * references on each.  With "all" set every route goes (caller must
 * hold writer locks on all buckets, per flow_route_bucket_purge_all());
 * otherwise only idle routes (at minimum refcount) that have expired,
 * been deleted, or are being early-expired are purged.  Returns the
 * number purged; *resid (if non-NULL) gets the number left behind.
 */
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
    boolean_t all, boolean_t early_expire)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */
	struct flow_route *fr, *tfr;
	uint64_t now = net_uptime();
	uint32_t i = 0, tot = 0;

	FRB_WLOCK_ASSERT_HELD(frb);

	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
		struct flow_route_id_bucket *frib =
		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);

		++tot;
		/*
		 * We're not holding fr_lock here, since this is a
		 * best-effort check.  If there's a race and we miss
		 * it now, we'll come back again shortly.
		 */
		lck_spin_lock(&fr->fr_reflock);
		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !early_expire &&
		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
			    "refcnt %u expire %llu", SK_KVA(fr),
			    fr->fr_usecnt, fr->fr_expire);
			continue;
		}
		lck_spin_unlock(&fr->fr_reflock);

		/*
		 * If "all" is set, flow entries must be gone by now, as
		 * we must be called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
		 * must be at its minimum value now.
		 */
		if (!all) {
			FRIB_WLOCK(frib);
		}
		FRIB_WLOCK_ASSERT_HELD(frib);

		_CASSERT(FLOW_ROUTE_MINREF == 2);
		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);

		/* detach from both trees before dropping the refs */
		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);

		os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if SK_LOG
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
			    SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
			    (int64_t)(fr->fr_expire - now));
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    (int64_t)(fr->fr_expire - now));
		}
#endif /* SK_LOG */

		/* for the trees */
		flow_route_release(fr);
		flow_route_release(fr);
		++i;

		if (!all) {
			FRIB_WUNLOCK(frib);
		}
	}

	if (resid != NULL) {
		*resid = (tot - i);
	}

	return i;
}
923 
924 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)925 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
926 {
927 	(void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
928 }
929 
/*
 * Prune idle or expired flow routes from one bucket on behalf of the
 * given interface.  Returns the number of routes actually purged; if
 * "resid" is non-NULL it is set to the number of routes remaining.
 * If the interface is down, expiration times are disregarded so that
 * otherwise-unreferenced routes become prunable immediately.
 */
static uint32_t
flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
    uint32_t *resid)
{
	uint64_t now = net_uptime();
	struct flow_route *fr;
	uint32_t i = 0, tot = 0;        /* i: prunable count, tot: total seen */
	boolean_t ifdown = !(ifp->if_flags & IFF_UP);

	/* first pass under the reader lock: just count candidates */
	FRB_RLOCK(frb);
	RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
		++tot;
		/* loose check; do this without holding fr_reflock */
		if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !ifdown &&
		    !(fr->fr_flags & FLOWRTF_DELETED))) {
			continue;
		}
		++i;
	}

	/*
	 * If there's nothing to prune or there's a writer, we're done.
	 * Note that if we failed to upgrade to writer, the lock would
	 * have been released automatically.
	 */
	if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
		if (i == 0) {
			/* upgrade not attempted; reader lock still held */
			FRB_RUNLOCK(frb);
		}
		if (resid != NULL) {
			*resid = (tot - i);
		}
		return 0;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
	    i, ifp->if_xname);

	/* purge idle ones; now holding the writer lock from the upgrade */
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
	FRB_WUNLOCK(frb);

	return i;
}
975 
976 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)977 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
978     uint32_t *tot_resid)
979 {
980 	uint32_t pruned = 0;
981 	uint32_t resid;
982 	uint32_t i;
983 
984 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
985 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
986 		pruned += flow_route_bucket_prune(frb, ifp, &resid);
987 		if (tot_resid != NULL) {
988 			*tot_resid += resid;
989 		}
990 	}
991 
992 	return pruned;
993 }
994 
995 /*
996  * This runs in the context of eventhandler invocation routine which loops
997  * through all the registered callbacks.  Care must be taken to not call
998  * any primitives here that would lead to routing changes in the same context
999  * as it would lead to deadlock in eventhandler code.
1000  */
1001 static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,struct sockaddr * dst,int route_ev,struct sockaddr * gw_addr,int flags)1002 flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
1003     struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags)
1004 {
1005 #pragma unused(dst, flags)
1006 #if SK_LOG
1007 	char dst_s[MAX_IPv6_STR_LEN];
1008 #endif /* SK_LOG */
1009 	struct flow_route_id_bucket *frib = NULL;
1010 	struct flow_route *fr = NULL;
1011 	struct flow_mgr *fm;
1012 
1013 	VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
1014 	VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
1015 
1016 	/*
1017 	 * Upon success, callee will hold flow manager lock as reader,
1018 	 * and we'll need to unlock it below.  Otherwise there's no
1019 	 * need to unlock here and just return.
1020 	 */
1021 	fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
1022 	if (fm == NULL) {
1023 		SK_ERR("Event %s for dst %s ignored; flow manager not found",
1024 		    route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
1025 		    sizeof(dst_s)));
1026 		return;
1027 	}
1028 
1029 	SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
1030 	    sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
1031 
1032 	do {
1033 		frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
1034 
1035 		FRIB_RLOCK(frib);
1036 		/* callee returns a reference that we need to release below */
1037 		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1038 		if (fr == NULL) {
1039 			SK_ERR("%s: dst %s flow route not found", fm->fm_name,
1040 			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1041 			break;
1042 		}
1043 
1044 		/*
1045 		 * Grab fr_lock to prevent flow route configuration or
1046 		 * resolver from using stale info while we are updating.
1047 		 */
1048 		FR_LOCK(fr);
1049 
1050 		switch (route_ev) {
1051 		case ROUTE_ENTRY_REFRESH:
1052 			/*
1053 			 * This is the case where the route entry has been
1054 			 * updated (for example through RTM_CHANGE).  Some
1055 			 * of it may not warrant a lookup again and some of
1056 			 * it may.  For now, mark flow to perform a look-up
1057 			 * again as the gateway may have changed.
1058 			 */
1059 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1060 			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1061 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
1062 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1063 			    sizeof(dst_s)));
1064 			break;
1065 
1066 		case ROUTE_ENTRY_DELETED:
1067 			/*
1068 			 * NOTE: flow_route_cleanup() should not be called
1069 			 * to de-register eventhandler in the context of
1070 			 * eventhandler callback to avoid deadlock in
1071 			 * eventhandler code.  Instead, just mark the flow
1072 			 * route un-resolved.  When it is being used again
1073 			 * or being deleted the old eventhandler must be
1074 			 * de-registered.
1075 			 */
1076 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1077 			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1078 			os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
1079 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
1080 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1081 			    sizeof(dst_s)));
1082 			break;
1083 
1084 		case ROUTE_LLENTRY_STALE:
1085 			/*
1086 			 * When the route entry is deemed unreliable or old
1087 			 * enough to trigger a route lookup again.  Don't
1088 			 * reconfigure the flow route, but simply attempt
1089 			 * to resolve it next time to trigger a probe.
1090 			 */
1091 			os_atomic_inc(&fr->fr_want_probe, relaxed);
1092 			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1093 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
1094 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1095 			    sizeof(dst_s)));
1096 			break;
1097 
1098 		case ROUTE_LLENTRY_CHANGED:
1099 			/*
1100 			 * When the link-layer info has changed; replace
1101 			 * cached llinfo in the flow route (treat this
1102 			 * as ROUTE_LLENTRY_RESOLVED).
1103 			 */
1104 			OS_FALLTHROUGH;
1105 
1106 		case ROUTE_LLENTRY_RESOLVED:
1107 			/*
1108 			 * SDL address length may be 0 for cellular.
1109 			 * If Ethernet, copy into flow route and mark
1110 			 * it as cached.  In all cases, mark the flow
1111 			 * route as resolved.
1112 			 */
1113 			ASSERT(SDL(gw_addr)->sdl_family == AF_LINK);
1114 			if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) {
1115 				FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr)));
1116 				SK_DF(SK_VERB_FLOW_ROUTE,
1117 				    "%s: dst %s llentry %s", fm->fm_name,
1118 				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
1119 				    (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
1120 				    "resolved" : "changed"));
1121 				os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1122 			} else {
1123 				os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1124 			}
1125 			os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1126 #if SK_LOG
1127 			if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
1128 			    0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
1129 				SK_DF(SK_VERB_FLOW_ROUTE,
1130 				    "%s: fr 0x%llx eth_type 0x%x "
1131 				    "eth_src %x:%x:%x:%x:%x:%x "
1132 				    "eth_dst %x:%x:%x:%x:%x:%x [%s])",
1133 				    fm->fm_name, SK_KVA(fr),
1134 				    ntohs(fr->fr_eth.ether_type),
1135 				    fr->fr_eth.ether_shost[0],
1136 				    fr->fr_eth.ether_shost[1],
1137 				    fr->fr_eth.ether_shost[2],
1138 				    fr->fr_eth.ether_shost[3],
1139 				    fr->fr_eth.ether_shost[4],
1140 				    fr->fr_eth.ether_shost[5],
1141 				    fr->fr_eth.ether_dhost[0],
1142 				    fr->fr_eth.ether_dhost[1],
1143 				    fr->fr_eth.ether_dhost[2],
1144 				    fr->fr_eth.ether_dhost[3],
1145 				    fr->fr_eth.ether_dhost[4],
1146 				    fr->fr_eth.ether_dhost[5],
1147 				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1148 			}
1149 #endif /* SK_LOG */
1150 			break;
1151 
1152 		case ROUTE_LLENTRY_DELETED:
1153 			/*
1154 			 * If the route entry points to a router and an
1155 			 * RTM_DELETE has been issued on it; force the
1156 			 * flow route to be reconfigured.
1157 			 */
1158 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1159 			os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed);
1160 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
1161 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1162 			    sizeof(dst_s)));
1163 			break;
1164 
1165 		case ROUTE_LLENTRY_PROBED:
1166 			/*
1167 			 * When the resolver has begun probing the target;
1168 			 * nothing to do here.
1169 			 */
1170 			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
1171 			    fm->fm_name, sk_sa_ntop(dst, dst_s,
1172 			    sizeof(dst_s)));
1173 			break;
1174 
1175 		case ROUTE_LLENTRY_UNREACH:
1176 			/*
1177 			 * When the route entry is marked with RTF_REJECT
1178 			 * or the probes have timed out, reconfigure.
1179 			 */
1180 			os_atomic_inc(&fr->fr_want_configure, relaxed);
1181 			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1182 			SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
1183 			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1184 			break;
1185 
1186 		default:
1187 			break;
1188 		}
1189 	} while (0);
1190 
1191 	if (fr != NULL) {
1192 		flow_route_release(fr);
1193 		FR_UNLOCK(fr);
1194 	}
1195 
1196 	if (frib != NULL) {
1197 		FRIB_UNLOCK(frib);
1198 	}
1199 
1200 	if (fm != NULL) {
1201 		flow_mgr_unlock();
1202 	}
1203 }
1204 
/*
 * Select the local (source) address for a flow toward "dst" over "ifp",
 * writing the result into "src" and snapshotting the interface's IP
 * address generation count into "ipaddr_gencnt".  For IPv4 the address
 * comes straight from the route's ifa; for IPv6 the standard source
 * selection is used (optionally preferring temporary addresses).
 * Returns 0 on success or an errno-style value on failure.
 */
int
flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
    struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
    int use_stable_address)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	sa_family_t af = SA(dst)->sa_family;
	struct ifnet *src_ifp = NULL;
	struct ifaddr *ifa = NULL;
	int err = 0;

	/* see comments in flow_route_configure() regarding loopback */
	ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);

	switch (af) {
	case AF_INET: {
		ifnet_lock_shared(ifp);
		/* refuse an ifa that is on its way out */
		if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("route to %s has src address marked detaching "
			    "(err %d)", inet_ntop(AF_INET,
			    &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
			ifnet_lock_done(ifp);
			break;
		}
		SIN(src)->sin_len = sizeof(struct sockaddr_in);
		SIN(src)->sin_family = AF_INET;
		SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
		ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
		/* gencnt read under the ifnet lock, same as the address */
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	case AF_INET6: {
		struct in6_addr src_storage, *in6;
		struct route_in6 ro = {};
		uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
		ro.ro_rt = rt;

		/*
		 * in6_selectsrc_core() returns references in src_ifp and
		 * ifa on success; both are dropped at the bottom.
		 */
		if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
		    ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro)) == NULL) {
			if (err == 0) {
				err = EADDRNOTAVAIL;
			}
			VERIFY(src_ifp == NULL);
			SK_ERR("src address to dst %s on %s not available "
			    "(err %d)", inet_ntop(AF_INET6,
			    &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
			    ifp->if_xname, err);
			break;
		}

		VERIFY(src_ifp != NULL);
		VERIFY(ifa != NULL);

		/* selected source must live on the flow's interface */
		if (__improbable(src_ifp != ifp)) {
			if (err == 0) {
				err = ENETUNREACH;
			}
			SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
			    inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
			    dst_s, sizeof(dst_s)),
			    inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
			    src_s, sizeof(src_s)),
			    src_ifp->if_xname, ifp->if_xname, err);
			break;
		}

		ifnet_lock_shared(ifp);
		if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("IPv6 address selected is marked to be "
			    "detached (err %d)", err);
			ifnet_lock_done(ifp);
			break;
		}

		/* clear embedded scope if link-local src */
		if (IN6_IS_SCOPE_EMBED(in6)) {
			if (in6_embedded_scope) {
				SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
				in6->s6_addr16[1] = 0;
			} else {
				SIN6(src)->sin6_scope_id = src_ifp->if_index;
			}
		}
		SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(src)->sin6_family = AF_INET6;
		SIN6(src)->sin6_addr = *in6;
		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* drop references returned by in6_selectsrc_core(), if any */
	if (ifa != NULL) {
		IFA_REMREF(ifa);
	}

	if (src_ifp != NULL) {
		ifnet_release(src_ifp);
	}

#if SK_LOG
	if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
		SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
		    sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

	return err;
}
1329 
/*
 * Detach a flow route from the routing layer: deregister its route
 * eventhandler (asynchronously, via the network work-queue) and drop
 * the cached gateway route reference.  Clears the GATEWAY/ONLINK
 * flags so the route must be reconfigured before reuse.  Caller must
 * hold fr_lock.
 */
void
flow_route_cleanup(struct flow_route *fr)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* dst */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */

	FR_LOCK_ASSERT_HELD(fr);

	if (fr->fr_rt_evhdlr_tag != NULL) {
		ASSERT(fr->fr_rt_dst != NULL);
		/* enqueue deregistration; must not run it inline here */
		route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
		    ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
		fr->fr_rt_evhdlr_tag = NULL;
		fr->fr_rt_dst = NULL;
	}
	ASSERT(fr->fr_rt_dst == NULL);
	if (fr->fr_rt_gw != NULL) {
		rtfree(fr->fr_rt_gw);
		fr->fr_rt_gw = NULL;
	}

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
	} else if (fr->fr_flags & FLOWRTF_ONLINK) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
	}
#endif /* SK_LOG */

	os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed);
}
1371 
/*
 * Verify that the given local IP address ("src_ip0", version "ip_v")
 * is still a valid, usable address on interface "ifp".  On success,
 * snapshots the interface's IP address generation count into "gencnt"
 * and returns TRUE; otherwise returns FALSE.
 */
static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
    struct ifnet *ifp, uint32_t *gencnt)
{
	boolean_t address_found = TRUE;
	struct ifaddr *ifa = NULL;
	struct flow_ip_addr src_ip = {};        /* local copy; may be scoped */
	uint32_t scope = ifp->if_index;

	VERIFY(gencnt != NULL);
	VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);

	/* scoped lookups return a reference in ifa; dropped at "done" */
	if (ip_v == IPVERSION) {
		memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));

		ifa = (struct ifaddr *)ifa_foraddr_scoped(
			src_ip._v4.s_addr, scope);
	} else {
		memcpy(&src_ip, src_ip0, sizeof(*src_ip0));

		/* embed the scope for link-local before the lookup */
		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
			src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
		}
		ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
		    scope);
	}

	if (__improbable(ifa == NULL)) {
		address_found = FALSE;
		goto done;
	}

	ifnet_lock_shared(ifp);
	if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
		address_found = FALSE;
		ifnet_lock_done(ifp);
		goto done;
	}

	if (ip_v == IPV6_VERSION) {
		struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;

		/*
		 * Fail if IPv6 address is not ready or if the address
		 * is reserved for CLAT46.
		 */
		if (__improbable(ia6->ia6_flags &
		    (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
			address_found = FALSE;
			ifnet_lock_done(ifp);
			goto done;
		}
	} else {
		/*
		 * If interface has CLAT46 enabled, fail IPv4 bind.
		 * Since this implies network is NAT64/DNS64, Internet
		 * effectively becomes reachable over IPv6.  If on
		 * system IPv4 to IPv6 translation is required, that
		 * should be handled solely through bump in the API.
		 * The in kernel translation is only done for apps
		 * directly using low level networking APIs.
		 */
		if (__improbable(IS_INTF_CLAT46(ifp))) {
			address_found = FALSE;
			ifnet_lock_done(ifp);
			goto done;
		}
	}

	/* gencnt read under the ifnet lock, same as the checks above */
	*gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
	ifnet_lock_done(ifp);
done:
	if (ifa != NULL) {
		IFA_REMREF(ifa);
	}

	return address_found;
}
1450 
1451 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1452 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1453     uint32_t *gencnt)
1454 {
1455 	VERIFY(saddr->sa.sa_family == AF_INET ||
1456 	    saddr->sa.sa_family == AF_INET6);
1457 
1458 	struct flow_ip_addr *ipa;
1459 	uint8_t ipv;
1460 	if (saddr->sa.sa_family == AF_INET) {
1461 		ipv = IPVERSION;
1462 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1463 	} else {
1464 		ipv = IPV6_VERSION;
1465 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1466 	}
1467 
1468 	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1469 }
1470 
1471 boolean_t
flow_route_key_validate(struct flow_key * fk,struct ifnet * ifp,uint32_t * gencnt)1472 flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1473     uint32_t *gencnt)
1474 {
1475 	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
1476 	           gencnt);
1477 }
1478