xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/flow/flow_route.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Flow Routes.
31  *
32  * Each (non-listener) flow entry is always associated with a flow route
33  * object.  Multiple flow entries sharing the same remote address will use
34  * the same flow route for that address.  The flow route object contains
35  * the route information for the remote node.  It gets allocated when a
36  * flow entry requests to connect, and is garbage-collected when it's no
37  * longer referred to after its expiration time has passed.
38  *
39  * A flow route also contains the default local address that's used to
40  * reach the remote node.  This may not necessarily be the same local
41  * address used by the flow entry, if it has explicitly bound the entry
42  * to another local address.  But for the majority of cases, having the
43  * local address be present in the flow route allows us to avoid doing
44  * source address selection each time a connect request happens.
45  *
46  * When the remote node is reachable via a gateway, the gateway address
47  * portion of the flow route contains its IP address and the flow route
48  * is marked with FLOWRTF_GATEWAY.  We use this to optimize the gateway
49  * route lookup, since otherwise we'd have to perform an extra lookup
50  * each time we need to resolve the route.
51  *
52  * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53  * is set, and the gateway address isn't used.  The target address used
54  * for resolution will be the remote address itself.
55  *
56  * On links with link-layer information, we store the resolved address
57  * of the target node (which may be the gateway's) in the flow route,
58  * and mark the flow route with FLOWRTF_HAS_LLINFO.
59  *
60  * Each flow route also registers itself to receive route events when
61  * the underlying rtentry is updated or deleted.
62  */
63 
64 #include <skywalk/os_skywalk_private.h>
65 
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75 
76 #include <kern/uipc_domain.h>
77 
78 extern struct rtstat_64 rtstat;
79 
80 static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
81 static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
82 
83 static int fr_cmp(const struct flow_route *, const struct flow_route *);
84 static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
85 static struct flow_route *fr_alloc(boolean_t);
86 static void fr_free(struct flow_route *);
87 static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
88     uint32_t *, boolean_t, boolean_t);
89 static void flow_route_ev_callback(struct eventhandler_entry_arg,
90     struct sockaddr *, int, struct sockaddr *, int);
91 
92 RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
93 RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
94 
95 KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT);
96 KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT);
97 
98 #define FR_ZONE_NAME    "flow.route"
99 
100 static unsigned int flow_route_size;            /* size of flow_route */
101 struct skmem_cache *flow_route_cache;           /* cache for flow_route */
102 
103 static int __flow_route_inited = 0;
104 
105 #define FLOW_ROUTE_EXPIRE       600     /* seconds */
106 static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
107 
108 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
109     CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
110 
111 void
flow_route_init(void)112 flow_route_init(void)
113 {
114 	ASSERT(!__flow_route_inited);
115 
116 	flow_route_size = sizeof(struct flow_route);
117 	flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
118 	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
119 
120 	__flow_route_inited = 1;
121 }
122 
123 void
flow_route_fini(void)124 flow_route_fini(void)
125 {
126 	if (__flow_route_inited) {
127 		skmem_cache_destroy(flow_route_cache);
128 		flow_route_cache = NULL;
129 
130 		__flow_route_inited = 0;
131 	}
132 }
133 
134 struct flow_route_bucket *
135 __sized_by(*tot_sz)
flow_route_buckets_alloc(size_t frb_cnt,size_t * frb_sz,size_t * tot_sz)136 flow_route_buckets_alloc(size_t frb_cnt, size_t * frb_sz, size_t * tot_sz){
137 	uint32_t cache_sz = skmem_cpu_cache_line_size();
138 	struct flow_route_bucket *frb;
139 	size_t frb_tot_sz;
140 
141 	/* each bucket is CPU cache-aligned */
142 	*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
143 	*tot_sz = frb_tot_sz = frb_cnt * (*frb_sz);
144 	frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK,
145 	    skmem_tag_fsw_frb_hash);
146 	if (__improbable(frb == NULL)) {
147 		return NULL;
148 	}
149 
150 #if !KASAN_CLASSIC
151 	/*
152 	 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
153 	 * size alignment if the requested size is a multiple of a cacheline
154 	 * size (this is true for any size that is a power of two from 16 to
155 	 * PAGE_SIZE).
156 	 *
157 	 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
158 	 * not respect this.
159 	 */
160 	ASSERT(IS_P2ALIGNED(frb, cache_sz));
161 #endif
162 
163 	SK_DF(SK_VERB_MEM, "frb %p frb_cnt %zu frb_sz %zu "
164 	    "(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt,
165 	    *frb_sz, frb_tot_sz);
166 
167 	return frb;
168 }
169 
170 void
flow_route_buckets_free(struct flow_route_bucket * frb,size_t tot_sz)171 flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
172 {
173 	SK_DF(SK_VERB_MEM, "frb %p FREE", SK_KVA(frb));
174 	sk_free_type_hash(KT_SK_FRB, tot_sz, frb);
175 }
176 
177 void
flow_route_bucket_init(struct flow_route_bucket * frb)178 flow_route_bucket_init(struct flow_route_bucket *frb)
179 {
180 #if !KASAN_CLASSIC
181 	ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
182 #endif /* !KASAN_CLASSIC */
183 	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
184 	    &flow_route_lock_attr);
185 	RB_INIT(&frb->frb_head);
186 }
187 
188 void
flow_route_bucket_destroy(struct flow_route_bucket * frb)189 flow_route_bucket_destroy(struct flow_route_bucket *frb)
190 {
191 	ASSERT(RB_EMPTY(&frb->frb_head));
192 	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
193 }
194 
195 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)196 flow_route_find_by_addr(struct flow_route_bucket *frb,
197     union sockaddr_in_4_6 *dst)
198 {
199 	struct flow_route *fr;
200 	struct flow_route find;
201 
202 	FRB_LOCK_ASSERT_HELD(frb);
203 
204 	switch (SA(dst)->sa_family) {
205 	case AF_INET:
206 		find.fr_af = AF_INET;
207 		find.fr_addr_len = sizeof(struct in_addr);
208 		find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
209 		break;
210 
211 	case AF_INET6:
212 		find.fr_af = AF_INET6;
213 		find.fr_addr_len = sizeof(struct in6_addr);
214 		find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
215 		break;
216 
217 	default:
218 		VERIFY(0);
219 		/* NOTREACHED */
220 		__builtin_unreachable();
221 	}
222 
223 	fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
224 	if (fr != NULL) {
225 		flow_route_retain(fr);  /* for the caller */
226 	}
227 	return fr;
228 }
229 
230 struct flow_route_id_bucket *
231 __sized_by(*tot_sz)
flow_route_id_buckets_alloc(size_t frib_cnt,size_t * frib_sz,size_t * tot_sz)232 flow_route_id_buckets_alloc(size_t frib_cnt, size_t * frib_sz, size_t * tot_sz){
233 	uint32_t cache_sz = skmem_cpu_cache_line_size();
234 	struct flow_route_id_bucket *frib;
235 	size_t frib_tot_sz;
236 
237 	/* each bucket is CPU cache-aligned */
238 	*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
239 	*tot_sz = frib_tot_sz = frib_cnt * (*frib_sz);
240 	frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK,
241 	    skmem_tag_fsw_frib_hash);
242 	/* END IGNORE CODESTYLE */
243 	if (__improbable(frib == NULL)) {
244 		return NULL;
245 	}
246 
247 #if !KASAN_CLASSIC
248 	/*
249 	 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
250 	 * size alignment if the requested size is a multiple of a cacheline
251 	 * size (this is true for any size that is a power of two from 16 to
252 	 * PAGE_SIZE).
253 	 *
254 	 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
255 	 * not respect this.
256 	 */
257 	ASSERT(IS_P2ALIGNED(frib, cache_sz));
258 #endif /* !KASAN_CLASSIC */
259 
260 	SK_DF(SK_VERB_MEM, "frib %p frib_cnt %zu frib_sz %zu "
261 	    "(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt,
262 	    *frib_sz, frib_tot_sz);
263 
264 	return frib;
265 }
266 
267 void
flow_route_id_buckets_free(struct flow_route_id_bucket * frib,size_t tot_sz)268 flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
269 {
270 	SK_DF(SK_VERB_MEM, "frib %p FREE", SK_KVA(frib));
271 	sk_free_type_hash(KT_SK_FRIB, tot_sz, frib);
272 }
273 
274 void
flow_route_id_bucket_init(struct flow_route_id_bucket * frib)275 flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
276 {
277 #if !KASAN_CLASSIC
278 	ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
279 #endif
280 	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
281 	    &flow_route_lock_attr);
282 	RB_INIT(&frib->frib_head);
283 }
284 
285 void
flow_route_id_bucket_destroy(struct flow_route_id_bucket * frib)286 flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
287 {
288 	ASSERT(RB_EMPTY(&frib->frib_head));
289 	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
290 }
291 
292 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)293 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
294 {
295 	struct flow_route *fr;
296 	struct flow_route find;
297 
298 	FRIB_LOCK_ASSERT_HELD(frib);
299 
300 	uuid_copy(find.fr_uuid, id);
301 	fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
302 	if (fr != NULL) {
303 		flow_route_retain(fr);  /* for the caller */
304 	}
305 	return fr;
306 }
307 
308 static struct flow_route *
fr_alloc(boolean_t cansleep)309 fr_alloc(boolean_t cansleep)
310 {
311 	struct flow_route *fr;
312 
313 	fr = skmem_cache_alloc(flow_route_cache,
314 	    (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP));
315 	if (fr == NULL) {
316 		return NULL;
317 	}
318 	bzero(fr, flow_route_size);
319 	lck_spin_init(&fr->fr_reflock, &flow_route_lock_group, &flow_route_lock_attr);
320 	lck_mtx_init(&fr->fr_lock, &flow_route_lock_group, &flow_route_lock_attr);
321 	uuid_generate_random(fr->fr_uuid);
322 
323 	SK_DF(SK_VERB_MEM, "allocated fr %p", SK_KVA(fr));
324 	return fr;
325 }
326 
327 static void
fr_free(struct flow_route * fr)328 fr_free(struct flow_route *fr)
329 {
330 	SK_DF(SK_VERB_MEM, "freeing fr %p", SK_KVA(fr));
331 
332 	VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
333 	VERIFY(fr->fr_usecnt == 0);
334 
335 	FR_LOCK(fr);
336 	/* callee frees route entry */
337 	flow_route_cleanup(fr);
338 	VERIFY(fr->fr_rt_dst == NULL);
339 	VERIFY(fr->fr_rt_gw == NULL);
340 	VERIFY(fr->fr_rt_evhdlr_tag == NULL);
341 	FR_UNLOCK(fr);
342 
343 	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
344 	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
345 
346 	skmem_cache_free(flow_route_cache, fr);
347 }
348 
349 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)350 fr_cmp(const struct flow_route *a, const struct flow_route *b)
351 {
352 	int d;
353 
354 	if ((d = (a->fr_af - b->fr_af)) != 0) {
355 		return d;
356 	}
357 	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
358 	    b->fr_addr_len)) != 0) {
359 		return d;
360 	}
361 
362 	return 0;
363 }
364 
365 static inline int
fr_id_cmp(const struct flow_route * a,const struct flow_route * b)366 fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
367 {
368 	return uuid_compare(a->fr_uuid, b->fr_uuid);
369 }
370 
371 static inline int
fr_use_stable_address(struct nx_flow_req * req)372 fr_use_stable_address(struct nx_flow_req *req)
373 {
374 	int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
375 	if (req != NULL &&
376 	    (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
377 		use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
378 	}
379 	return use_stable_address;
380 }
381 
/*
 * (Re)configure a flow route: (re)look up the route to the remote
 * address, register for route events on it, capture the gateway
 * address when the destination is off-link, and (re)select the
 * default local address when needed.
 *
 * Caller must hold fr_lock (asserted below), which single-threads
 * all configuration of this flow route.  Returns 0 on success or an
 * errno (e.g. EHOSTUNREACH) on failure; on failure all route state
 * accumulated here is released via flow_route_cleanup().
 */
int
flow_route_configure(struct flow_route *fr, struct ifnet *ifp,
    struct nx_flow_req *req)
{
#if SK_LOG
	char old_s[MAX_IPv6_STR_LEN];   /* previous src */
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	struct rtentry *rt = NULL, *__single gwrt = NULL;
	int err = 0;

	FR_LOCK_ASSERT_HELD(fr);

	/*
	 * If there is a route entry for the final destination, see if
	 * it's no longer valid and perform another routing table lookup.
	 * A non-NULL fr_rt_dst is always associated with a route event
	 * registration, and the route reference is held there.
	 */
	rt = fr->fr_rt_dst;
	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
		struct eventhandler_entry_arg ee_arg;

		/* callee frees route entry */
		flow_route_cleanup(fr);

		/* lookup destination route, scoped to the interface */
		ASSERT(err == 0);
		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
		if (rt == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("no route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			/*
			 * If route points to another interface and the
			 * route's gateway isn't link-layer, reject it.
			 * We make an exception otherwise, since local
			 * interface addresses resolve this way.
			 */
			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
			    (rt->rt_gateway == NULL ||
			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
				err = EHOSTUNREACH;
				SK_ERR("route to %s on %s != %s (err %d)",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), rt->rt_ifp->if_xname,
				    ifp->if_xname, err);
			}
		}

		if (err != 0) {
			goto done;
		}

		ASSERT(fr->fr_mgr != NULL);
		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
		ASSERT(!uuid_is_null(fr->fr_uuid));
		ASSERT(!uuid_is_null(fr->fr_nx_uuid));

		/* identify this flow route to the route-event callback */
		bzero(&ee_arg, sizeof(ee_arg));
		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);

		/*
		 * Register for changes on destination route; this covers both
		 * cases where the destination is on-link, or if it is off-link
		 * and is using a gateway route.  This also transfers the refcnt
		 * of the route entry to the event handler, released later when
		 * it is deregistered.
		 */
		ASSERT(fr->fr_rt_dst == NULL);
		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
		fr->fr_rt_dst = rt;             /* move reference to fr */
		fr->fr_rt_evhdlr_tag =
		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
		    &flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed);

		/*
		 * Lookup gateway route (if any); returns locked gwrt
		 * with a reference bumped up.
		 */
		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
		if (err != 0) {
			/*
			 * Reference held by fr_rt_dst will be taken
			 * care of by flow_route_cleanup() below, so
			 * make sure we don't do an extra rtfree().
			 */
			rt = NULL;
			ASSERT(gwrt == NULL);
			SK_ERR("no gw route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* if RTF_GATEWAY isn't set, gwrt == rt */
		ASSERT(gwrt != NULL);
		RT_LOCK_ASSERT_HELD(gwrt);

		/*
		 * Must have been cleared via cleanup, and that we're
		 * single-threaded here for fr by virtue of fr_lock.
		 */
		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));

		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
		    (rt->rt_gateway->sa_family == AF_INET ||
		    rt->rt_gateway->sa_family == AF_INET6)) {
			struct sockaddr_storage ss;

			ASSERT(fr->fr_rt_gw == NULL);
			/* locked via route_to_gwroute() above */
			fr->fr_rt_gw = gwrt;    /* move reference to fr */
			RT_ADDREF_LOCKED(gwrt); /* for this routine */
			/*
			 * Destination is off-link and is reachable
			 * thru an IP gateway route.  Save the IP
			 * address of the gateway in fr_gaddr.
			 */
			(void) sa_copy(rt->rt_gateway, &ss, NULL);
			static_assert(sizeof(fr->fr_gaddr) <= sizeof(ss));
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
			os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed);
		} else if (IS_DIRECT_HOSTROUTE(rt)) {
			/*
			 * Destination is on-link.
			 */
			os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed);
		}
		RT_UNLOCK(gwrt);
	}
	RT_ADDREF(rt);          /* for this routine */

	/* see if we need to re-select default source address */
	int use_stable_address = fr_use_stable_address(req);
	if (fr->fr_want_configure ||
	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
		union sockaddr_in_4_6 old = fr->fr_laddr;
		if (use_stable_address) {
			os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		} else {
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		}
		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
			SK_ERR("no usable src address to reach %s on %s "
			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}
		/* log source address changes for debugging */
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
			SK_ERR("src address is now %s (was %s) to reach %s "
			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
			    dst_s, sizeof(dst_s)), ifp->if_xname);
		}
	}
	ASSERT(err == 0);

done:
	if (__probable(err == 0)) {
		/* release pairs with acquire in readers of fr_want_configure */
		os_atomic_store(&fr->fr_want_configure, 0, release);
	} else {
		/* callee frees route entry */
		flow_route_cleanup(fr);
	}

	/* drop this routine's gateway reference (shared or separate) */
	if (gwrt != NULL) {
		ASSERT(rt != NULL);
		if (gwrt == rt) {
			RT_REMREF(gwrt);
		} else {
			rtfree(gwrt);
		}
		gwrt = NULL;
	}

	/* drop this routine's destination-route reference */
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return err;
}
573 
/*
 * Find (or create) the flow route for the remote address in @req.
 *
 * Fast path: look up the address bucket as a reader and return the
 * cached flow route, re-configuring it first if its state is stale.
 * Slow path: upgrade to writer (re-checking for a racing creator),
 * allocate and configure a new flow route, insert it into both the
 * address tree and the UUID tree, then run the nexus-specific
 * resolver.  On success *frp holds a referenced flow route; on
 * failure the reference (if any) is dropped and an errno returned.
 */
int
flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
    struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *arg, struct flow_route **frp)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
#endif /* SK_LOG */
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	struct flow_route_bucket *frb;
	struct flow_route_id_bucket *frib;
	struct flow_route *fr = NULL;
	int err = 0;

	ASSERT(fr_ctor != NULL && fr_resolve != NULL);

	ASSERT(frp != NULL);
	*frp = NULL;

	frb = flow_mgr_get_frb_by_addr(fm, daddr);

	int use_stable_address = fr_use_stable_address(req);

	/* see if there is a cached flow route (as reader) */
	FRB_RLOCK(frb);
	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* stale config, changed interface addresses, or changed
		 * stable/temporary address policy all force a reconfigure */
		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr %p error re-configuring dst %s "
				    "on %s (err %d) [R]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr %p found for dst %s " "on %s [R,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_RUNLOCK(frb);       /* reader */
		goto done;
	}

	/*
	 * Flow route doesn't exist; become a writer and prepare to
	 * allocate one.  We could be racing with other threads here,
	 * so check first if there is now a cached flow route that
	 * got created by the winning thread.
	 */
	if (!FRB_RLOCKTOWLOCK(frb)) {
		FRB_WLOCK(frb);
	}

	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* the winning thread created it; reconfigure if stale */
		if (__improbable(fr->fr_want_configure) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr %p error re-configuring dst %s "
				    "on %s (err %d) [W]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr %p found for dst %s on %s [W,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_WUNLOCK(frb);       /* writer */
		goto done;
	}

	/* allocate one (sleeping alloc; NOTE(review): no NULL check here —
	 * presumably SKMEM_SLEEP cannot fail; confirm against skmem) */
	fr = fr_alloc(TRUE);
	fr->fr_faddr = *daddr;          /* remote address */

	/* key the route on the bare address; port is not part of the key */
	switch (SA(&fr->fr_faddr)->sa_family) {
	case AF_INET:
		SIN(&fr->fr_faddr)->sin_port = 0;
		fr->fr_addr_len = sizeof(struct in_addr);
		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
		break;

	case AF_INET6:
		SIN6(&fr->fr_faddr)->sin6_port = 0;
		fr->fr_addr_len = sizeof(struct in6_addr);
		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(!uuid_is_null(fr->fr_uuid));
	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
	/* cast away const to set the immutable back-pointer once */
	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;

	/* force configure newly-created flow route */
	os_atomic_inc(&fr->fr_want_configure, relaxed);

	FR_LOCK(fr);
	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
		SK_ERR("fr %p error configuring dst %s on %s (err %d)",
		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		FR_UNLOCK(fr);
		FRB_WUNLOCK(frb);       /* writer */
		/* not yet in tree, so free immediately */
		fr_free(fr);
		fr = NULL;
		goto done;
	}

	/* execute nexus-specific constructor */
	fr_ctor(arg, fr);
	FR_UNLOCK(fr);

	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
	FRIB_WLOCK(frib);

	/* cast away const to set the immutable bucket back-pointers once */
	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;

	FRB_WLOCK_ASSERT_HELD(frb);
	FRIB_WLOCK_ASSERT_HELD(frib);

	/* publish in both lookup trees */
	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);

	os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if DEBUG
	/* sanity checks for comparator routines */
	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
	flow_route_release(fr);
	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
	flow_route_release(fr);
#endif /* DEBUG */

	/* for the trees */
	static_assert(FLOW_ROUTE_MINREF == 2);
	flow_route_retain(fr);
	flow_route_retain(fr);
	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);

	/* for the caller */
	flow_route_retain(fr);

	FRIB_WUNLOCK(frib);     /* writer */
	FRB_WUNLOCK(frb);       /* writer */

	/* execute nexus-specific resolver; EJUSTRETURN means resolution
	 * is pending and is not treated as a failure */
	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
	    (err = fr_resolve(arg, fr, NULL)) != 0) {
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_ERR("fr %p resolve %s gw %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			SK_ERR("fr %p resolve %s dst %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		}
		if (err == EJUSTRETURN) {
			err = 0;
		} else {
			goto done;
		}
	}
	ASSERT(err == 0);

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr %p %s -> %s via gw %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
		    ifp->if_xname);
	} else {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr %p %s -> %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

done:
	if (err == 0) {
		ASSERT(fr != NULL);
		*frp = fr;
	} else if (fr != NULL) {
		/* can't directly call fr_free() if it's in the tree */
		flow_route_release(fr);
		fr = NULL;
	}

	return err;
}
794 
795 void
flow_route_retain(struct flow_route * fr)796 flow_route_retain(struct flow_route *fr)
797 {
798 	lck_spin_lock(&fr->fr_reflock);
799 	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
800 		fr->fr_expire = 0;
801 	}
802 	lck_spin_unlock(&fr->fr_reflock);
803 }
804 
805 static void
__flow_route_release(struct flow_route * fr,boolean_t renew)806 __flow_route_release(struct flow_route *fr, boolean_t renew)
807 {
808 	bool should_free = false;
809 
810 	lck_spin_lock(&fr->fr_reflock);
811 	VERIFY(fr->fr_usecnt > 0);
812 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
813 		if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1) && renew) {
814 			fr->fr_expire = net_uptime() + flow_route_expire;
815 		}
816 	} else {
817 		/*
818 		 * fr is no longer in lookup tree, so there shouldn't be
819 		 * further usecnt, if we reach 0 usecnt, then this is the very
820 		 * last reference and is safe to unlock and call fr_free.
821 		 */
822 		if (--(fr->fr_usecnt) == 0) {
823 			should_free = true;
824 		}
825 	}
826 	lck_spin_unlock(&fr->fr_reflock);
827 
828 	if (should_free) {
829 		fr_free(fr);
830 	}
831 }
832 
833 void
flow_route_release(struct flow_route * fr)834 flow_route_release(struct flow_route *fr)
835 {
836 	__flow_route_release(fr, true);
837 }
838 
/*
 * Walk one address bucket and detach eligible flow routes from both
 * the address tree and the UUID tree.
 *
 * @frb		bucket to purge; caller holds its writer lock.
 * @resid	if non-NULL, receives the count of routes left behind.
 * @all		purge unconditionally (caller holds writer locks on all
 *		buckets and guarantees no flow entries remain).
 * @early_expire	treat unexpired routes as expired (e.g. interface
 *		going down).
 *
 * Returns the number of flow routes detached.  Detached routes lose
 * their two tree references; the final fr_free() happens in
 * __flow_route_release() once all other references drain.
 */
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
    boolean_t all, boolean_t early_expire)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */
	struct flow_route *fr, *tfr;
	uint64_t now = net_uptime();
	uint32_t i = 0, tot = 0;

	FRB_WLOCK_ASSERT_HELD(frb);

	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
		struct flow_route_id_bucket *frib =
		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);

		++tot;
		/*
		 * We're not holding fr_lock here, since this is a
		 * best-effort check.  If there's a race and we miss
		 * it now, we'll come back again shortly.
		 */
		lck_spin_lock(&fr->fr_reflock);
		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !early_expire &&
		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
			/* still in use, or not yet expired: keep it */
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr %p "
			    "refcnt %u expire %llu", SK_KVA(fr),
			    fr->fr_usecnt, fr->fr_expire);
			continue;
		}
		lck_spin_unlock(&fr->fr_reflock);

		/*
		 * If "all" is set, flow entries must be gone by now, as
		 * we must be called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
		 * must be at its minimum value now.
		 */
		if (!all) {
			FRIB_WLOCK(frib);
		}
		FRIB_WLOCK_ASSERT_HELD(frib);

		static_assert(FLOW_ROUTE_MINREF == 2);
		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);

		/* detach from both lookup trees */
		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);

		os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if SK_LOG
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr %p %s -> %s via gw %s [exp %lld]",
			    SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
			    (int64_t)(fr->fr_expire - now));
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr %p %s -> %s [exp %lld]", SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    (int64_t)(fr->fr_expire - now));
		}
#endif /* SK_LOG */

		/* for the trees */
		flow_route_release(fr);
		flow_route_release(fr);
		++i;

		if (!all) {
			FRIB_WUNLOCK(frib);
		}
	}

	if (resid != NULL) {
		*resid = (tot - i);
	}

	return i;
}
930 
931 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)932 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
933 {
934 	(void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
935 }
936 
/*
 * Prune idle flow routes in a single bucket.
 *
 * First pass (reader lock): count entries whose refcount is at the
 * minimum and which are expired, marked deleted, or whose interface
 * is down.  If any candidates exist, upgrade to the writer lock and
 * let flow_route_bucket_purge_common() do the actual removal.
 *
 * Returns the number of routes purged; if "resid" is non-NULL it is
 * set to the number of routes examined but not purged.
 */
static uint32_t
flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
    uint32_t *resid)
{
	uint64_t now = net_uptime();
	struct flow_route *fr;
	uint32_t i = 0, tot = 0;
	/* an interface that is down makes every route in it prunable */
	boolean_t ifdown = !(ifp->if_flags & IFF_UP);

	FRB_RLOCK(frb);
	RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
		++tot;
		/* loose check; do this without holding fr_reflock */
		if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !ifdown &&
		    !(fr->fr_flags & FLOWRTF_DELETED))) {
			continue;
		}
		++i;
	}

	/*
	 * If there's nothing to prune or there's a writer, we're done.
	 * Note that if we failed to upgrade to writer, the lock would
	 * have been released automatically.
	 */
	if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
		if (i == 0) {
			FRB_RUNLOCK(frb);
		}
		if (resid != NULL) {
			*resid = (tot - i);
		}
		return 0;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
	    i, ifp->if_xname);

	/*
	 * purge idle ones; the callee re-evaluates each entry under
	 * fr_reflock, so "i" here is only a best-effort estimate and
	 * is replaced by the actual purge count (resid is also
	 * recomputed by the callee).
	 */
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
	FRB_WUNLOCK(frb);

	return i;
}
982 
983 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)984 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
985     uint32_t *tot_resid)
986 {
987 	uint32_t pruned = 0;
988 	uint32_t resid;
989 	uint32_t i;
990 
991 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
992 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
993 		pruned += flow_route_bucket_prune(frb, ifp, &resid);
994 		if (tot_resid != NULL) {
995 			*tot_resid += resid;
996 		}
997 	}
998 
999 	return pruned;
1000 }
1001 
1002 /*
1003  * This runs in the context of eventhandler invocation routine which loops
1004  * through all the registered callbacks.  Care must be taken to not call
1005  * any primitives here that would lead to routing changes in the same context
1006  * as it would lead to deadlock in eventhandler code.
1007  */
/*
 * Route event handler for a flow route.
 *
 * Looks up the flow manager and flow route identified by the UUIDs in
 * "ee_arg", then updates the flow route's flags/counters according to
 * "route_ev".  Runs in eventhandler context, so it must never trigger
 * routing changes or eventhandler (de)registration itself (deadlock;
 * see the block comment above and ROUTE_ENTRY_DELETED below).
 */
static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
    struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr_orig, int flags)
{
#pragma unused(dst, flags)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct flow_route_id_bucket *frib = NULL;
	struct flow_route *fr = NULL;
	struct flow_mgr *fm;
	/*
	 * Passed to __flow_route_release() at the end; cleared only for
	 * ROUTE_LLENTRY_STALE.  Presumably controls whether the release
	 * renews the route's expiration — verify against
	 * __flow_route_release().
	 */
	boolean_t renew_fr = true;

	VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
	VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));

	evhlog(debug, "%s: eventhandler saw event type=route_event event_code=%s",
	    __func__, route_event2str(route_ev));

	/*
	 * Upon success, callee will hold flow manager lock as reader,
	 * and we'll need to unlock it below.  Otherwise there's no
	 * need to unlock here and just return.
	 */
	fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
	if (fm == NULL) {
		SK_ERR("Event %s for dst %s ignored; flow manager not found",
		    route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
		    sizeof(dst_s)));
		return;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
	    sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));

	do {
		frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);

		FRIB_RLOCK(frib);
		/* callee returns a reference that we need to release below */
		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
		if (fr == NULL) {
			SK_ERR("%s: dst %s flow route not found", fm->fm_name,
			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			break;
		}

		/*
		 * Grab fr_lock to prevent flow route configuration or
		 * resolver from using stale info while we are updating.
		 */
		FR_LOCK(fr);

		switch (route_ev) {
		case ROUTE_ENTRY_REFRESH:
			/*
			 * This is the case where the route entry has been
			 * updated (for example through RTM_CHANGE).  Some
			 * of it may not warrant a lookup again and some of
			 * it may.  For now, mark flow to perform a look-up
			 * again as the gateway may have changed.
			 */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_ENTRY_DELETED:
			/*
			 * NOTE: flow_route_cleanup() should not be called
			 * to de-register eventhandler in the context of
			 * eventhandler callback to avoid deadlock in
			 * eventhandler code.  Instead, just mark the flow
			 * route un-resolved.  When it is being used again
			 * or being deleted the old eventhandler must be
			 * de-registered.
			 */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
			os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_STALE:
			/*
			 * When the route entry is deemed unreliable or old
			 * enough to trigger a route lookup again.  Don't
			 * reconfigure the flow route, but simply attempt
			 * to resolve it next time to trigger a probe.
			 */
			os_atomic_inc(&fr->fr_want_probe, relaxed);
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			renew_fr = false;
			break;

		case ROUTE_LLENTRY_CHANGED:
			/*
			 * When the link-layer info has changed; replace
			 * cached llinfo in the flow route (treat this
			 * as ROUTE_LLENTRY_RESOLVED).
			 */
			OS_FALLTHROUGH;

		case ROUTE_LLENTRY_RESOLVED:
		{
			/*
			 * SDL address length may be 0 for cellular.
			 * If Ethernet, copy into flow route and mark
			 * it as cached.  In all cases, mark the flow
			 * route as resolved.
			 */
			/*
			 * XXX Remove explicit __bidi_indexable once
			 * rdar://119193012 lands
			 */
			struct sockaddr_dl *__bidi_indexable gw_addr =
			    (struct sockaddr_dl *__bidi_indexable) SDL(gw_addr_orig);
			ASSERT(gw_addr->sdl_family == AF_LINK);
			if (gw_addr->sdl_alen == ETHER_ADDR_LEN) {
				FLOWRT_UPD_ETH_DST(fr, LLADDR(gw_addr));
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "%s: dst %s llentry %s", fm->fm_name,
				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
				    (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
				    "resolved" : "changed"));
				os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
			} else {
				os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
			}
			os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
#if SK_LOG
			if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
			    0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "%s: fr %p eth_type 0x%x "
				    "eth_src %x:%x:%x:%x:%x:%x "
				    "eth_dst %x:%x:%x:%x:%x:%x [%s])",
				    fm->fm_name, SK_KVA(fr),
				    ntohs(fr->fr_eth.ether_type),
				    fr->fr_eth.ether_shost[0],
				    fr->fr_eth.ether_shost[1],
				    fr->fr_eth.ether_shost[2],
				    fr->fr_eth.ether_shost[3],
				    fr->fr_eth.ether_shost[4],
				    fr->fr_eth.ether_shost[5],
				    fr->fr_eth.ether_dhost[0],
				    fr->fr_eth.ether_dhost[1],
				    fr->fr_eth.ether_dhost[2],
				    fr->fr_eth.ether_dhost[3],
				    fr->fr_eth.ether_dhost[4],
				    fr->fr_eth.ether_dhost[5],
				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			}
#endif /* SK_LOG */
			break;
		}
		case ROUTE_LLENTRY_DELETED:
			/*
			 * If the route entry points to a router and an
			 * RTM_DELETE has been issued on it; force the
			 * flow route to be reconfigured.
			 */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_PROBED:
			/*
			 * When the resolver has begun probing the target;
			 * nothing to do here.
			 */
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_UNREACH:
			/*
			 * When the route entry is marked with RTF_REJECT
			 * or the probes have timed out, reconfigure.
			 */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
			SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			break;

		default:
			break;
		}
	} while (0);

	/*
	 * NOTE(review): the reference is dropped before FR_UNLOCK(fr);
	 * this is safe only if the tree references (held under FRIB_RLOCK
	 * above) keep "fr" alive past the release — verify that the
	 * refcount cannot hit zero here.
	 */
	if (fr != NULL) {
		__flow_route_release(fr, renew_fr);
		FR_UNLOCK(fr);
	}

	if (frib != NULL) {
		FRIB_UNLOCK(frib);
	}

	if (fm != NULL) {
		flow_mgr_unlock();
	}
}
1223 
/*
 * Select a local (source) address for a flow towards "dst" over "ifp",
 * given the route "rt" already resolved for the destination.
 *
 * On success, writes the chosen address into "src" and records the
 * interface's IP address generation count in "ipaddr_gencnt" (so the
 * caller can later detect address changes).  For IPv6,
 * "use_stable_address" selects between stable and temporary source
 * addresses.  Returns 0 or an errno (EHOSTUNREACH, EADDRNOTAVAIL,
 * ENETUNREACH) on failure.
 */
int
flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
    struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
    int use_stable_address)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	sa_family_t af = SA(dst)->sa_family;
	struct ifnet *__single src_ifp = NULL;
	struct ifaddr *__single ifa = NULL;
	int err = 0;

	/* see comments in flow_route_configure() regarding loopback */
	ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);

	switch (af) {
	case AF_INET: {
		/* IPv4: use the address of the route's interface address */
		ifnet_lock_shared(ifp);
		/*
		 * NOTE(review): the "!= 0" binds to __improbable(...), not
		 * to the flag test — harmless (same truth value), but
		 * presumably unintended parenthesization; same pattern
		 * below for IPv6.
		 */
		if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("route to %s has src address marked detaching "
			    "(err %d)", sk_ntop(AF_INET, &SIN(dst)->sin_addr,
			    dst_s, sizeof(dst_s)), err);
			ifnet_lock_done(ifp);
			break;
		}
		SIN(src)->sin_len = sizeof(struct sockaddr_in);
		SIN(src)->sin_family = AF_INET;
		SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
		ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	case AF_INET6: {
		/* IPv6: run full source address selection (RFC 6724) */
		struct in6_addr src_storage, *in6;
		struct route_in6 ro = {};
		uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
		ro.ro_rt = rt;

		/*
		 * On success, in6_selectsrc_core returns references in
		 * src_ifp and ifa, released at the bottom of this function.
		 */
		if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
		    ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro, FALSE)) == NULL) {
			if (err == 0) {
				err = EADDRNOTAVAIL;
			}
			VERIFY(src_ifp == NULL);
			SK_ERR("src address to dst %s on %s not available "
			    "(err %d)", sk_ntop(AF_INET6,
			    &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
			    ifp->if_xname, err);
			break;
		}

		VERIFY(src_ifp != NULL);
		VERIFY(ifa != NULL);

		/* selection must land on the flow's own interface */
		if (__improbable(src_ifp != ifp)) {
			if (err == 0) {
				err = ENETUNREACH;
			}
			SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
			    sk_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
			    dst_s, sizeof(dst_s)),
			    sk_ntop(AF_INET6, &SIN6(src)->sin6_addr,
			    src_s, sizeof(src_s)),
			    src_ifp->if_xname, ifp->if_xname, err);
			break;
		}

		ifnet_lock_shared(ifp);
		if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("IPv6 address selected is marked to be "
			    "detached (err %d)", err);
			ifnet_lock_done(ifp);
			break;
		}

		/* clear embedded scope if link-local src */
		if (IN6_IS_SCOPE_EMBED(in6)) {
			if (in6_embedded_scope) {
				SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
				in6->s6_addr16[1] = 0;
			} else {
				SIN6(src)->sin6_scope_id = src_ifp->if_index;
			}
		}
		SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(src)->sin6_family = AF_INET6;
		SIN6(src)->sin6_addr = *in6;
		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (ifa != NULL) {
		ifa_remref(ifa);
	}

	if (src_ifp != NULL) {
		ifnet_release(src_ifp);
	}

#if SK_LOG
	if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
		SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
		    sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

	return err;
}
1348 
/*
 * Detach a flow route from the routing layer: enqueue deregistration
 * of its route event handler, drop the cached gateway route reference,
 * and clear the gateway/on-link flags.  Caller must hold fr_lock.
 */
void
flow_route_cleanup(struct flow_route *fr)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */

	FR_LOCK_ASSERT_HELD(fr);

	/*
	 * Deregistration is queued on the network work queue rather
	 * than done inline; see flow_route_ev_callback() for why it
	 * must never happen in eventhandler context.
	 */
	if (fr->fr_rt_evhdlr_tag != NULL) {
		ASSERT(fr->fr_rt_dst != NULL);
		route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
		    ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
		fr->fr_rt_evhdlr_tag = NULL;
		fr->fr_rt_dst = NULL;
	}
	ASSERT(fr->fr_rt_dst == NULL);
	if (fr->fr_rt_gw != NULL) {
		rtfree(fr->fr_rt_gw);
		fr->fr_rt_gw = NULL;
	}

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr %p %s -> %s via gw %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
	} else if (fr->fr_flags & FLOWRTF_ONLINK) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr %p %s -> %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
	}
#endif /* SK_LOG */

	os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed);
}
1390 
/*
 * Validate that "src_ip0" is a usable local address on "ifp" for the
 * given IP version (IPVERSION or IPV6_VERSION).
 *
 * Returns TRUE and stores the interface's IP address generation count
 * in "gencnt" when the address is present, not detaching, and (for
 * IPv6) ready and not reserved for CLAT46; FALSE otherwise.
 */
static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
    struct ifnet *ifp, uint32_t *gencnt)
{
	boolean_t address_found = TRUE;
	struct ifaddr *ifa = NULL;
	/* local copy: the IPv6 path may rewrite the embedded scope */
	struct flow_ip_addr src_ip = {};
	uint32_t scope = ifp->if_index;

	VERIFY(gencnt != NULL);
	VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);

	if (ip_v == IPVERSION) {
		memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));

		/* returns a reference; released at "done" */
		ifa = (struct ifaddr *)ifa_foraddr_scoped(
			src_ip._v4.s_addr, scope);
	} else {
		memcpy(&src_ip, src_ip0, sizeof(*src_ip0));

		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
			src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
		}
		ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
		    scope);
	}

	if (__improbable(ifa == NULL)) {
		address_found = FALSE;
		goto done;
	}

	ifnet_lock_shared(ifp);
	if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
		address_found = FALSE;
		ifnet_lock_done(ifp);
		goto done;
	}

	if (ip_v == IPV6_VERSION) {
		/*
		 * -fbounds-safety: ia6 (in6_ifaddr) overlays ifa (ifaddr)
		 */
		struct in6_ifaddr *ia6 = __container_of(ifa, struct in6_ifaddr,
		    ia_ifa);

		/*
		 * Fail if IPv6 address is not ready or if the address
		 * is reserved for CLAT46.
		 */
		if (__improbable(ia6->ia6_flags &
		    (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
			address_found = FALSE;
			ifnet_lock_done(ifp);
			goto done;
		}
	} else {
		/*
		 * If interface has CLAT46 enabled, fail IPv4 bind.
		 * Since this implies network is NAT64/DNS64, Internet
		 * effectively becomes reachable over IPv6.  If on
		 * system IPv4 to IPv6 translation is required, that
		 * should be handled solely through bump in the API.
		 * The in kernel translation is only done for apps
		 * directly using low level networking APIs.
		 */
		if (__improbable(IS_INTF_CLAT46(ifp))) {
			address_found = FALSE;
			ifnet_lock_done(ifp);
			goto done;
		}
	}

	*gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
	ifnet_lock_done(ifp);
done:
	if (ifa != NULL) {
		ifa_remref(ifa);
	}

	return address_found;
}
1473 
1474 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1475 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1476     uint32_t *gencnt)
1477 {
1478 	VERIFY(saddr->sa.sa_family == AF_INET ||
1479 	    saddr->sa.sa_family == AF_INET6);
1480 
1481 	struct flow_ip_addr *ipa;
1482 	uint8_t ipv;
1483 	if (saddr->sa.sa_family == AF_INET) {
1484 		ipv = IPVERSION;
1485 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1486 	} else {
1487 		ipv = IPV6_VERSION;
1488 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1489 	}
1490 
1491 	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1492 }
1493 
1494 boolean_t
flow_route_key_validate(struct flow_key * fk,struct ifnet * ifp,uint32_t * gencnt)1495 flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1496     uint32_t *gencnt)
1497 {
1498 	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
1499 	           gencnt);
1500 }
1501