xref: /xnu-8792.61.2/bsd/skywalk/nexus/flowswitch/flow/flow_route.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Flow Routes.
31  *
32  * Each (non-listener) flow entry is always associated with a flow route
33  * object.  Multiple flow entries sharing the same remote address will use
34  * the same flow route for that address.  The flow route object contains
35  * the route information for the remote node.  It gets allocated when a
36  * flow entry requests to connect, and is garbage-collected when it's no
37  * longer referred to after its expiration time has passed.
38  *
39  * A flow route also contains the default local address that's used to
40  * reach the remote node.  This may not necessarily be the same local
41  * address used by the flow entry, if it has explicitly bound the entry
42  * to another local address.  But for the majority of cases, having the
43  * local address be present in the flow route allows us to avoid doing
44  * source address selection each time a connect request happens.
45  *
46  * When the remote node is reachable via a gateway, the gateway address
47  * portion of the flow route contains its IP address and the flow route
48  * is marked with FLOWRTF_GATEWAY.  We use this to optimize the gateway
49  * route lookup, since otherwise we'd have to perform an extra lookup
50  * each time we need to resolve the route.
51  *
52  * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53  * is set, and the gateway address isn't used.  The target address used
54  * for resolution will be the remote address itself.
55  *
56  * On links with link-layer information, we store the resolved address
57  * of the target node (which may be the gateway's) in the flow route,
58  * and mark the flow route with FLOWRTF_HAS_LLINFO.
59  *
60  * Each flow route also registers itself to receive route events when
61  * the underlying rtentry is updated or deleted.
62  */
63 
64 #include <skywalk/os_skywalk_private.h>
65 
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75 
76 extern struct rtstat rtstat;
77 
78 static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
79 static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
80 
81 static int fr_cmp(const struct flow_route *, const struct flow_route *);
82 static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
83 static struct flow_route *fr_alloc(boolean_t);
84 static void fr_free(struct flow_route *);
85 static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
86     uint32_t *, boolean_t, boolean_t);
87 static void flow_route_ev_callback(struct eventhandler_entry_arg,
88     struct sockaddr *, int, struct sockaddr *, int);
89 
90 RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
91 RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
92 
93 #define FR_ZONE_NAME    "flow.route"
94 
95 static unsigned int flow_route_size;            /* size of flow_route */
96 struct skmem_cache *flow_route_cache;           /* cache for flow_route */
97 
98 static int __flow_route_inited = 0;
99 
100 #define FLOW_ROUTE_EXPIRE       600     /* seconds */
101 static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
102 
103 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
104     CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
105 
106 void
flow_route_init(void)107 flow_route_init(void)
108 {
109 	ASSERT(!__flow_route_inited);
110 
111 	flow_route_size = sizeof(struct flow_route);
112 	flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
113 	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
114 
115 	__flow_route_inited = 1;
116 }
117 
118 void
flow_route_fini(void)119 flow_route_fini(void)
120 {
121 	if (__flow_route_inited) {
122 		skmem_cache_destroy(flow_route_cache);
123 		flow_route_cache = NULL;
124 
125 		__flow_route_inited = 0;
126 	}
127 }
128 
129 struct flow_route_bucket *
flow_route_buckets_alloc(size_t frb_cnt,size_t * frb_sz,size_t * tot_sz)130 flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
131 {
132 	uint32_t cache_sz = skmem_cpu_cache_line_size();
133 	struct flow_route_bucket *frb;
134 	void *frb_buf, **frb_pbuf;
135 	size_t frb_tot_sz;
136 
137 	/* each bucket is CPU cache-aligned */
138 	*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
139 
140 	/* total size includes extra for alignment requirements */
141 	*tot_sz = frb_tot_sz = (sizeof(void *) + (frb_cnt * (*frb_sz)) + cache_sz);
142 	// rdar://88962126
143 	/* BEGIN IGNORE CODESTYLE */
144 	__typed_allocators_ignore_push
145 	frb_buf = sk_alloc(frb_tot_sz, Z_WAITOK, skmem_tag_fsw_frb_hash);
146 	__typed_allocators_ignore_pop
147 	/* END IGNORE CODESTYLE */
148 	if (__improbable(frb_buf == NULL)) {
149 		return NULL;
150 	}
151 
152 	/*
153 	 * In case we didn't get a cache-aligned memory, round it up
154 	 * accordingly.  This is needed in order to get the rest of
155 	 * the structure members aligned properly.  It also means that
156 	 * the memory span gets shifted due to the round up, but it
157 	 * is okay since we've allocated extra space for this.
158 	 */
159 	frb = (struct flow_route_bucket *)
160 	    P2ROUNDUP((intptr_t)frb_buf + sizeof(void *), cache_sz);
161 	frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
162 	ASSERT((intptr_t)frb_pbuf >= (intptr_t)frb_buf);
163 	ASSERT(((intptr_t)frb + (frb_cnt * (*frb_sz))) <=
164 	    ((intptr_t)frb_buf + frb_tot_sz));
165 	*frb_pbuf = frb_buf;
166 
167 	SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
168 	    "(total %zu bytes, frb_buf 0x%llx) ALLOC", SK_KVA(frb), frb_cnt,
169 	    *frb_sz, frb_tot_sz, SK_KVA(frb_buf));
170 
171 	return frb;
172 }
173 
174 void
flow_route_buckets_free(struct flow_route_bucket * frb,size_t tot_sz)175 flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
176 {
177 	void *frb_buf, **frb_pbuf;
178 
179 	/* get the original address that we stuffed in earlier and free it */
180 	frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
181 	frb_buf = *frb_pbuf;
182 	SK_DF(SK_VERB_MEM, "frb 0x%llx (frb_buf 0x%llx) FREE",
183 	    SK_KVA(frb), SK_KVA(frb_buf));
184 	// rdar://88962126
185 	__typed_allocators_ignore_push
186 	sk_free(frb_buf, tot_sz);
187 	__typed_allocators_ignore_pop
188 }
189 
190 void
flow_route_bucket_init(struct flow_route_bucket * frb)191 flow_route_bucket_init(struct flow_route_bucket *frb)
192 {
193 	ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
194 	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
195 	    &flow_route_lock_attr);
196 	RB_INIT(&frb->frb_head);
197 }
198 
199 void
flow_route_bucket_destroy(struct flow_route_bucket * frb)200 flow_route_bucket_destroy(struct flow_route_bucket *frb)
201 {
202 	ASSERT(RB_EMPTY(&frb->frb_head));
203 	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
204 }
205 
206 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)207 flow_route_find_by_addr(struct flow_route_bucket *frb,
208     union sockaddr_in_4_6 *dst)
209 {
210 	struct flow_route *fr;
211 	struct flow_route find;
212 
213 	FRB_LOCK_ASSERT_HELD(frb);
214 
215 	switch (SA(dst)->sa_family) {
216 	case AF_INET:
217 		find.fr_af = AF_INET;
218 		find.fr_addr_len = sizeof(struct in_addr);
219 		find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
220 		break;
221 
222 	case AF_INET6:
223 		find.fr_af = AF_INET6;
224 		find.fr_addr_len = sizeof(struct in6_addr);
225 		find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
226 		break;
227 
228 	default:
229 		VERIFY(0);
230 		/* NOTREACHED */
231 		__builtin_unreachable();
232 	}
233 
234 	fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
235 	if (fr != NULL) {
236 		flow_route_retain(fr);  /* for the caller */
237 	}
238 	return fr;
239 }
240 
241 struct flow_route_id_bucket *
flow_route_id_buckets_alloc(size_t frib_cnt,size_t * frib_sz,size_t * tot_sz)242 flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
243 {
244 	uint32_t cache_sz = skmem_cpu_cache_line_size();
245 	struct flow_route_id_bucket *frib;
246 	void *frib_buf, **frib_pbuf;
247 	size_t frib_tot_sz;
248 
249 	/* each bucket is CPU cache-aligned */
250 	*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
251 
252 	/* total size includes extra for alignment requirements */
253 	*tot_sz = frib_tot_sz = (sizeof(void *) + (frib_cnt * (*frib_sz)) + cache_sz);
254 	// rdar://88962126
255 	/* BEGIN IGNORE CODESTYLE */
256 	__typed_allocators_ignore_push
257 	frib_buf = sk_alloc(frib_tot_sz, Z_WAITOK, skmem_tag_fsw_frib_hash);
258 	__typed_allocators_ignore_pop
259 	/* END IGNORE CODESTYLE */
260 	if (__improbable(frib_buf == NULL)) {
261 		return NULL;
262 	}
263 
264 	/*
265 	 * In case we didn't get a cache-aligned memory, round it up
266 	 * accordingly.  This is needed in order to get the rest of
267 	 * the structure members aligned properly.  It also means that
268 	 * the memory span gets shifted due to the round up, but it
269 	 * is okay since we've allocated extra space for this.
270 	 */
271 	frib = (struct flow_route_id_bucket *)
272 	    P2ROUNDUP((intptr_t)frib_buf + sizeof(void *), cache_sz);
273 	frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
274 	ASSERT((intptr_t)frib_pbuf >= (intptr_t)frib_buf);
275 	ASSERT(((intptr_t)frib + (frib_cnt * (*frib_sz))) <=
276 	    ((intptr_t)frib_buf + frib_tot_sz));
277 	*frib_pbuf = frib_buf;
278 
279 	SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
280 	    "(total %zu bytes, frib_buf 0x%llx) ALLOC", SK_KVA(frib), frib_cnt,
281 	    *frib_sz, frib_tot_sz, SK_KVA(frib_buf));
282 
283 	return frib;
284 }
285 
286 void
flow_route_id_buckets_free(struct flow_route_id_bucket * frib,size_t tot_sz)287 flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
288 {
289 	void *frib_buf, **frib_pbuf;
290 
291 	/* get the original address that we stuffed in earlier and free it */
292 	frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
293 	frib_buf = *frib_pbuf;
294 	SK_DF(SK_VERB_MEM, "frib 0x%llx (frib_buf 0x%llx) FREE", SK_KVA(frib),
295 	    SK_KVA(frib_buf));
296 	// rdar://88962126
297 	__typed_allocators_ignore_push
298 	sk_free(frib_buf, tot_sz);
299 	__typed_allocators_ignore_pop
300 }
301 
302 void
flow_route_id_bucket_init(struct flow_route_id_bucket * frib)303 flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
304 {
305 	ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
306 	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
307 	    &flow_route_lock_attr);
308 	RB_INIT(&frib->frib_head);
309 }
310 
311 void
flow_route_id_bucket_destroy(struct flow_route_id_bucket * frib)312 flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
313 {
314 	ASSERT(RB_EMPTY(&frib->frib_head));
315 	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
316 }
317 
318 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)319 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
320 {
321 	struct flow_route *fr;
322 	struct flow_route find;
323 
324 	FRIB_LOCK_ASSERT_HELD(frib);
325 
326 	uuid_copy(find.fr_uuid, id);
327 	fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
328 	if (fr != NULL) {
329 		flow_route_retain(fr);  /* for the caller */
330 	}
331 	return fr;
332 }
333 
334 static struct flow_route *
fr_alloc(boolean_t cansleep)335 fr_alloc(boolean_t cansleep)
336 {
337 	struct flow_route *fr;
338 
339 	if ((fr = skmem_cache_alloc(flow_route_cache,
340 	    (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) {
341 		bzero(fr, flow_route_size);
342 		lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
343 		    &flow_route_lock_attr);
344 		lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
345 		    &flow_route_lock_attr);
346 		uuid_generate_random(fr->fr_uuid);
347 
348 		SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
349 	}
350 
351 	return fr;
352 }
353 
354 static void
fr_free(struct flow_route * fr)355 fr_free(struct flow_route *fr)
356 {
357 	SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));
358 
359 	VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
360 	VERIFY(fr->fr_usecnt == 0);
361 
362 	FR_LOCK(fr);
363 	/* callee frees route entry */
364 	flow_route_cleanup(fr);
365 	VERIFY(fr->fr_rt_dst == NULL);
366 	VERIFY(fr->fr_rt_gw == NULL);
367 	VERIFY(fr->fr_rt_evhdlr_tag == NULL);
368 	FR_UNLOCK(fr);
369 
370 	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
371 	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
372 
373 	skmem_cache_free(flow_route_cache, fr);
374 }
375 
376 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)377 fr_cmp(const struct flow_route *a, const struct flow_route *b)
378 {
379 	int d;
380 
381 	if ((d = (a->fr_af - b->fr_af)) != 0) {
382 		return d;
383 	}
384 	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
385 	    b->fr_addr_len)) != 0) {
386 		return d;
387 	}
388 
389 	return 0;
390 }
391 
392 static inline int
fr_id_cmp(const struct flow_route * a,const struct flow_route * b)393 fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
394 {
395 	return uuid_compare(a->fr_uuid, b->fr_uuid);
396 }
397 
398 static inline int
fr_use_stable_address(struct nx_flow_req * req)399 fr_use_stable_address(struct nx_flow_req *req)
400 {
401 	int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
402 	if (req != NULL &&
403 	    (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
404 		use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
405 	}
406 	return use_stable_address;
407 }
408 
409 int
flow_route_configure(struct flow_route * fr,struct ifnet * ifp,struct nx_flow_req * req)410 flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
411 {
412 #if SK_LOG
413 	char old_s[MAX_IPv6_STR_LEN];   /* src */
414 	char src_s[MAX_IPv6_STR_LEN];   /* src */
415 	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
416 #endif /* SK_LOG */
417 	struct rtentry *rt = NULL, *gwrt = NULL;
418 	int err = 0;
419 
420 	FR_LOCK_ASSERT_HELD(fr);
421 
422 	/*
423 	 * If there is a route entry for the final destination, see if
424 	 * it's no longer valid and perform another routing table lookup.
425 	 * A non-NULL fr_rt_dst is always associated with a route event
426 	 * registration, and the route reference is held there.
427 	 */
428 	rt = fr->fr_rt_dst;
429 	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
430 		struct eventhandler_entry_arg ee_arg;
431 
432 		/* callee frees route entry */
433 		flow_route_cleanup(fr);
434 
435 		/* lookup destination route */
436 		ASSERT(err == 0);
437 		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
438 		if (rt == NULL) {
439 			err = EHOSTUNREACH;
440 			SK_ERR("no route to %s on %s (err %d)",
441 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
442 			    sizeof(dst_s)), ifp->if_xname, err);
443 		} else {
444 			/*
445 			 * If route points to another interface and the
446 			 * route's gateway isn't link-layer, reject it.
447 			 * We make an exception otherwise, since local
448 			 * interface addresses resolve this way.
449 			 */
450 			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
451 			    (rt->rt_gateway == NULL ||
452 			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
453 				err = EHOSTUNREACH;
454 				SK_ERR("route to %s on %s != %s (err %d)",
455 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
456 				    sizeof(dst_s)), rt->rt_ifp->if_xname,
457 				    ifp->if_xname, err);
458 			}
459 		}
460 
461 		if (err != 0) {
462 			goto done;
463 		}
464 
465 		ASSERT(fr->fr_mgr != NULL);
466 		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
467 		ASSERT(!uuid_is_null(fr->fr_uuid));
468 		ASSERT(!uuid_is_null(fr->fr_nx_uuid));
469 
470 		bzero(&ee_arg, sizeof(ee_arg));
471 		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
472 		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);
473 
474 		/*
475 		 * Register for changes on destination route; this covers both
476 		 * cases where the destination is on-link, or if it is off-link
477 		 * and is using a gateway route.  This also transfers the refcnt
478 		 * of the route entry to the event handler, released later when
479 		 * it is deregistered.
480 		 */
481 		ASSERT(fr->fr_rt_dst == NULL);
482 		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
483 		fr->fr_rt_dst = rt;             /* move reference to fr */
484 		fr->fr_rt_evhdlr_tag =
485 		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
486 		    flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
487 		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
488 		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_DELETED);
489 
490 		/*
491 		 * Lookup gateway route (if any); returns locked gwrt
492 		 * with a reference bumped up.
493 		 */
494 		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
495 		if (err != 0) {
496 			/*
497 			 * Reference held by fr_rt_dst will be taken
498 			 * care of by flow_route_cleanup() below, so
499 			 * make sure we don't do an extra rtfree().
500 			 */
501 			rt = NULL;
502 			ASSERT(gwrt == NULL);
503 			SK_ERR("no gw route to %s on %s (err %d)",
504 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
505 			    sizeof(dst_s)), ifp->if_xname, err);
506 			goto done;
507 		}
508 
509 		/* if RTF_GATEWAY isn't set, gwrt == rt */
510 		ASSERT(gwrt != NULL);
511 		RT_LOCK_ASSERT_HELD(gwrt);
512 
513 		/*
514 		 * Must have been cleared via cleanup, and that we're
515 		 * single-threaded here for fr by virtue of fr_lock.
516 		 */
517 		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));
518 
519 		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
520 		    (rt->rt_gateway->sa_family == AF_INET ||
521 		    rt->rt_gateway->sa_family == AF_INET6)) {
522 			struct sockaddr_storage ss;
523 
524 			ASSERT(fr->fr_rt_gw == NULL);
525 			/* locked via route_to_gwroute() above */
526 			fr->fr_rt_gw = gwrt;    /* move reference to fr */
527 			RT_ADDREF_LOCKED(gwrt); /* for this routine */
528 			/*
529 			 * Destination is off-link and is reachable
530 			 * thru an IP gateway route.  Save the IP
531 			 * address of the gateway in fr_gaddr.
532 			 */
533 			(void) sa_copy(rt->rt_gateway, &ss, NULL);
534 			_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
535 			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
536 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_GATEWAY);
537 		} else if (IS_DIRECT_HOSTROUTE(rt)) {
538 			/*
539 			 * Destination is on-link.
540 			 */
541 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_ONLINK);
542 		}
543 		RT_UNLOCK(gwrt);
544 	}
545 	RT_ADDREF(rt);          /* for this routine */
546 
547 	/* see if we need to re-select default source address */
548 	int use_stable_address = fr_use_stable_address(req);
549 	if (fr->fr_want_configure ||
550 	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
551 	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
552 		union sockaddr_in_4_6 old = fr->fr_laddr;
553 		if (use_stable_address) {
554 			atomic_bitset_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
555 		} else {
556 			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
557 		}
558 		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
559 		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
560 			SK_ERR("no usable src address to reach %s on %s "
561 			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
562 			    sizeof(dst_s)), ifp->if_xname, err);
563 			goto done;
564 		}
565 		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
566 			SK_ERR("src address is now %s (was %s) to reach %s "
567 			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
568 			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
569 			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
570 			    dst_s, sizeof(dst_s)), ifp->if_xname);
571 		}
572 	}
573 	ASSERT(err == 0);
574 
575 done:
576 	if (__probable(err == 0)) {
577 		atomic_set_32(&fr->fr_want_configure, 0);
578 	} else {
579 		/* callee frees route entry */
580 		flow_route_cleanup(fr);
581 	}
582 
583 	if (gwrt != NULL) {
584 		ASSERT(rt != NULL);
585 		if (gwrt == rt) {
586 			RT_REMREF(gwrt);
587 		} else {
588 			rtfree(gwrt);
589 		}
590 		gwrt = NULL;
591 	}
592 
593 	if (rt != NULL) {
594 		rtfree(rt);
595 		rt = NULL;
596 	}
597 
598 	return err;
599 }
600 
601 int
flow_route_find(struct kern_nexus * nx,struct flow_mgr * fm,struct ifnet * ifp,struct nx_flow_req * req,flow_route_ctor_fn_t fr_ctor,flow_route_resolve_fn_t fr_resolve,void * arg,struct flow_route ** frp)602 flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
603     struct ifnet *ifp, struct nx_flow_req *req,
604     flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
605     void *arg, struct flow_route **frp)
606 {
607 #if SK_LOG
608 	char src_s[MAX_IPv6_STR_LEN];   /* dst */
609 	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
610 	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
611 #endif /* SK_LOG */
612 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
613 	struct flow_route_bucket *frb;
614 	struct flow_route_id_bucket *frib;
615 	struct flow_route *fr = NULL;
616 	int err = 0;
617 
618 	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
619 
620 	ASSERT(frp != NULL);
621 	*frp = NULL;
622 
623 	frb = flow_mgr_get_frb_by_addr(fm, daddr);
624 
625 	int use_stable_address = fr_use_stable_address(req);
626 
627 	/* see if there is a cached flow route (as reader) */
628 	FRB_RLOCK(frb);
629 	fr = flow_route_find_by_addr(frb, daddr);
630 	if (fr != NULL) {
631 		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
632 		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
633 		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
634 			atomic_add_32(&fr->fr_want_configure, 1);
635 			FR_LOCK(fr);
636 			err = flow_route_configure(fr, ifp, req);
637 			if (err != 0) {
638 				SK_ERR("fr 0x%llx error re-configuring dst %s "
639 				    "on %s (err %d) [R]", SK_KVA(fr),
640 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
641 				    sizeof(dst_s)), ifp->if_xname, err);
642 			}
643 			FR_UNLOCK(fr);
644 		}
645 		if (err == 0) {
646 			SK_DF(SK_VERB_FLOW_ROUTE,
647 			    "fr 0x%llx found for dst %s " "on %s [R,%u]",
648 			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
649 			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
650 		}
651 		FRB_RUNLOCK(frb);       /* reader */
652 		goto done;
653 	}
654 
655 	/*
656 	 * Flow route doesn't exist; become a writer and prepare to
657 	 * allocate one.  We could be racing with other threads here,
658 	 * so check first if there is now a cached flow route that
659 	 * got created by the winning thread.
660 	 */
661 	if (!FRB_RLOCKTOWLOCK(frb)) {
662 		FRB_WLOCK(frb);
663 	}
664 
665 	fr = flow_route_find_by_addr(frb, daddr);
666 	if (fr != NULL) {
667 		if (__improbable(fr->fr_want_configure) ||
668 		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
669 			FR_LOCK(fr);
670 			err = flow_route_configure(fr, ifp, req);
671 			if (err != 0) {
672 				SK_ERR("fr 0x%llx error re-configuring dst %s "
673 				    "on %s (err %d) [W]", SK_KVA(fr),
674 				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
675 				    sizeof(dst_s)), ifp->if_xname, err);
676 			}
677 			FR_UNLOCK(fr);
678 		}
679 		if (err == 0) {
680 			SK_DF(SK_VERB_FLOW_ROUTE,
681 			    "fr 0x%llx found for dst %s on %s [W,%u]",
682 			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
683 			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
684 		}
685 		FRB_WUNLOCK(frb);       /* writer */
686 		goto done;
687 	}
688 
689 	/* allocate one */
690 	fr = fr_alloc(TRUE);
691 	fr->fr_faddr = *daddr;          /* remote address */
692 
693 	switch (SA(&fr->fr_faddr)->sa_family) {
694 	case AF_INET:
695 		SIN(&fr->fr_faddr)->sin_port = 0;
696 		fr->fr_addr_len = sizeof(struct in_addr);
697 		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
698 		break;
699 
700 	case AF_INET6:
701 		SIN6(&fr->fr_faddr)->sin6_port = 0;
702 		fr->fr_addr_len = sizeof(struct in6_addr);
703 		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
704 		break;
705 
706 	default:
707 		VERIFY(0);
708 		/* NOTREACHED */
709 		__builtin_unreachable();
710 	}
711 
712 	ASSERT(!uuid_is_null(fr->fr_uuid));
713 	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
714 	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;
715 
716 	/* force configure newly-created flow route */
717 	atomic_add_32(&fr->fr_want_configure, 1);
718 
719 	FR_LOCK(fr);
720 	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
721 		SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
722 		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
723 		    sizeof(dst_s)), ifp->if_xname, err);
724 		FR_UNLOCK(fr);
725 		FRB_WUNLOCK(frb);       /* writer */
726 		/* not yet in tree, so free immediately */
727 		fr_free(fr);
728 		fr = NULL;
729 		goto done;
730 	}
731 
732 	/* execute nexus-specific constructor */
733 	fr_ctor(arg, fr);
734 	FR_UNLOCK(fr);
735 
736 	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
737 	FRIB_WLOCK(frib);
738 
739 	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
740 	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;
741 
742 	FRB_WLOCK_ASSERT_HELD(frb);
743 	FRIB_WLOCK_ASSERT_HELD(frib);
744 
745 	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
746 	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);
747 
748 	atomic_bitset_32(&fr->fr_flags, FLOWRTF_ATTACHED);
749 
750 #if DEBUG
751 	/* sanity checks for comparator routines */
752 	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
753 	flow_route_release(fr);
754 	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
755 	flow_route_release(fr);
756 #endif /* DEBUG */
757 
758 	/* for the trees */
759 	_CASSERT(FLOW_ROUTE_MINREF == 2);
760 	flow_route_retain(fr);
761 	flow_route_retain(fr);
762 	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);
763 
764 	/* for the caller */
765 	flow_route_retain(fr);
766 
767 	FRIB_WUNLOCK(frib);     /* writer */
768 	FRB_WUNLOCK(frb);       /* writer */
769 
770 	/* execute nexus-specific resolver */
771 	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
772 	    (err = fr_resolve(arg, fr, NULL)) != 0) {
773 		if (fr->fr_flags & FLOWRTF_GATEWAY) {
774 			SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
775 			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
776 			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
777 			    sizeof(dst_s)), ifp->if_xname, err);
778 		} else {
779 			SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
780 			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
781 			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
782 			    sizeof(dst_s)), ifp->if_xname, err);
783 		}
784 		if (err == EJUSTRETURN) {
785 			err = 0;
786 		} else {
787 			goto done;
788 		}
789 	}
790 	ASSERT(err == 0);
791 
792 #if SK_LOG
793 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
794 		SK_DF(SK_VERB_FLOW_ROUTE,
795 		    "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
796 		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
797 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
798 		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
799 		    ifp->if_xname);
800 	} else {
801 		SK_DF(SK_VERB_FLOW_ROUTE,
802 		    "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
803 		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
804 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
805 		    ifp->if_xname);
806 	}
807 #endif /* SK_LOG */
808 
809 done:
810 	if (err == 0) {
811 		ASSERT(fr != NULL);
812 		*frp = fr;
813 	} else if (fr != NULL) {
814 		/* can't directly call fr_free() if it's in the tree */
815 		flow_route_release(fr);
816 		fr = NULL;
817 	}
818 
819 	return err;
820 }
821 
822 void
flow_route_retain(struct flow_route * fr)823 flow_route_retain(struct flow_route *fr)
824 {
825 	lck_spin_lock(&fr->fr_reflock);
826 	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
827 		fr->fr_expire = 0;
828 	}
829 	lck_spin_unlock(&fr->fr_reflock);
830 }
831 
832 void
flow_route_release(struct flow_route * fr)833 flow_route_release(struct flow_route *fr)
834 {
835 	bool should_free = false;
836 
837 	lck_spin_lock(&fr->fr_reflock);
838 	VERIFY(fr->fr_usecnt > 0);
839 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
840 		if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
841 			fr->fr_expire = _net_uptime + flow_route_expire;
842 		}
843 	} else {
844 		/*
845 		 * fr is no longer in lookup tree, so there shouldn't be
846 		 * further usecnt, if we reach 0 usecnt, then this is the very
847 		 * last reference and is safe to unlock and call fr_free.
848 		 */
849 		if (--(fr->fr_usecnt) == 0) {
850 			should_free = true;
851 		}
852 	}
853 	lck_spin_unlock(&fr->fr_reflock);
854 
855 	if (should_free) {
856 		fr_free(fr);
857 	}
858 }
859 
/*
 * Common routine to purge flow routes in a bucket; returns the number
 * of routes removed, and (if "resid" is non-NULL) stores the count of
 * routes examined but left in place.  Caller must hold the bucket's
 * writer lock.
 *
 * If "all" is set, every route is removed regardless of refcnt/expiry;
 * in that case the caller (flow_route_bucket_purge_all()) must already
 * hold writer locks on all flow {route,route_id} buckets.  Otherwise,
 * only idle routes are purged: those at the minimum refcnt that have
 * expired (or are forced via "early_expire") or carry FLOWRTF_DELETED.
 */
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
    boolean_t all, boolean_t early_expire)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */
	struct flow_route *fr, *tfr;
	uint64_t now = net_uptime();
	uint32_t i = 0, tot = 0;

	FRB_WLOCK_ASSERT_HELD(frb);

	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
		struct flow_route_id_bucket *frib =
		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);

		++tot;
		/*
		 * We're not holding fr_lock here, since this is a
		 * best-effort check.  If there's a race and we miss
		 * it now, we'll come back again shortly.
		 */
		lck_spin_lock(&fr->fr_reflock);
		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !early_expire &&
		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
			    "refcnt %u expire %llu", SK_KVA(fr),
			    fr->fr_usecnt, fr->fr_expire);
			continue;
		}
		lck_spin_unlock(&fr->fr_reflock);

		/*
		 * If "all" is set, flow entries must be gone by now, as
		 * we must be called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
		 * must be at its minimum value now.
		 */
		if (!all) {
			FRIB_WLOCK(frib);
		}
		FRIB_WLOCK_ASSERT_HELD(frib);

		_CASSERT(FLOW_ROUTE_MINREF == 2);
		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);

		/* detach from both the address and the ID lookup trees */
		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);

		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_ATTACHED);

#if SK_LOG
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
			    SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
			    (int64_t)(fr->fr_expire - now));
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    (int64_t)(fr->fr_expire - now));
		}
#endif /* SK_LOG */

		/* for the trees */
		flow_route_release(fr);
		flow_route_release(fr);
		++i;

		if (!all) {
			FRIB_WUNLOCK(frib);
		}
	}

	if (resid != NULL) {
		*resid = (tot - i);
	}

	return i;
}
951 
952 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)953 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
954 {
955 	(void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
956 }
957 
/*
 * Prune idle flow routes in a bucket on behalf of "ifp"; returns the
 * number of routes purged, and (if "resid" is non-NULL) stores the
 * count of routes that remain.  Scans loosely under the reader lock
 * first, and upgrades to writer only if there is something to purge.
 * If the interface is down, routes are treated as expired early.
 */
static uint32_t
flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
    uint32_t *resid)
{
	uint64_t now = net_uptime();
	struct flow_route *fr;
	uint32_t i = 0, tot = 0;
	boolean_t ifdown = !(ifp->if_flags & IFF_UP);

	FRB_RLOCK(frb);
	RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
		++tot;
		/* loose check; do this without holding fr_reflock */
		if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !ifdown &&
		    !(fr->fr_flags & FLOWRTF_DELETED))) {
			continue;
		}
		++i;
	}

	/*
	 * If there's nothing to prune or there's a writer, we're done.
	 * Note that if we failed to upgrade to writer, the lock would
	 * have been released automatically.
	 */
	if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
		if (i == 0) {
			FRB_RUNLOCK(frb);
		}
		if (resid != NULL) {
			*resid = (tot - i);
		}
		return 0;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
	    i, ifp->if_xname);

	/* purge idle ones; pass ifdown to force early expiration */
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
	FRB_WUNLOCK(frb);

	return i;
}
1003 
1004 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)1005 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
1006     uint32_t *tot_resid)
1007 {
1008 	uint32_t pruned = 0;
1009 	uint32_t resid;
1010 	uint32_t i;
1011 
1012 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
1013 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
1014 		pruned += flow_route_bucket_prune(frb, ifp, &resid);
1015 		if (tot_resid != NULL) {
1016 			*tot_resid += resid;
1017 		}
1018 	}
1019 
1020 	return pruned;
1021 }
1022 
/*
 * Route event callback: invoked when a route or link-layer entry event
 * fires for a flow route.  Looks up the flow manager and flow route via
 * the UUIDs stashed in the eventhandler argument, then updates the flow
 * route's flags and want-{configure,probe} counters accordingly.
 *
 * This runs in the context of eventhandler invocation routine which loops
 * through all the registered callbacks.  Care must be taken to not call
 * any primitives here that would lead to routing changes in the same context
 * as it would lead to deadlock in eventhandler code.
 */
static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
    struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags)
{
#pragma unused(dst, flags)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct flow_route_id_bucket *frib = NULL;
	struct flow_route *fr = NULL;
	struct flow_mgr *fm;

	VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
	VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));

	/*
	 * Upon success, callee will hold flow manager lock as reader,
	 * and we'll need to unlock it below.  Otherwise there's no
	 * need to unlock here and just return.
	 */
	fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
	if (fm == NULL) {
		SK_ERR("Event %s for dst %s ignored; flow manager not found",
		    route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
		    sizeof(dst_s)));
		return;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
	    sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));

	do {
		frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);

		FRIB_RLOCK(frib);
		/* callee returns a reference that we need to release below */
		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
		if (fr == NULL) {
			SK_ERR("%s: dst %s flow route not found", fm->fm_name,
			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			break;
		}

		/*
		 * Grab fr_lock to prevent flow route configuration or
		 * resolver from using stale info while we are updating.
		 */
		FR_LOCK(fr);

		switch (route_ev) {
		case ROUTE_ENTRY_REFRESH:
			/*
			 * This is the case where the route entry has been
			 * updated (for example through RTM_CHANGE).  Some
			 * of it may not warrant a lookup again and some of
			 * it may.  For now, mark flow to perform a look-up
			 * again as the gateway may have changed.
			 */
			atomic_add_32(&fr->fr_want_configure, 1);
			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_ENTRY_DELETED:
			/*
			 * NOTE: flow_route_cleanup() should not be called
			 * to de-register eventhandler in the context of
			 * eventhandler callback to avoid deadlock in
			 * eventhandler code.  Instead, just mark the flow
			 * route un-resolved.  When it is being used again
			 * or being deleted the old eventhandler must be
			 * de-registered.
			 */
			atomic_add_32(&fr->fr_want_configure, 1);
			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
			atomic_bitset_32(&fr->fr_flags, FLOWRTF_DELETED);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_STALE:
			/*
			 * When the route entry is deemed unreliable or old
			 * enough to trigger a route lookup again.  Don't
			 * reconfigure the flow route, but simply attempt
			 * to resolve it next time to trigger a probe.
			 */
			atomic_add_32(&fr->fr_want_probe, 1);
			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_CHANGED:
			/*
			 * When the link-layer info has changed; replace
			 * cached llinfo in the flow route (treat this
			 * as ROUTE_LLENTRY_RESOLVED).
			 */
			OS_FALLTHROUGH;

		case ROUTE_LLENTRY_RESOLVED:
			/*
			 * SDL address length may be 0 for cellular.
			 * If Ethernet, copy into flow route and mark
			 * it as cached.  In all cases, mark the flow
			 * route as resolved.
			 */
			ASSERT(SDL(gw_addr)->sdl_family == AF_LINK);
			if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) {
				FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr)));
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "%s: dst %s llentry %s", fm->fm_name,
				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
				    (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
				    "resolved" : "changed"));
				atomic_bitset_32(&fr->fr_flags,
				    FLOWRTF_HAS_LLINFO);
			} else {
				atomic_bitclear_32(&fr->fr_flags,
				    FLOWRTF_HAS_LLINFO);
			}
			atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
#if SK_LOG
			if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
			    0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "%s: fr 0x%llx eth_type 0x%x "
				    "eth_src %x:%x:%x:%x:%x:%x "
				    "eth_dst %x:%x:%x:%x:%x:%x [%s])",
				    fm->fm_name, SK_KVA(fr),
				    ntohs(fr->fr_eth.ether_type),
				    fr->fr_eth.ether_shost[0],
				    fr->fr_eth.ether_shost[1],
				    fr->fr_eth.ether_shost[2],
				    fr->fr_eth.ether_shost[3],
				    fr->fr_eth.ether_shost[4],
				    fr->fr_eth.ether_shost[5],
				    fr->fr_eth.ether_dhost[0],
				    fr->fr_eth.ether_dhost[1],
				    fr->fr_eth.ether_dhost[2],
				    fr->fr_eth.ether_dhost[3],
				    fr->fr_eth.ether_dhost[4],
				    fr->fr_eth.ether_dhost[5],
				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			}
#endif /* SK_LOG */
			break;

		case ROUTE_LLENTRY_DELETED:
			/*
			 * If the route entry points to a router and an
			 * RTM_DELETE has been issued on it; force the
			 * flow route to be reconfigured.
			 */
			atomic_add_32(&fr->fr_want_configure, 1);
			atomic_bitclear_32(&fr->fr_flags,
			    (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED));
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_PROBED:
			/*
			 * When the resolver has begun probing the target;
			 * nothing to do here.
			 */
			SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
			    fm->fm_name, sk_sa_ntop(dst, dst_s,
			    sizeof(dst_s)));
			break;

		case ROUTE_LLENTRY_UNREACH:
			/*
			 * When the route entry is marked with RTF_REJECT
			 * or the probes have timed out, reconfigure.
			 */
			atomic_add_32(&fr->fr_want_configure, 1);
			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
			SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
			    sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
			break;

		default:
			break;
		}
	} while (0);

	if (fr != NULL) {
		/*
		 * NOTE(review): the lookup reference is dropped while
		 * fr_lock is still held; this looks safe because the
		 * route/ID trees each hold a reference (and removal
		 * requires the FRIB writer lock we block by holding the
		 * reader lock below), so usecnt cannot reach zero here —
		 * confirm against flow_route_release()/purge logic.
		 */
		flow_route_release(fr);
		FR_UNLOCK(fr);
	}

	if (frib != NULL) {
		FRIB_UNLOCK(frib);
	}

	if (fm != NULL) {
		flow_mgr_unlock();
	}
}
1235 
/*
 * Select a local (source) address for reaching "dst" over "ifp", given
 * the resolved route "rt"; writes the chosen address into "src" and
 * records the interface's address generation count in *ipaddr_gencnt.
 * For IPv4 the address is taken from the route's ifa; for IPv6,
 * in6_selectsrc_core() performs source address selection (honoring
 * "use_stable_address" to avoid temporary addresses).  Returns 0 on
 * success or an errno on failure.
 */
int
flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
    struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
    int use_stable_address)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	sa_family_t af = SA(dst)->sa_family;
	struct ifnet *src_ifp = NULL;
	struct ifaddr *ifa = NULL;
	int err = 0;

	/* see comments in flow_route_configure() regarding loopback */
	ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);

	switch (af) {
	case AF_INET: {
		ifnet_lock_shared(ifp);
		/* refuse an address that is being detached from the ifp */
		if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("route to %s has src address marked detaching "
			    "(err %d)", inet_ntop(AF_INET,
			    &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
			ifnet_lock_done(ifp);
			break;
		}
		SIN(src)->sin_len = sizeof(struct sockaddr_in);
		SIN(src)->sin_family = AF_INET;
		SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
		ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
		/* snapshot gencnt under the same lock as the address read */
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	case AF_INET6: {
		struct in6_addr src_storage, *in6;
		struct route_in6 ro = {};
		uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
		ro.ro_rt = rt;

		if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
		    ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro)) == NULL) {
			if (err == 0) {
				err = EADDRNOTAVAIL;
			}
			VERIFY(src_ifp == NULL);
			SK_ERR("src address to dst %s on %s not available "
			    "(err %d)", inet_ntop(AF_INET6,
			    &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
			    ifp->if_xname, err);
			break;
		}

		VERIFY(src_ifp != NULL);
		VERIFY(ifa != NULL);

		/* selection must land on the requested interface */
		if (__improbable(src_ifp != ifp)) {
			if (err == 0) {
				err = ENETUNREACH;
			}
			SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
			    inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
			    dst_s, sizeof(dst_s)),
			    inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
			    src_s, sizeof(src_s)),
			    src_ifp->if_xname, ifp->if_xname, err);
			break;
		}

		ifnet_lock_shared(ifp);
		if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
			err = EHOSTUNREACH;
			SK_ERR("IPv6 address selected is marked to be "
			    "detached (err %d)", err);
			ifnet_lock_done(ifp);
			break;
		}

		/* clear embedded scope if link-local src */
		if (IN6_IS_SCOPE_EMBED(in6)) {
			if (in6_embedded_scope) {
				SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
				in6->s6_addr16[1] = 0;
			} else {
				SIN6(src)->sin6_scope_id = src_ifp->if_index;
			}
		}
		SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(src)->sin6_family = AF_INET6;
		SIN6(src)->sin6_addr = *in6;
		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
		/* snapshot gencnt under the same lock as the validity check */
		*ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
		ifnet_lock_done(ifp);
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* drop references handed back by in6_selectsrc_core(), if any */
	if (ifa != NULL) {
		IFA_REMREF(ifa);
	}

	if (src_ifp != NULL) {
		ifnet_release(src_ifp);
	}

#if SK_LOG
	if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
		SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
		    sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

	return err;
}
1360 
/*
 * Detach a flow route from the routing subsystem: de-register its route
 * event handler (asynchronously, by enqueueing a deregister work item)
 * and drop the cached gateway route reference.  Must be called with
 * fr_lock held.  Must NOT be invoked from the eventhandler callback
 * context, as deregistering there can deadlock in the eventhandler
 * code (see flow_route_ev_callback()).
 */
void
flow_route_cleanup(struct flow_route *fr)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */

	FR_LOCK_ASSERT_HELD(fr);

	if (fr->fr_rt_evhdlr_tag != NULL) {
		ASSERT(fr->fr_rt_dst != NULL);
		/* deregistration happens asynchronously on the nwk wq */
		route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
		    ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
		fr->fr_rt_evhdlr_tag = NULL;
		fr->fr_rt_dst = NULL;
	}
	ASSERT(fr->fr_rt_dst == NULL);
	if (fr->fr_rt_gw != NULL) {
		rtfree(fr->fr_rt_gw);
		fr->fr_rt_gw = NULL;
	}

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
	} else if (fr->fr_flags & FLOWRTF_ONLINK) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
	}
#endif /* SK_LOG */

	atomic_bitclear_32(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK));
}
1402 
1403 static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr * src_ip0,uint8_t ip_v,struct ifnet * ifp,uint32_t * gencnt)1404 _flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
1405     struct ifnet *ifp, uint32_t *gencnt)
1406 {
1407 	boolean_t address_found = TRUE;
1408 	struct ifaddr *ifa = NULL;
1409 	struct flow_ip_addr src_ip = {};
1410 	uint32_t scope = ifp->if_index;
1411 
1412 	VERIFY(gencnt != NULL);
1413 	VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
1414 
1415 	if (ip_v == IPVERSION) {
1416 		memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
1417 
1418 		ifa = (struct ifaddr *)ifa_foraddr_scoped(
1419 			src_ip._v4.s_addr, scope);
1420 	} else {
1421 		memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
1422 
1423 		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
1424 			src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
1425 		}
1426 		ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
1427 		    scope);
1428 	}
1429 
1430 	if (__improbable(ifa == NULL)) {
1431 		address_found = FALSE;
1432 		goto done;
1433 	}
1434 
1435 	ifnet_lock_shared(ifp);
1436 	if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1437 		address_found = FALSE;
1438 		ifnet_lock_done(ifp);
1439 		goto done;
1440 	}
1441 
1442 	if (ip_v == IPV6_VERSION) {
1443 		struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
1444 
1445 		/*
1446 		 * Fail if IPv6 address is not ready or if the address
1447 		 * is reserved * for CLAT46.
1448 		 */
1449 		if (__improbable(ia6->ia6_flags &
1450 		    (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
1451 			address_found = FALSE;
1452 			ifnet_lock_done(ifp);
1453 			goto done;
1454 		}
1455 	} else {
1456 		/*
1457 		 * If interface has CLAT46 enabled, fail IPv4 bind.
1458 		 * Since this implies network is NAT64/DNS64, Internet
1459 		 * effectively becomes reachable over IPv6.  If on
1460 		 * system IPv4 to IPv6 translation is required, that
1461 		 * should be handled solely through bump in the API.
1462 		 * The in kernel translation is only done for apps
1463 		 * directly using low level networking APIs.
1464 		 */
1465 		if (__improbable(IS_INTF_CLAT46(ifp))) {
1466 			address_found = FALSE;
1467 			ifnet_lock_done(ifp);
1468 			goto done;
1469 		}
1470 	}
1471 
1472 	*gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1473 	ifnet_lock_done(ifp);
1474 done:
1475 	if (ifa != NULL) {
1476 		IFA_REMREF(ifa);
1477 	}
1478 
1479 	return address_found;
1480 }
1481 
1482 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1483 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1484     uint32_t *gencnt)
1485 {
1486 	VERIFY(saddr->sa.sa_family == AF_INET ||
1487 	    saddr->sa.sa_family == AF_INET6);
1488 
1489 	struct flow_ip_addr *ipa;
1490 	uint8_t ipv;
1491 	if (saddr->sa.sa_family == AF_INET) {
1492 		ipv = IPVERSION;
1493 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1494 	} else {
1495 		ipv = IPV6_VERSION;
1496 		ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1497 	}
1498 
1499 	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1500 }
1501 
1502 boolean_t
flow_route_key_validate(struct flow_key * fk,struct ifnet * ifp,uint32_t * gencnt)1503 flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1504     uint32_t *gencnt)
1505 {
1506 	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
1507 	           gencnt);
1508 }
1509