1 /*
2 * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Flow Routes.
31 *
32 * Each (non-listener) flow entry is always associated with a flow route
33 * object. Multiple flow entries sharing the same remote address will use
34 * the same flow route for that address. The flow route object contains
35 * the route information for the remote node. It gets allocated when a
36 * flow entry requests to connect, and is garbage-collected when it's no
37 * longer referred to after its expiration time has passed.
38 *
39 * A flow route also contains the default local address that's used to
40 * reach the remote node. This may not necessarily be the same local
41 * address used by the flow entry, if it has explicitly bound the entry
42 * to another local address. But for the majority of cases, having the
43 * local address be present in the flow route allows us to avoid doing
44 * source address selection each time a connect request happens.
45 *
46 * When the remote node is reachable via a gateway, the gateway address
47 * portion of the flow route contains its IP address and the flow route
48 * is marked with FLOWRTF_GATEWAY. We use this to optimize the gateway
49 * route lookup, since otherwise we'd have to perform an extra lookup
50 * each time we need to resolve the route.
51 *
52 * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53 * is set, and the gateway address isn't used. The target address used
 * for resolution will be the remote address itself.
55 *
56 * On links with link-layer information, we store the resolved address
57 * of the target node (which may be the gateway's) in the flow route,
58 * and mark the flow route with FLOWRTF_HAS_LLINFO.
59 *
60 * Each flow route also registers itself to receive route events when
61 * the underlying rtentry is updated or deleted.
62 */
63
64 #include <skywalk/os_skywalk_private.h>
65
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75
extern struct rtstat rtstat;

/* One lock group/attr pair shared by every flow-route lock in this file */
static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);

/* Forward declarations for file-local helpers defined below */
static int fr_cmp(const struct flow_route *, const struct flow_route *);
static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
static struct flow_route *fr_alloc(boolean_t);
static void fr_free(struct flow_route *);
static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
    uint32_t *, boolean_t, boolean_t);
static void flow_route_ev_callback(struct eventhandler_entry_arg,
    struct sockaddr *, int, struct sockaddr *, int);

/* Generate RB-tree bodies: remote-address-keyed and UUID-keyed lookup trees */
RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);

#define FR_ZONE_NAME    "flow.route"

static unsigned int flow_route_size;    /* size of flow_route */
struct skmem_cache *flow_route_cache;   /* cache for flow_route */

static int __flow_route_inited = 0;

/* idle flow routes become GC-eligible this many seconds after last use */
#define FLOW_ROUTE_EXPIRE       600     /* seconds */
static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
105
106 void
flow_route_init(void)107 flow_route_init(void)
108 {
109 ASSERT(!__flow_route_inited);
110
111 flow_route_size = sizeof(struct flow_route);
112 flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
113 sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
114
115 __flow_route_inited = 1;
116 }
117
118 void
flow_route_fini(void)119 flow_route_fini(void)
120 {
121 if (__flow_route_inited) {
122 skmem_cache_destroy(flow_route_cache);
123 flow_route_cache = NULL;
124
125 __flow_route_inited = 0;
126 }
127 }
128
/*
 * Allocate an array of frb_cnt flow route buckets, each padded to a CPU
 * cache line.  Returns the cache-aligned bucket array, and reports the
 * per-bucket stride via *frb_sz and the raw allocation size via *tot_sz
 * (needed later by flow_route_buckets_free()).  Returns NULL on failure.
 *
 * The raw (possibly unaligned) allocation pointer is stashed in the
 * pointer-sized slot immediately preceding the aligned array so the free
 * routine can recover it.
 */
struct flow_route_bucket *
flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
{
	uint32_t cache_sz = skmem_cpu_cache_line_size();
	struct flow_route_bucket *frb;
	void *frb_buf, **frb_pbuf;
	size_t frb_tot_sz;

	/* each bucket is CPU cache-aligned */
	*frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);

	/* total size includes extra for alignment requirements */
	*tot_sz = frb_tot_sz = (sizeof(void *) + (frb_cnt * (*frb_sz)) + cache_sz);
	// rdar://88962126
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	frb_buf = sk_alloc(frb_tot_sz, Z_WAITOK, skmem_tag_fsw_frb_hash);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */
	if (__improbable(frb_buf == NULL)) {
		return NULL;
	}

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly.  This is needed in order to get the rest of
	 * the structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	frb = (struct flow_route_bucket *)
	    P2ROUNDUP((intptr_t)frb_buf + sizeof(void *), cache_sz);
	/* stash the raw pointer just before the aligned array */
	frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
	ASSERT((intptr_t)frb_pbuf >= (intptr_t)frb_buf);
	ASSERT(((intptr_t)frb + (frb_cnt * (*frb_sz))) <=
	    ((intptr_t)frb_buf + frb_tot_sz));
	*frb_pbuf = frb_buf;

	SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
	    "(total %zu bytes, frb_buf 0x%llx) ALLOC", SK_KVA(frb), frb_cnt,
	    *frb_sz, frb_tot_sz, SK_KVA(frb_buf));

	return frb;
}
173
174 void
flow_route_buckets_free(struct flow_route_bucket * frb,size_t tot_sz)175 flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
176 {
177 void *frb_buf, **frb_pbuf;
178
179 /* get the original address that we stuffed in earlier and free it */
180 frb_pbuf = (void **)((intptr_t)frb - sizeof(void *));
181 frb_buf = *frb_pbuf;
182 SK_DF(SK_VERB_MEM, "frb 0x%llx (frb_buf 0x%llx) FREE",
183 SK_KVA(frb), SK_KVA(frb_buf));
184 // rdar://88962126
185 __typed_allocators_ignore_push
186 sk_free(frb_buf, tot_sz);
187 __typed_allocators_ignore_pop
188 }
189
190 void
flow_route_bucket_init(struct flow_route_bucket * frb)191 flow_route_bucket_init(struct flow_route_bucket *frb)
192 {
193 ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
194 lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
195 &flow_route_lock_attr);
196 RB_INIT(&frb->frb_head);
197 }
198
199 void
flow_route_bucket_destroy(struct flow_route_bucket * frb)200 flow_route_bucket_destroy(struct flow_route_bucket *frb)
201 {
202 ASSERT(RB_EMPTY(&frb->frb_head));
203 lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
204 }
205
206 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)207 flow_route_find_by_addr(struct flow_route_bucket *frb,
208 union sockaddr_in_4_6 *dst)
209 {
210 struct flow_route *fr;
211 struct flow_route find;
212
213 FRB_LOCK_ASSERT_HELD(frb);
214
215 switch (SA(dst)->sa_family) {
216 case AF_INET:
217 find.fr_af = AF_INET;
218 find.fr_addr_len = sizeof(struct in_addr);
219 find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
220 break;
221
222 case AF_INET6:
223 find.fr_af = AF_INET6;
224 find.fr_addr_len = sizeof(struct in6_addr);
225 find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
226 break;
227
228 default:
229 VERIFY(0);
230 /* NOTREACHED */
231 __builtin_unreachable();
232 }
233
234 fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
235 if (fr != NULL) {
236 flow_route_retain(fr); /* for the caller */
237 }
238 return fr;
239 }
240
/*
 * Allocate an array of frib_cnt UUID-keyed flow route buckets, each padded
 * to a CPU cache line.  Mirrors flow_route_buckets_alloc(): reports the
 * per-bucket stride via *frib_sz and the raw allocation size via *tot_sz,
 * and stashes the raw allocation pointer in the slot just before the
 * aligned array so flow_route_id_buckets_free() can recover it.
 * Returns NULL on allocation failure.
 */
struct flow_route_id_bucket *
flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
{
	uint32_t cache_sz = skmem_cpu_cache_line_size();
	struct flow_route_id_bucket *frib;
	void *frib_buf, **frib_pbuf;
	size_t frib_tot_sz;

	/* each bucket is CPU cache-aligned */
	*frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);

	/* total size includes extra for alignment requirements */
	*tot_sz = frib_tot_sz = (sizeof(void *) + (frib_cnt * (*frib_sz)) + cache_sz);
	// rdar://88962126
	/* BEGIN IGNORE CODESTYLE */
	__typed_allocators_ignore_push
	frib_buf = sk_alloc(frib_tot_sz, Z_WAITOK, skmem_tag_fsw_frib_hash);
	__typed_allocators_ignore_pop
	/* END IGNORE CODESTYLE */
	if (__improbable(frib_buf == NULL)) {
		return NULL;
	}

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly.  This is needed in order to get the rest of
	 * the structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	frib = (struct flow_route_id_bucket *)
	    P2ROUNDUP((intptr_t)frib_buf + sizeof(void *), cache_sz);
	/* stash the raw pointer just before the aligned array */
	frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
	ASSERT((intptr_t)frib_pbuf >= (intptr_t)frib_buf);
	ASSERT(((intptr_t)frib + (frib_cnt * (*frib_sz))) <=
	    ((intptr_t)frib_buf + frib_tot_sz));
	*frib_pbuf = frib_buf;

	SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
	    "(total %zu bytes, frib_buf 0x%llx) ALLOC", SK_KVA(frib), frib_cnt,
	    *frib_sz, frib_tot_sz, SK_KVA(frib_buf));

	return frib;
}
285
286 void
flow_route_id_buckets_free(struct flow_route_id_bucket * frib,size_t tot_sz)287 flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
288 {
289 void *frib_buf, **frib_pbuf;
290
291 /* get the original address that we stuffed in earlier and free it */
292 frib_pbuf = (void **)((intptr_t)frib - sizeof(void *));
293 frib_buf = *frib_pbuf;
294 SK_DF(SK_VERB_MEM, "frib 0x%llx (frib_buf 0x%llx) FREE", SK_KVA(frib),
295 SK_KVA(frib_buf));
296 // rdar://88962126
297 __typed_allocators_ignore_push
298 sk_free(frib_buf, tot_sz);
299 __typed_allocators_ignore_pop
300 }
301
302 void
flow_route_id_bucket_init(struct flow_route_id_bucket * frib)303 flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
304 {
305 ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
306 lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
307 &flow_route_lock_attr);
308 RB_INIT(&frib->frib_head);
309 }
310
311 void
flow_route_id_bucket_destroy(struct flow_route_id_bucket * frib)312 flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
313 {
314 ASSERT(RB_EMPTY(&frib->frib_head));
315 lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
316 }
317
318 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)319 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
320 {
321 struct flow_route *fr;
322 struct flow_route find;
323
324 FRIB_LOCK_ASSERT_HELD(frib);
325
326 uuid_copy(find.fr_uuid, id);
327 fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
328 if (fr != NULL) {
329 flow_route_retain(fr); /* for the caller */
330 }
331 return fr;
332 }
333
334 static struct flow_route *
fr_alloc(boolean_t cansleep)335 fr_alloc(boolean_t cansleep)
336 {
337 struct flow_route *fr;
338
339 if ((fr = skmem_cache_alloc(flow_route_cache,
340 (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) {
341 bzero(fr, flow_route_size);
342 lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
343 &flow_route_lock_attr);
344 lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
345 &flow_route_lock_attr);
346 uuid_generate_random(fr->fr_uuid);
347
348 SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
349 }
350
351 return fr;
352 }
353
/*
 * Reclaim a flow route that holds no references and is no longer attached
 * to the lookup trees.  Tears down remaining route state (fr_rt_dst,
 * fr_rt_gw and the route event registration) via flow_route_cleanup(),
 * destroys the embedded locks, and returns the memory to the cache.
 */
static void
fr_free(struct flow_route *fr)
{
	SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));

	/* must be detached from both trees and fully unreferenced */
	VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
	VERIFY(fr->fr_usecnt == 0);

	FR_LOCK(fr);
	/* callee frees route entry */
	flow_route_cleanup(fr);
	VERIFY(fr->fr_rt_dst == NULL);
	VERIFY(fr->fr_rt_gw == NULL);
	VERIFY(fr->fr_rt_evhdlr_tag == NULL);
	FR_UNLOCK(fr);

	/* destroy locks only after the final unlock above */
	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);

	skmem_cache_free(flow_route_cache, fr);
}
375
376 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)377 fr_cmp(const struct flow_route *a, const struct flow_route *b)
378 {
379 int d;
380
381 if ((d = (a->fr_af - b->fr_af)) != 0) {
382 return d;
383 }
384 if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
385 b->fr_addr_len)) != 0) {
386 return d;
387 }
388
389 return 0;
390 }
391
/*
 * RB-tree comparator for the UUID-keyed tree: orders entries by their
 * 128-bit flow route UUID.
 */
static inline int
fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
{
	return uuid_compare(a->fr_uuid, b->fr_uuid);
}
397
398 static inline int
fr_use_stable_address(struct nx_flow_req * req)399 fr_use_stable_address(struct nx_flow_req *req)
400 {
401 int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
402 if (req != NULL &&
403 (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
404 use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
405 }
406 return use_stable_address;
407 }
408
/*
 * (Re)configure a flow route: resolve the destination route (and gateway
 * route, if any), register for route events on it, and select the default
 * local address used to reach the destination.  Called with fr_lock held.
 * Returns 0 on success or an errno (typically EHOSTUNREACH); on failure,
 * any partially-configured route state is cleaned up.
 */
int
flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
{
#if SK_LOG
	char old_s[MAX_IPv6_STR_LEN];   /* src */
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	struct rtentry *rt = NULL, *gwrt = NULL;
	int err = 0;

	FR_LOCK_ASSERT_HELD(fr);

	/*
	 * If there is a route entry for the final destination, see if
	 * it's no longer valid and perform another routing table lookup.
	 * A non-NULL fr_rt_dst is always associated with a route event
	 * registration, and the route reference is held there.
	 */
	rt = fr->fr_rt_dst;
	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
		struct eventhandler_entry_arg ee_arg;

		/* callee frees route entry */
		flow_route_cleanup(fr);

		/* lookup destination route */
		ASSERT(err == 0);
		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
		if (rt == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("no route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			/*
			 * If route points to another interface and the
			 * route's gateway isn't link-layer, reject it.
			 * We make an exception otherwise, since local
			 * interface addresses resolve this way.
			 */
			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
			    (rt->rt_gateway == NULL ||
			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
				err = EHOSTUNREACH;
				SK_ERR("route to %s on %s != %s (err %d)",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), rt->rt_ifp->if_xname,
				    ifp->if_xname, err);
			}
		}

		if (err != 0) {
			goto done;
		}

		ASSERT(fr->fr_mgr != NULL);
		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
		ASSERT(!uuid_is_null(fr->fr_uuid));
		ASSERT(!uuid_is_null(fr->fr_nx_uuid));

		/* identify this flow route (manager + route UUIDs) to the
		 * event callback so it can find us later */
		bzero(&ee_arg, sizeof(ee_arg));
		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);

		/*
		 * Register for changes on destination route; this covers both
		 * cases where the destination is on-link, or if it is off-link
		 * and is using a gateway route.  This also transfers the refcnt
		 * of the route entry to the event handler, released later when
		 * it is deregistered.
		 */
		ASSERT(fr->fr_rt_dst == NULL);
		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
		fr->fr_rt_dst = rt;             /* move reference to fr */
		fr->fr_rt_evhdlr_tag =
		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
		    flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_DELETED);

		/*
		 * Lookup gateway route (if any); returns locked gwrt
		 * with a reference bumped up.
		 */
		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
		if (err != 0) {
			/*
			 * Reference held by fr_rt_dst will be taken
			 * care of by flow_route_cleanup() below, so
			 * make sure we don't do an extra rtfree().
			 */
			rt = NULL;
			ASSERT(gwrt == NULL);
			SK_ERR("no gw route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* if RTF_GATEWAY isn't set, gwrt == rt */
		ASSERT(gwrt != NULL);
		RT_LOCK_ASSERT_HELD(gwrt);

		/*
		 * Must have been cleared via cleanup, and that we're
		 * single-threaded here for fr by virtue of fr_lock.
		 */
		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));

		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
		    (rt->rt_gateway->sa_family == AF_INET ||
		    rt->rt_gateway->sa_family == AF_INET6)) {
			struct sockaddr_storage ss;

			ASSERT(fr->fr_rt_gw == NULL);
			/* locked via route_to_gwroute() above */
			fr->fr_rt_gw = gwrt;    /* move reference to fr */
			RT_ADDREF_LOCKED(gwrt); /* for this routine */
			/*
			 * Destination is off-link and is reachable
			 * thru an IP gateway route.  Save the IP
			 * address of the gateway in fr_gaddr.
			 */
			(void) sa_copy(rt->rt_gateway, &ss, NULL);
			_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
			atomic_bitset_32(&fr->fr_flags, FLOWRTF_GATEWAY);
		} else if (IS_DIRECT_HOSTROUTE(rt)) {
			/*
			 * Destination is on-link.
			 */
			atomic_bitset_32(&fr->fr_flags, FLOWRTF_ONLINK);
		}
		RT_UNLOCK(gwrt);
	}
	RT_ADDREF(rt);          /* for this routine */

	/* see if we need to re-select default source address */
	int use_stable_address = fr_use_stable_address(req);
	if (fr->fr_want_configure ||
	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
		/* remember old source to log a change below */
		union sockaddr_in_4_6 old = fr->fr_laddr;
		if (use_stable_address) {
			atomic_bitset_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
		} else {
			atomic_bitclear_32(&fr->fr_flags, FLOWRTF_STABLE_ADDR);
		}
		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
			SK_ERR("no usable src address to reach %s on %s "
			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
			SK_ERR("src address is now %s (was %s) to reach %s "
			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
			    dst_s, sizeof(dst_s)), ifp->if_xname);
		}
	}
	ASSERT(err == 0);

done:
	if (__probable(err == 0)) {
		atomic_set_32(&fr->fr_want_configure, 0);
	} else {
		/* callee frees route entry */
		flow_route_cleanup(fr);
	}

	/* drop this routine's references (gwrt may alias rt; see above) */
	if (gwrt != NULL) {
		ASSERT(rt != NULL);
		if (gwrt == rt) {
			RT_REMREF(gwrt);
		} else {
			rtfree(gwrt);
		}
		gwrt = NULL;
	}

	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return err;
}
600
/*
 * Find (or create) the flow route for the remote address in req->nfr_daddr.
 * Fast path: reader-locked lookup in the address bucket, reconfiguring a
 * cached entry only if stale.  Slow path: upgrade to writer, re-check for a
 * racing insert, then allocate, configure, insert into both lookup trees,
 * and kick off nexus-specific resolution via fr_resolve.  On success, *frp
 * holds a referenced flow route the caller must flow_route_release().
 */
int
flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
    struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *arg, struct flow_route **frp)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
#endif /* SK_LOG */
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	struct flow_route_bucket *frb;
	struct flow_route_id_bucket *frib;
	struct flow_route *fr = NULL;
	int err = 0;

	ASSERT(fr_ctor != NULL && fr_resolve != NULL);

	ASSERT(frp != NULL);
	*frp = NULL;

	frb = flow_mgr_get_frb_by_addr(fm, daddr);

	int use_stable_address = fr_use_stable_address(req);

	/* see if there is a cached flow route (as reader) */
	FRB_RLOCK(frb);
	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* reconfigure if stale: pending request, interface address
		 * generation changed, or stable/temporary preference flipped */
		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			atomic_add_32(&fr->fr_want_configure, 1);
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [R]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s " "on %s [R,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_RUNLOCK(frb);       /* reader */
		goto done;
	}

	/*
	 * Flow route doesn't exist; become a writer and prepare to
	 * allocate one.  We could be racing with other threads here,
	 * so check first if there is now a cached flow route that
	 * got created by the winning thread.
	 */
	if (!FRB_RLOCKTOWLOCK(frb)) {
		FRB_WLOCK(frb);
	}

	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		/* lost the race; another thread created it — reuse it */
		if (__improbable(fr->fr_want_configure) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [W]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s on %s [W,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_WUNLOCK(frb);       /* writer */
		goto done;
	}

	/* allocate one */
	fr = fr_alloc(TRUE);
	fr->fr_faddr = *daddr;          /* remote address */

	/* key the entry on the bare address; port is irrelevant here */
	switch (SA(&fr->fr_faddr)->sa_family) {
	case AF_INET:
		SIN(&fr->fr_faddr)->sin_port = 0;
		fr->fr_addr_len = sizeof(struct in_addr);
		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
		break;

	case AF_INET6:
		SIN6(&fr->fr_faddr)->sin6_port = 0;
		fr->fr_addr_len = sizeof(struct in6_addr);
		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(!uuid_is_null(fr->fr_uuid));
	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
	/* fr_mgr is declared const; cast away to set it once at creation */
	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;

	/* force configure newly-created flow route */
	atomic_add_32(&fr->fr_want_configure, 1);

	FR_LOCK(fr);
	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
		SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		FR_UNLOCK(fr);
		FRB_WUNLOCK(frb);       /* writer */
		/* not yet in tree, so free immediately */
		fr_free(fr);
		fr = NULL;
		goto done;
	}

	/* execute nexus-specific constructor */
	fr_ctor(arg, fr);
	FR_UNLOCK(fr);

	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
	FRIB_WLOCK(frib);

	/* back-pointers to owning buckets (const fields; set once) */
	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;

	FRB_WLOCK_ASSERT_HELD(frb);
	FRIB_WLOCK_ASSERT_HELD(frib);

	/* insert into both the address-keyed and UUID-keyed trees */
	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);

	atomic_bitset_32(&fr->fr_flags, FLOWRTF_ATTACHED);

#if DEBUG
	/* sanity checks for comparator routines */
	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
	flow_route_release(fr);
	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
	flow_route_release(fr);
#endif /* DEBUG */

	/* for the trees */
	_CASSERT(FLOW_ROUTE_MINREF == 2);
	flow_route_retain(fr);
	flow_route_retain(fr);
	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);

	/* for the caller */
	flow_route_retain(fr);

	FRIB_WUNLOCK(frib);     /* writer */
	FRB_WUNLOCK(frb);       /* writer */

	/* execute nexus-specific resolver */
	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
	    (err = fr_resolve(arg, fr, NULL)) != 0) {
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		}
		/* EJUSTRETURN means resolution is pending, not failed */
		if (err == EJUSTRETURN) {
			err = 0;
		} else {
			goto done;
		}
	}
	ASSERT(err == 0);

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
		    ifp->if_xname);
	} else {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

done:
	if (err == 0) {
		ASSERT(fr != NULL);
		*frp = fr;
	} else if (fr != NULL) {
		/* can't directly call fr_free() if it's in the tree */
		flow_route_release(fr);
		fr = NULL;
	}

	return err;
}
821
/*
 * Take a reference on a flow route.  Crossing from the minimum (trees-only)
 * refcount to an active count disarms the GC by clearing the expiration
 * time.
 */
void
flow_route_retain(struct flow_route *fr)
{
	lck_spin_lock(&fr->fr_reflock);
	/* post-increment: true exactly when the old count was MINREF */
	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
		fr->fr_expire = 0;      /* in active use; not GC-eligible */
	}
	lck_spin_unlock(&fr->fr_reflock);
}
831
/*
 * Drop a reference on a flow route.  While attached to the lookup trees,
 * dropping back to the minimum (trees-only) refcount arms the expiration
 * timer for the GC.  Once detached, the final release frees the entry.
 */
void
flow_route_release(struct flow_route *fr)
{
	bool should_free = false;

	lck_spin_lock(&fr->fr_reflock);
	VERIFY(fr->fr_usecnt > 0);
	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		/* post-decrement: true exactly when dropping to MINREF */
		if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
			fr->fr_expire = _net_uptime + flow_route_expire;
		}
	} else {
		/*
		 * fr is no longer in lookup tree, so there shouldn't be
		 * further usecnt, if we reach 0 usecnt, then this is the very
		 * last reference and is safe to unlock and call fr_free.
		 */
		if (--(fr->fr_usecnt) == 0) {
			should_free = true;
		}
	}
	lck_spin_unlock(&fr->fr_reflock);

	/* free outside the spinlock; fr_free() takes fr_lock (a mutex) */
	if (should_free) {
		fr_free(fr);
	}
}
859
/*
 * Remove flow routes from a bucket (and the corresponding UUID buckets).
 * Called with the bucket's writer lock held.  If "all" is set, every entry
 * is removed unconditionally (the caller holds writer locks on all buckets
 * and guarantees no extra references); otherwise only idle entries are
 * removed — those at the minimum refcount that have expired, are marked
 * FLOWRTF_DELETED, or whose expiry is bypassed via early_expire.  Reports
 * the number of surviving entries via *resid (if non-NULL) and returns the
 * number removed.
 */
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
    boolean_t all, boolean_t early_expire)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */
	struct flow_route *fr, *tfr;
	uint64_t now = net_uptime();
	uint32_t i = 0, tot = 0;

	FRB_WLOCK_ASSERT_HELD(frb);

	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
		struct flow_route_id_bucket *frib =
		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);

		++tot;
		/*
		 * We're not holding fr_lock here, since this is a
		 * best-effort check.  If there's a race and we miss
		 * it now, we'll come back again shortly.
		 */
		lck_spin_lock(&fr->fr_reflock);
		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !early_expire &&
		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
			/* still referenced or not yet expired; keep it */
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
			    "refcnt %u expire %llu", SK_KVA(fr),
			    fr->fr_usecnt, fr->fr_expire);
			continue;
		}
		lck_spin_unlock(&fr->fr_reflock);

		/*
		 * If "all" is set, flow entries must be gone by now, as
		 * we must be called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
		 * must be at its minimum value now.
		 */
		if (!all) {
			FRIB_WLOCK(frib);
		}
		FRIB_WLOCK_ASSERT_HELD(frib);

		_CASSERT(FLOW_ROUTE_MINREF == 2);
		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);

		/* unlink from both trees while holding both writer locks */
		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);

		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_ATTACHED);

#if SK_LOG
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
			    SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
			    (int64_t)(fr->fr_expire - now));
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    (int64_t)(fr->fr_expire - now));
		}
#endif /* SK_LOG */

		/* for the trees */
		flow_route_release(fr);
		flow_route_release(fr);
		++i;

		if (!all) {
			FRIB_WUNLOCK(frib);
		}
	}

	if (resid != NULL) {
		*resid = (tot - i);
	}

	return i;
}
951
952 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)953 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
954 {
955 (void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
956 }
957
/*
 * Opportunistically prune idle flow routes from one bucket.  First takes a
 * cheap reader-locked census of prunable entries; if any are found, tries
 * to upgrade to the writer lock and purges them.  When the interface is
 * down, expiration is bypassed (early_expire).  Reports the surviving
 * count via *resid (if non-NULL) and returns the number purged.
 */
static uint32_t
flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
    uint32_t *resid)
{
	uint64_t now = net_uptime();
	struct flow_route *fr;
	uint32_t i = 0, tot = 0;
	boolean_t ifdown = !(ifp->if_flags & IFF_UP);

	FRB_RLOCK(frb);
	RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
		++tot;
		/* loose check; do this without holding fr_reflock */
		if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !ifdown &&
		    !(fr->fr_flags & FLOWRTF_DELETED))) {
			continue;
		}
		++i;
	}

	/*
	 * If there's nothing to prune or there's a writer, we're done.
	 * Note that if we failed to upgrade to writer, the lock would
	 * have been released automatically.
	 */
	if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
		if (i == 0) {
			FRB_RUNLOCK(frb);
		}
		if (resid != NULL) {
			*resid = (tot - i);
		}
		return 0;
	}

	SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
	    i, ifp->if_xname);

	/* purge idle ones; holds the writer lock from the upgrade above */
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
	FRB_WUNLOCK(frb);

	return i;
}
1003
1004 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)1005 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
1006 uint32_t *tot_resid)
1007 {
1008 uint32_t pruned = 0;
1009 uint32_t resid;
1010 uint32_t i;
1011
1012 for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
1013 struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
1014 pruned += flow_route_bucket_prune(frb, ifp, &resid);
1015 if (tot_resid != NULL) {
1016 *tot_resid += resid;
1017 }
1018 }
1019
1020 return pruned;
1021 }
1022
1023 /*
1024 * This runs in the context of eventhandler invocation routine which loops
1025 * through all the registered callbacks. Care must be taken to not call
1026 * any primitives here that would lead to routing changes in the same context
1027 * as it would lead to deadlock in eventhandler code.
1028 */
1029 static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,struct sockaddr * dst,int route_ev,struct sockaddr * gw_addr,int flags)1030 flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
1031 struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags)
1032 {
1033 #pragma unused(dst, flags)
1034 #if SK_LOG
1035 char dst_s[MAX_IPv6_STR_LEN];
1036 #endif /* SK_LOG */
1037 struct flow_route_id_bucket *frib = NULL;
1038 struct flow_route *fr = NULL;
1039 struct flow_mgr *fm;
1040
1041 VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
1042 VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
1043
1044 /*
1045 * Upon success, callee will hold flow manager lock as reader,
1046 * and we'll need to unlock it below. Otherwise there's no
1047 * need to unlock here and just return.
1048 */
1049 fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
1050 if (fm == NULL) {
1051 SK_ERR("Event %s for dst %s ignored; flow manager not found",
1052 route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
1053 sizeof(dst_s)));
1054 return;
1055 }
1056
1057 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
1058 sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
1059
1060 do {
1061 frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
1062
1063 FRIB_RLOCK(frib);
1064 /* callee returns a reference that we need to release below */
1065 fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1066 if (fr == NULL) {
1067 SK_ERR("%s: dst %s flow route not found", fm->fm_name,
1068 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1069 break;
1070 }
1071
1072 /*
1073 * Grab fr_lock to prevent flow route configuration or
1074 * resolver from using stale info while we are updating.
1075 */
1076 FR_LOCK(fr);
1077
1078 switch (route_ev) {
1079 case ROUTE_ENTRY_REFRESH:
1080 /*
1081 * This is the case where the route entry has been
1082 * updated (for example through RTM_CHANGE). Some
1083 * of it may not warrant a lookup again and some of
1084 * it may. For now, mark flow to perform a look-up
1085 * again as the gateway may have changed.
1086 */
1087 atomic_add_32(&fr->fr_want_configure, 1);
1088 atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1089 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
1090 fm->fm_name, sk_sa_ntop(dst, dst_s,
1091 sizeof(dst_s)));
1092 break;
1093
1094 case ROUTE_ENTRY_DELETED:
1095 /*
1096 * NOTE: flow_route_cleanup() should not be called
1097 * to de-register eventhandler in the context of
1098 * eventhandler callback to avoid deadlock in
1099 * eventhandler code. Instead, just mark the flow
1100 * route un-resolved. When it is being used again
1101 * or being deleted the old eventhandler must be
1102 * de-registered.
1103 */
1104 atomic_add_32(&fr->fr_want_configure, 1);
1105 atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1106 atomic_bitset_32(&fr->fr_flags, FLOWRTF_DELETED);
1107 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
1108 fm->fm_name, sk_sa_ntop(dst, dst_s,
1109 sizeof(dst_s)));
1110 break;
1111
1112 case ROUTE_LLENTRY_STALE:
1113 /*
1114 * When the route entry is deemed unreliable or old
1115 * enough to trigger a route lookup again. Don't
1116 * reconfigure the flow route, but simply attempt
1117 * to resolve it next time to trigger a probe.
1118 */
1119 atomic_add_32(&fr->fr_want_probe, 1);
1120 atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1121 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
1122 fm->fm_name, sk_sa_ntop(dst, dst_s,
1123 sizeof(dst_s)));
1124 break;
1125
1126 case ROUTE_LLENTRY_CHANGED:
1127 /*
1128 * When the link-layer info has changed; replace
1129 * cached llinfo in the flow route (treat this
1130 * as ROUTE_LLENTRY_RESOLVED).
1131 */
1132 OS_FALLTHROUGH;
1133
1134 case ROUTE_LLENTRY_RESOLVED:
1135 /*
1136 * SDL address length may be 0 for cellular.
1137 * If Ethernet, copy into flow route and mark
1138 * it as cached. In all cases, mark the flow
1139 * route as resolved.
1140 */
1141 ASSERT(SDL(gw_addr)->sdl_family == AF_LINK);
1142 if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) {
1143 FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr)));
1144 SK_DF(SK_VERB_FLOW_ROUTE,
1145 "%s: dst %s llentry %s", fm->fm_name,
1146 sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
1147 (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
1148 "resolved" : "changed"));
1149 atomic_bitset_32(&fr->fr_flags,
1150 FLOWRTF_HAS_LLINFO);
1151 } else {
1152 atomic_bitclear_32(&fr->fr_flags,
1153 FLOWRTF_HAS_LLINFO);
1154 }
1155 atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1156 #if SK_LOG
1157 if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
1158 0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
1159 SK_DF(SK_VERB_FLOW_ROUTE,
1160 "%s: fr 0x%llx eth_type 0x%x "
1161 "eth_src %x:%x:%x:%x:%x:%x "
1162 "eth_dst %x:%x:%x:%x:%x:%x [%s])",
1163 fm->fm_name, SK_KVA(fr),
1164 ntohs(fr->fr_eth.ether_type),
1165 fr->fr_eth.ether_shost[0],
1166 fr->fr_eth.ether_shost[1],
1167 fr->fr_eth.ether_shost[2],
1168 fr->fr_eth.ether_shost[3],
1169 fr->fr_eth.ether_shost[4],
1170 fr->fr_eth.ether_shost[5],
1171 fr->fr_eth.ether_dhost[0],
1172 fr->fr_eth.ether_dhost[1],
1173 fr->fr_eth.ether_dhost[2],
1174 fr->fr_eth.ether_dhost[3],
1175 fr->fr_eth.ether_dhost[4],
1176 fr->fr_eth.ether_dhost[5],
1177 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1178 }
1179 #endif /* SK_LOG */
1180 break;
1181
1182 case ROUTE_LLENTRY_DELETED:
1183 /*
1184 * If the route entry points to a router and an
1185 * RTM_DELETE has been issued on it; force the
1186 * flow route to be reconfigured.
1187 */
1188 atomic_add_32(&fr->fr_want_configure, 1);
1189 atomic_bitclear_32(&fr->fr_flags,
1190 (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED));
1191 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
1192 fm->fm_name, sk_sa_ntop(dst, dst_s,
1193 sizeof(dst_s)));
1194 break;
1195
1196 case ROUTE_LLENTRY_PROBED:
1197 /*
1198 * When the resolver has begun probing the target;
1199 * nothing to do here.
1200 */
1201 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
1202 fm->fm_name, sk_sa_ntop(dst, dst_s,
1203 sizeof(dst_s)));
1204 break;
1205
1206 case ROUTE_LLENTRY_UNREACH:
1207 /*
1208 * When the route entry is marked with RTF_REJECT
1209 * or the probes have timed out, reconfigure.
1210 */
1211 atomic_add_32(&fr->fr_want_configure, 1);
1212 atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
1213 SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
1214 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1215 break;
1216
1217 default:
1218 break;
1219 }
1220 } while (0);
1221
1222 if (fr != NULL) {
1223 flow_route_release(fr);
1224 FR_UNLOCK(fr);
1225 }
1226
1227 if (frib != NULL) {
1228 FRIB_UNLOCK(frib);
1229 }
1230
1231 if (fm != NULL) {
1232 flow_mgr_unlock();
1233 }
1234 }
1235
1236 int
flow_route_select_laddr(union sockaddr_in_4_6 * src,union sockaddr_in_4_6 * dst,struct ifnet * ifp,struct rtentry * rt,uint32_t * ipaddr_gencnt,int use_stable_address)1237 flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
1238 struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
1239 int use_stable_address)
1240 {
1241 #if SK_LOG
1242 char src_s[MAX_IPv6_STR_LEN]; /* src */
1243 char dst_s[MAX_IPv6_STR_LEN]; /* dst */
1244 #endif /* SK_LOG */
1245 sa_family_t af = SA(dst)->sa_family;
1246 struct ifnet *src_ifp = NULL;
1247 struct ifaddr *ifa = NULL;
1248 int err = 0;
1249
1250 /* see comments in flow_route_configure() regarding loopback */
1251 ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);
1252
1253 switch (af) {
1254 case AF_INET: {
1255 ifnet_lock_shared(ifp);
1256 if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
1257 err = EHOSTUNREACH;
1258 SK_ERR("route to %s has src address marked detaching "
1259 "(err %d)", inet_ntop(AF_INET,
1260 &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
1261 ifnet_lock_done(ifp);
1262 break;
1263 }
1264 SIN(src)->sin_len = sizeof(struct sockaddr_in);
1265 SIN(src)->sin_family = AF_INET;
1266 SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
1267 ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
1268 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1269 ifnet_lock_done(ifp);
1270 break;
1271 }
1272
1273 case AF_INET6: {
1274 struct in6_addr src_storage, *in6;
1275 struct route_in6 ro = {};
1276 uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
1277 ro.ro_rt = rt;
1278
1279 if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
1280 ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro)) == NULL) {
1281 if (err == 0) {
1282 err = EADDRNOTAVAIL;
1283 }
1284 VERIFY(src_ifp == NULL);
1285 SK_ERR("src address to dst %s on %s not available "
1286 "(err %d)", inet_ntop(AF_INET6,
1287 &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
1288 ifp->if_xname, err);
1289 break;
1290 }
1291
1292 VERIFY(src_ifp != NULL);
1293 VERIFY(ifa != NULL);
1294
1295 if (__improbable(src_ifp != ifp)) {
1296 if (err == 0) {
1297 err = ENETUNREACH;
1298 }
1299 SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
1300 inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
1301 dst_s, sizeof(dst_s)),
1302 inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
1303 src_s, sizeof(src_s)),
1304 src_ifp->if_xname, ifp->if_xname, err);
1305 break;
1306 }
1307
1308 ifnet_lock_shared(ifp);
1309 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1310 err = EHOSTUNREACH;
1311 SK_ERR("IPv6 address selected is marked to be "
1312 "detached (err %d)", err);
1313 ifnet_lock_done(ifp);
1314 break;
1315 }
1316
1317 /* clear embedded scope if link-local src */
1318 if (IN6_IS_SCOPE_EMBED(in6)) {
1319 if (in6_embedded_scope) {
1320 SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
1321 in6->s6_addr16[1] = 0;
1322 } else {
1323 SIN6(src)->sin6_scope_id = src_ifp->if_index;
1324 }
1325 }
1326 SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
1327 SIN6(src)->sin6_family = AF_INET6;
1328 SIN6(src)->sin6_addr = *in6;
1329 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
1330 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1331 ifnet_lock_done(ifp);
1332 break;
1333 }
1334
1335 default:
1336 VERIFY(0);
1337 /* NOTREACHED */
1338 __builtin_unreachable();
1339 }
1340
1341 if (ifa != NULL) {
1342 IFA_REMREF(ifa);
1343 }
1344
1345 if (src_ifp != NULL) {
1346 ifnet_release(src_ifp);
1347 }
1348
1349 #if SK_LOG
1350 if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
1351 SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
1352 sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
1353 sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
1354 ifp->if_xname);
1355 }
1356 #endif /* SK_LOG */
1357
1358 return err;
1359 }
1360
/*
 * Tear down the routing state attached to a flow route: de-register
 * its route event handler (asynchronously, via the network work
 * queue) and drop the cached gateway route reference, then clear
 * the GATEWAY/ONLINK flags.
 *
 * Caller must hold fr_lock.  NOTE: must not be invoked from route
 * eventhandler callback context (see flow_route_ev_callback), as
 * the de-registration enqueued here would deadlock there.
 */
void
flow_route_cleanup(struct flow_route *fr)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */

	FR_LOCK_ASSERT_HELD(fr);

	/* de-register the route event handler, if one was registered */
	if (fr->fr_rt_evhdlr_tag != NULL) {
		ASSERT(fr->fr_rt_dst != NULL);
		route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
		    ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
		fr->fr_rt_evhdlr_tag = NULL;
		fr->fr_rt_dst = NULL;
	}
	ASSERT(fr->fr_rt_dst == NULL);
	/* drop the cached gateway route reference, if any */
	if (fr->fr_rt_gw != NULL) {
		rtfree(fr->fr_rt_gw);
		fr->fr_rt_gw = NULL;
	}

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
	} else if (fr->fr_flags & FLOWRTF_ONLINK) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "clean fr 0x%llx %s -> %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
		    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
	}
#endif /* SK_LOG */

	atomic_bitclear_32(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK));
}
1402
1403 static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr * src_ip0,uint8_t ip_v,struct ifnet * ifp,uint32_t * gencnt)1404 _flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
1405 struct ifnet *ifp, uint32_t *gencnt)
1406 {
1407 boolean_t address_found = TRUE;
1408 struct ifaddr *ifa = NULL;
1409 struct flow_ip_addr src_ip = {};
1410 uint32_t scope = ifp->if_index;
1411
1412 VERIFY(gencnt != NULL);
1413 VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
1414
1415 if (ip_v == IPVERSION) {
1416 memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
1417
1418 ifa = (struct ifaddr *)ifa_foraddr_scoped(
1419 src_ip._v4.s_addr, scope);
1420 } else {
1421 memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
1422
1423 if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
1424 src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
1425 }
1426 ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
1427 scope);
1428 }
1429
1430 if (__improbable(ifa == NULL)) {
1431 address_found = FALSE;
1432 goto done;
1433 }
1434
1435 ifnet_lock_shared(ifp);
1436 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1437 address_found = FALSE;
1438 ifnet_lock_done(ifp);
1439 goto done;
1440 }
1441
1442 if (ip_v == IPV6_VERSION) {
1443 struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
1444
1445 /*
1446 * Fail if IPv6 address is not ready or if the address
1447 * is reserved * for CLAT46.
1448 */
1449 if (__improbable(ia6->ia6_flags &
1450 (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
1451 address_found = FALSE;
1452 ifnet_lock_done(ifp);
1453 goto done;
1454 }
1455 } else {
1456 /*
1457 * If interface has CLAT46 enabled, fail IPv4 bind.
1458 * Since this implies network is NAT64/DNS64, Internet
1459 * effectively becomes reachable over IPv6. If on
1460 * system IPv4 to IPv6 translation is required, that
1461 * should be handled solely through bump in the API.
1462 * The in kernel translation is only done for apps
1463 * directly using low level networking APIs.
1464 */
1465 if (__improbable(IS_INTF_CLAT46(ifp))) {
1466 address_found = FALSE;
1467 ifnet_lock_done(ifp);
1468 goto done;
1469 }
1470 }
1471
1472 *gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1473 ifnet_lock_done(ifp);
1474 done:
1475 if (ifa != NULL) {
1476 IFA_REMREF(ifa);
1477 }
1478
1479 return address_found;
1480 }
1481
1482 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1483 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1484 uint32_t *gencnt)
1485 {
1486 VERIFY(saddr->sa.sa_family == AF_INET ||
1487 saddr->sa.sa_family == AF_INET6);
1488
1489 struct flow_ip_addr *ipa;
1490 uint8_t ipv;
1491 if (saddr->sa.sa_family == AF_INET) {
1492 ipv = IPVERSION;
1493 ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1494 } else {
1495 ipv = IPV6_VERSION;
1496 ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1497 }
1498
1499 return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1500 }
1501
/*
 * Validate the source address of a flow key on the given interface;
 * thin wrapper over _flow_route_laddr_validate() using the key's
 * own IP version field.
 */
boolean_t
flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
    uint32_t *gencnt)
{
	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
	           gencnt);
}
1509