1 /*
2 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Flow Routes.
31 *
32 * Each (non-listener) flow entry is always associated with a flow route
33 * object. Multiple flow entries sharing the same remote address will use
34 * the same flow route for that address. The flow route object contains
35 * the route information for the remote node. It gets allocated when a
36 * flow entry requests to connect, and is garbage-collected when it's no
37 * longer referred to after its expiration time has passed.
38 *
39 * A flow route also contains the default local address that's used to
40 * reach the remote node. This may not necessarily be the same local
41 * address used by the flow entry, if it has explicitly bound the entry
42 * to another local address. But for the majority of cases, having the
43 * local address be present in the flow route allows us to avoid doing
44 * source address selection each time a connect request happens.
45 *
46 * When the remote node is reachable via a gateway, the gateway address
47 * portion of the flow route contains its IP address and the flow route
48 * is marked with FLOWRTF_GATEWAY. We use this to optimize the gateway
49 * route lookup, since otherwise we'd have to perform an extra lookup
50 * each time we need to resolve the route.
51 *
52 * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53 * is set, and the gateway address isn't used. The target address used
54 * for resolution will be the remote address itself.
55 *
56 * On links with link-layer information, we store the resolved address
57 * of the target node (which may be the gateway's) in the flow route,
58 * and mark the flow route with FLOWRTF_HAS_LLINFO.
59 *
60 * Each flow route also registers itself to receive route events when
61 * the underlying rtentry is updated or deleted.
62 */
63
64 #include <skywalk/os_skywalk_private.h>
65
66 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67 #include <skywalk/nexus/flowswitch/fsw_var.h>
68 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
69
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_arp.h>
73 #include <netinet6/nd6.h>
74 #include <net/route.h>
75
/* Only declared here; the definition lives outside this file. */
extern struct rtstat_64 rtstat;

/* Lock group and attributes used by every lock initialized in this file. */
static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);

/* Forward declarations of file-local helpers. */
static int fr_cmp(const struct flow_route *, const struct flow_route *);
static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
static struct flow_route *fr_alloc(boolean_t);
static void fr_free(struct flow_route *);
static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
    uint32_t *, boolean_t, boolean_t);
static void flow_route_ev_callback(struct eventhandler_entry_arg,
    struct sockaddr *, int, struct sockaddr *, int);

/*
 * Two red-black trees index the same flow_route objects: one ordered by
 * fr_cmp() (address family, then address key) through fr_link, and one
 * ordered by fr_id_cmp() (flow route UUID) through fr_id_link.
 */
RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);

/* Allocation types for the address-keyed and UUID-keyed bucket arrays. */
KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT);
KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT);

#define FR_ZONE_NAME    "flow.route"

static unsigned int flow_route_size;            /* size of flow_route */
struct skmem_cache *flow_route_cache;           /* cache for flow_route */

/* Non-zero between flow_route_init() and flow_route_fini(). */
static int __flow_route_inited = 0;

/*
 * Number of seconds added to the current uptime when a flow route's
 * expiration is armed on release; tunable through the sysctl below.
 */
#define FLOW_ROUTE_EXPIRE       600     /* seconds */
static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;

SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
108
/*
 * One-time setup for flow route allocation: records the size of
 * struct flow_route and creates the skmem cache that fr_alloc() and
 * fr_free() use.  Asserts that initialization has not already happened.
 */
void
flow_route_init(void)
{
	ASSERT(!__flow_route_inited);

	flow_route_size = sizeof(struct flow_route);
	flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

	/* mark as ready so that flow_route_fini() knows there is work to undo */
	__flow_route_inited = 1;
}
120
121 void
flow_route_fini(void)122 flow_route_fini(void)
123 {
124 if (__flow_route_inited) {
125 skmem_cache_destroy(flow_route_cache);
126 flow_route_cache = NULL;
127
128 __flow_route_inited = 0;
129 }
130 }
131
132 struct flow_route_bucket *
133 __sized_by(*tot_sz)
flow_route_buckets_alloc(size_t frb_cnt,size_t * frb_sz,size_t * tot_sz)134 flow_route_buckets_alloc(size_t frb_cnt, size_t * frb_sz, size_t * tot_sz){
135 uint32_t cache_sz = skmem_cpu_cache_line_size();
136 struct flow_route_bucket *frb;
137 size_t frb_tot_sz;
138
139 /* each bucket is CPU cache-aligned */
140 *frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
141 *tot_sz = frb_tot_sz = frb_cnt * (*frb_sz);
142 frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK,
143 skmem_tag_fsw_frb_hash);
144 if (__improbable(frb == NULL)) {
145 return NULL;
146 }
147
148 #if !KASAN_CLASSIC
149 /*
150 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
151 * size alignment if the requested size is a multiple of a cacheline
152 * size (this is true for any size that is a power of two from 16 to
153 * PAGE_SIZE).
154 *
155 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
156 * not respect this.
157 */
158 ASSERT(IS_P2ALIGNED(frb, cache_sz));
159 #endif
160
161 SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
162 "(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt,
163 *frb_sz, frb_tot_sz);
164
165 return frb;
166 }
167
/*
 * Free a flow route bucket array.  tot_sz is handed to the allocator as
 * the size being released (presumably the *tot_sz value reported by
 * flow_route_buckets_alloc() -- confirm against callers).
 */
void
flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
{
	/* log the address before the storage is handed back */
	SK_DF(SK_VERB_MEM, "frb 0x%llx FREE", SK_KVA(frb));
	sk_free_type_hash(KT_SK_FRB, tot_sz, frb);
}
174
175 void
flow_route_bucket_init(struct flow_route_bucket * frb)176 flow_route_bucket_init(struct flow_route_bucket *frb)
177 {
178 #if !KASAN_CLASSIC
179 ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
180 #endif /* !KASAN_CLASSIC */
181 lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
182 &flow_route_lock_attr);
183 RB_INIT(&frb->frb_head);
184 }
185
/*
 * Tear down one address-keyed bucket.  The bucket's tree must already be
 * empty (asserted); only the lock is destroyed here.
 */
void
flow_route_bucket_destroy(struct flow_route_bucket *frb)
{
	ASSERT(RB_EMPTY(&frb->frb_head));
	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
}
192
193 static struct flow_route *
flow_route_find_by_addr(struct flow_route_bucket * frb,union sockaddr_in_4_6 * dst)194 flow_route_find_by_addr(struct flow_route_bucket *frb,
195 union sockaddr_in_4_6 *dst)
196 {
197 struct flow_route *fr;
198 struct flow_route find;
199
200 FRB_LOCK_ASSERT_HELD(frb);
201
202 switch (SA(dst)->sa_family) {
203 case AF_INET:
204 find.fr_af = AF_INET;
205 find.fr_addr_len = sizeof(struct in_addr);
206 find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
207 break;
208
209 case AF_INET6:
210 find.fr_af = AF_INET6;
211 find.fr_addr_len = sizeof(struct in6_addr);
212 find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
213 break;
214
215 default:
216 VERIFY(0);
217 /* NOTREACHED */
218 __builtin_unreachable();
219 }
220
221 fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
222 if (fr != NULL) {
223 flow_route_retain(fr); /* for the caller */
224 }
225 return fr;
226 }
227
228 struct flow_route_id_bucket *
229 __sized_by(*tot_sz)
flow_route_id_buckets_alloc(size_t frib_cnt,size_t * frib_sz,size_t * tot_sz)230 flow_route_id_buckets_alloc(size_t frib_cnt, size_t * frib_sz, size_t * tot_sz){
231 uint32_t cache_sz = skmem_cpu_cache_line_size();
232 struct flow_route_id_bucket *frib;
233 size_t frib_tot_sz;
234
235 /* each bucket is CPU cache-aligned */
236 *frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
237 *tot_sz = frib_tot_sz = frib_cnt * (*frib_sz);
238 frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK,
239 skmem_tag_fsw_frib_hash);
240 /* END IGNORE CODESTYLE */
241 if (__improbable(frib == NULL)) {
242 return NULL;
243 }
244
245 #if !KASAN_CLASSIC
246 /*
247 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
248 * size alignment if the requested size is a multiple of a cacheline
249 * size (this is true for any size that is a power of two from 16 to
250 * PAGE_SIZE).
251 *
252 * Because this is an optimization only, it is OK to leave KASAN_CLASSIC
253 * not respect this.
254 */
255 ASSERT(IS_P2ALIGNED(frib, cache_sz));
256 #endif /* !KASAN_CLASSIC */
257
258 SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
259 "(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt,
260 *frib_sz, frib_tot_sz);
261
262 return frib;
263 }
264
/*
 * Free a UUID-keyed bucket array.  tot_sz is handed to the allocator as
 * the size being released (presumably the *tot_sz value reported by
 * flow_route_id_buckets_alloc() -- confirm against callers).
 */
void
flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
{
	/* log the address before the storage is handed back */
	SK_DF(SK_VERB_MEM, "frib 0x%llx FREE", SK_KVA(frib));
	sk_free_type_hash(KT_SK_FRIB, tot_sz, frib);
}
271
272 void
flow_route_id_bucket_init(struct flow_route_id_bucket * frib)273 flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
274 {
275 #if !KASAN_CLASSIC
276 ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
277 #endif
278 lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
279 &flow_route_lock_attr);
280 RB_INIT(&frib->frib_head);
281 }
282
/*
 * Tear down one UUID-keyed bucket.  The bucket's tree must already be
 * empty (asserted); only the lock is destroyed here.
 */
void
flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
{
	ASSERT(RB_EMPTY(&frib->frib_head));
	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
}
289
290 static struct flow_route *
flow_route_find_by_uuid(struct flow_route_id_bucket * frib,uuid_t id)291 flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
292 {
293 struct flow_route *fr;
294 struct flow_route find;
295
296 FRIB_LOCK_ASSERT_HELD(frib);
297
298 uuid_copy(find.fr_uuid, id);
299 fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
300 if (fr != NULL) {
301 flow_route_retain(fr); /* for the caller */
302 }
303 return fr;
304 }
305
306 static struct flow_route *
fr_alloc(boolean_t cansleep)307 fr_alloc(boolean_t cansleep)
308 {
309 struct flow_route *fr;
310
311 fr = skmem_cache_alloc(flow_route_cache,
312 (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP));
313 if (fr == NULL) {
314 return NULL;
315 }
316 bzero(fr, flow_route_size);
317 lck_spin_init(&fr->fr_reflock, &flow_route_lock_group, &flow_route_lock_attr);
318 lck_mtx_init(&fr->fr_lock, &flow_route_lock_group, &flow_route_lock_attr);
319 uuid_generate_random(fr->fr_uuid);
320
321 SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
322 return fr;
323 }
324
325 static void
fr_free(struct flow_route * fr)326 fr_free(struct flow_route *fr)
327 {
328 SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));
329
330 VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
331 VERIFY(fr->fr_usecnt == 0);
332
333 FR_LOCK(fr);
334 /* callee frees route entry */
335 flow_route_cleanup(fr);
336 VERIFY(fr->fr_rt_dst == NULL);
337 VERIFY(fr->fr_rt_gw == NULL);
338 VERIFY(fr->fr_rt_evhdlr_tag == NULL);
339 FR_UNLOCK(fr);
340
341 lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
342 lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
343
344 skmem_cache_free(flow_route_cache, fr);
345 }
346
347 static inline int
fr_cmp(const struct flow_route * a,const struct flow_route * b)348 fr_cmp(const struct flow_route *a, const struct flow_route *b)
349 {
350 int d;
351
352 if ((d = (a->fr_af - b->fr_af)) != 0) {
353 return d;
354 }
355 if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
356 b->fr_addr_len)) != 0) {
357 return d;
358 }
359
360 return 0;
361 }
362
363 static inline int
fr_id_cmp(const struct flow_route * a,const struct flow_route * b)364 fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
365 {
366 return uuid_compare(a->fr_uuid, b->fr_uuid);
367 }
368
369 static inline int
fr_use_stable_address(struct nx_flow_req * req)370 fr_use_stable_address(struct nx_flow_req *req)
371 {
372 int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
373 if (req != NULL &&
374 (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
375 use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
376 }
377 return use_stable_address;
378 }
379
/*
 * (Re)configure a flow route so that it has a usable destination route,
 * gateway/on-link information and a default local address for reaching
 * fr_faddr through ifp.
 *
 * The caller must hold fr's lock.  req may be NULL; it is only passed to
 * fr_use_stable_address() to determine the stable-address preference.
 *
 * Returns 0 on success, in which case fr_want_configure is cleared.
 * Otherwise returns EHOSTUNREACH when no acceptable route to the
 * destination exists, or the error reported by route_to_gwroute() or
 * flow_route_select_laddr().  On any failure the route state held by fr
 * is released through flow_route_cleanup().
 */
int
flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
{
#if SK_LOG
	char old_s[MAX_IPv6_STR_LEN];   /* previous src */
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
#endif /* SK_LOG */
	struct rtentry *rt = NULL, *__single gwrt = NULL;
	int err = 0;

	FR_LOCK_ASSERT_HELD(fr);

	/*
	 * If there is a route entry for the final destination, see if
	 * it's no longer valid and perform another routing table lookup.
	 * A non-NULL fr_rt_dst is always associated with a route event
	 * registration, and the route reference is held there.
	 */
	rt = fr->fr_rt_dst;
	if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
		struct eventhandler_entry_arg ee_arg;

		/* callee frees route entry */
		flow_route_cleanup(fr);

		/* lookup destination route */
		ASSERT(err == 0);
		rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
		if (rt == NULL) {
			err = EHOSTUNREACH;
			SK_ERR("no route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			/*
			 * If route points to another interface and the
			 * route's gateway isn't link-layer, reject it.
			 * We make an exception otherwise, since local
			 * interface addresses resolve this way.
			 */
			if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
			    (rt->rt_gateway == NULL ||
			    SA(rt->rt_gateway)->sa_family != AF_LINK)) {
				err = EHOSTUNREACH;
				SK_ERR("route to %s on %s != %s (err %d)",
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), rt->rt_ifp->if_xname,
				    ifp->if_xname, err);
			}
		}

		/*
		 * On failure, a non-NULL rt still carries the reference
		 * from the lookup above; it is dropped at "done".
		 */
		if (err != 0) {
			goto done;
		}

		ASSERT(fr->fr_mgr != NULL);
		ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
		ASSERT(!uuid_is_null(fr->fr_uuid));
		ASSERT(!uuid_is_null(fr->fr_nx_uuid));

		/* the callback argument identifies fr by manager and route UUID */
		bzero(&ee_arg, sizeof(ee_arg));
		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);

		/*
		 * Register for changes on destination route; this covers both
		 * cases where the destination is on-link, or if it is off-link
		 * and is using a gateway route.  This also transfers the refcnt
		 * of the route entry to the event handler, released later when
		 * it is deregistered.
		 */
		ASSERT(fr->fr_rt_dst == NULL);
		ASSERT(fr->fr_rt_evhdlr_tag == NULL);
		fr->fr_rt_dst = rt;             /* move reference to fr */
		fr->fr_rt_evhdlr_tag =
		    EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
		    &flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
		ASSERT(fr->fr_rt_evhdlr_tag != NULL);
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed);

		/*
		 * Lookup gateway route (if any); returns locked gwrt
		 * with a reference bumped up.
		 */
		err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
		if (err != 0) {
			/*
			 * Reference held by fr_rt_dst will be taken
			 * care of by flow_route_cleanup() below, so
			 * make sure we don't do an extra rtfree().
			 */
			rt = NULL;
			ASSERT(gwrt == NULL);
			SK_ERR("no gw route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* if RTF_GATEWAY isn't set, gwrt == rt */
		ASSERT(gwrt != NULL);
		RT_LOCK_ASSERT_HELD(gwrt);

		/*
		 * Must have been cleared via cleanup, and that we're
		 * single-threaded here for fr by virtue of fr_lock.
		 */
		ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));

		if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
		    (rt->rt_gateway->sa_family == AF_INET ||
		    rt->rt_gateway->sa_family == AF_INET6)) {
			struct sockaddr_storage ss;

			ASSERT(fr->fr_rt_gw == NULL);
			/* locked via route_to_gwroute() above */
			fr->fr_rt_gw = gwrt;    /* move reference to fr */
			RT_ADDREF_LOCKED(gwrt); /* for this routine */
			/*
			 * Destination is off-link and is reachable
			 * thru an IP gateway route.  Save the IP
			 * address of the gateway in fr_gaddr.
			 */
			(void) sa_copy(rt->rt_gateway, &ss, NULL);
			_CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
			os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed);
		} else if (IS_DIRECT_HOSTROUTE(rt)) {
			/*
			 * Destination is on-link.
			 */
			os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed);
		}
		RT_UNLOCK(gwrt);
	}
	/*
	 * Whether rt was just looked up or is the cached fr_rt_dst, take a
	 * reference of our own; it is dropped by the rtfree() at "done".
	 */
	RT_ADDREF(rt);          /* for this routine */

	/* see if we need to re-select default source address */
	int use_stable_address = fr_use_stable_address(req);
	if (fr->fr_want_configure ||
	    fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
	    !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
		/* keep the old address so that a change can be logged below */
		union sockaddr_in_4_6 old = fr->fr_laddr;
		if (use_stable_address) {
			os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		} else {
			os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
		}
		if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
		    ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
			SK_ERR("no usable src address to reach %s on %s "
			    "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
			SK_ERR("src address is now %s (was %s) to reach %s "
			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
			    sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
			    sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
			    dst_s, sizeof(dst_s)), ifp->if_xname);
		}
	}
	ASSERT(err == 0);

done:
	if (__probable(err == 0)) {
		os_atomic_store(&fr->fr_want_configure, 0, release);
	} else {
		/* callee frees route entry */
		flow_route_cleanup(fr);
	}

	/* drop the gateway route reference taken for this routine */
	if (gwrt != NULL) {
		ASSERT(rt != NULL);
		if (gwrt == rt) {
			RT_REMREF(gwrt);
		} else {
			rtfree(gwrt);
		}
		gwrt = NULL;
	}

	/* drop the destination route reference taken for this routine */
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return err;
}
571
/*
 * Find the flow route for the destination in req->nfr_daddr, creating
 * and configuring one if none is cached yet.
 *
 * The address-keyed bucket is first searched as a reader.  On a miss the
 * lock is upgraded to writer and the bucket is searched again, in case
 * another thread inserted the route in the meantime.  A cached route is
 * re-configured through flow_route_configure() when it is flagged for
 * configuration or its stable-address setting differs from what req asks
 * for; the reader path also re-configures when the interface's address
 * generation count has changed.
 *
 * A newly created route is configured, passed to fr_ctor (with the
 * route's lock held), inserted into both the address and UUID trees, and
 * then passed to fr_resolve unless it is already marked FLOWRTF_RESOLVED.
 * EJUSTRETURN from fr_resolve is treated as success.
 *
 * On success returns 0 and stores in *frp a route carrying a reference
 * owned by the caller.  On failure returns a non-zero error and leaves
 * *frp set to NULL.
 */
int
flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
    struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *arg, struct flow_route **frp)
{
#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN];   /* src */
	char dst_s[MAX_IPv6_STR_LEN];   /* dst */
	char gw_s[MAX_IPv6_STR_LEN];    /* gw */
#endif /* SK_LOG */
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	struct flow_route_bucket *frb;
	struct flow_route_id_bucket *frib;
	struct flow_route *fr = NULL;
	int err = 0;

	ASSERT(fr_ctor != NULL && fr_resolve != NULL);

	ASSERT(frp != NULL);
	*frp = NULL;

	frb = flow_mgr_get_frb_by_addr(fm, daddr);

	int use_stable_address = fr_use_stable_address(req);

	/* see if there is a cached flow route (as reader) */
	FRB_RLOCK(frb);
	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
		    ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			/* flag the route so that flow_route_configure() redoes everything */
			os_atomic_inc(&fr->fr_want_configure, relaxed);
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [R]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s " "on %s [R,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_RUNLOCK(frb);       /* reader */
		goto done;
	}

	/*
	 * Flow route doesn't exist; become a writer and prepare to
	 * allocate one.  We could be racing with other threads here,
	 * so check first if there is now a cached flow route that
	 * got created by the winning thread.
	 */
	if (!FRB_RLOCKTOWLOCK(frb)) {
		/* a failed upgrade leaves us without the lock; take it afresh */
		FRB_WLOCK(frb);
	}

	fr = flow_route_find_by_addr(frb, daddr);
	if (fr != NULL) {
		if (__improbable(fr->fr_want_configure) ||
		    __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
			FR_LOCK(fr);
			err = flow_route_configure(fr, ifp, req);
			if (err != 0) {
				SK_ERR("fr 0x%llx error re-configuring dst %s "
				    "on %s (err %d) [W]", SK_KVA(fr),
				    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
				    sizeof(dst_s)), ifp->if_xname, err);
			}
			FR_UNLOCK(fr);
		}
		if (err == 0) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "fr 0x%llx found for dst %s on %s [W,%u]",
			    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
		}
		FRB_WUNLOCK(frb);       /* writer */
		goto done;
	}

	/* allocate one */
	fr = fr_alloc(TRUE);
	fr->fr_faddr = *daddr;          /* remote address */

	/* the tree key is the bare address; the port plays no part in it */
	switch (SA(&fr->fr_faddr)->sa_family) {
	case AF_INET:
		SIN(&fr->fr_faddr)->sin_port = 0;
		fr->fr_addr_len = sizeof(struct in_addr);
		fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
		break;

	case AF_INET6:
		SIN6(&fr->fr_faddr)->sin6_port = 0;
		fr->fr_addr_len = sizeof(struct in6_addr);
		fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(!uuid_is_null(fr->fr_uuid));
	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
	/* the cast writes through fr_mgr's declared type -- NOTE(review): presumably const-qualified */
	*(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;

	/* force configure newly-created flow route */
	os_atomic_inc(&fr->fr_want_configure, relaxed);

	FR_LOCK(fr);
	if ((err = flow_route_configure(fr, ifp, req)) != 0) {
		SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
		    SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		FR_UNLOCK(fr);
		FRB_WUNLOCK(frb);       /* writer */
		/* not yet in tree, so free immediately */
		fr_free(fr);
		fr = NULL;
		goto done;
	}

	/* execute nexus-specific constructor */
	fr_ctor(arg, fr);
	FR_UNLOCK(fr);

	/* the UUID bucket lock is taken while still holding the address bucket lock */
	frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
	FRIB_WLOCK(frib);

	/* record the owning buckets in the route (written through casts) */
	*(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
	*(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;

	FRB_WLOCK_ASSERT_HELD(frb);
	FRIB_WLOCK_ASSERT_HELD(frib);

	RB_INSERT(flow_route_tree, &frb->frb_head, fr);
	RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);

	os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if DEBUG
	/* sanity checks for comparator routines */
	VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
	flow_route_release(fr);
	VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
	flow_route_release(fr);
#endif /* DEBUG */

	/* for the trees */
	_CASSERT(FLOW_ROUTE_MINREF == 2);
	flow_route_retain(fr);
	flow_route_retain(fr);
	ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);

	/* for the caller */
	flow_route_retain(fr);

	FRIB_WUNLOCK(frib);     /* writer */
	FRB_WUNLOCK(frb);       /* writer */

	/* execute nexus-specific resolver */
	if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
	    (err = fr_resolve(arg, fr, NULL)) != 0) {
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		} else {
			SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
			    SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
			    "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
		}
		/* EJUSTRETURN is logged as "pending" and is not a failure */
		if (err == EJUSTRETURN) {
			err = 0;
		} else {
			goto done;
		}
	}
	ASSERT(err == 0);

#if SK_LOG
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
		    ifp->if_xname);
	} else {
		SK_DF(SK_VERB_FLOW_ROUTE,
		    "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
		    sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
		    ifp->if_xname);
	}
#endif /* SK_LOG */

done:
	if (err == 0) {
		ASSERT(fr != NULL);
		*frp = fr;
	} else if (fr != NULL) {
		/* can't directly call fr_free() if it's in the tree */
		flow_route_release(fr);
		fr = NULL;
	}

	return err;
}
792
793 void
flow_route_retain(struct flow_route * fr)794 flow_route_retain(struct flow_route *fr)
795 {
796 lck_spin_lock(&fr->fr_reflock);
797 if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
798 fr->fr_expire = 0;
799 }
800 lck_spin_unlock(&fr->fr_reflock);
801 }
802
803 static void
__flow_route_release(struct flow_route * fr,boolean_t renew)804 __flow_route_release(struct flow_route *fr, boolean_t renew)
805 {
806 bool should_free = false;
807
808 lck_spin_lock(&fr->fr_reflock);
809 VERIFY(fr->fr_usecnt > 0);
810 if (fr->fr_flags & FLOWRTF_ATTACHED) {
811 if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1) && renew) {
812 fr->fr_expire = _net_uptime + flow_route_expire;
813 }
814 } else {
815 /*
816 * fr is no longer in lookup tree, so there shouldn't be
817 * further usecnt, if we reach 0 usecnt, then this is the very
818 * last reference and is safe to unlock and call fr_free.
819 */
820 if (--(fr->fr_usecnt) == 0) {
821 should_free = true;
822 }
823 }
824 lck_spin_unlock(&fr->fr_reflock);
825
826 if (should_free) {
827 fr_free(fr);
828 }
829 }
830
831 void
flow_route_release(struct flow_route * fr)832 flow_route_release(struct flow_route *fr)
833 {
834 __flow_route_release(fr, true);
835 }
836
/*
 * Remove flow routes from an address-keyed bucket and drop the two
 * references held on behalf of the lookup trees.
 *
 * The caller must hold the bucket's writer lock.  When "all" is set,
 * every route in the bucket is removed and the corresponding UUID bucket
 * writer locks must already be held by the caller.  Otherwise a route is
 * skipped if it is referenced beyond FLOW_ROUTE_MINREF, or if it has not
 * yet expired, early_expire is not set and it is not marked
 * FLOWRTF_DELETED; the UUID bucket lock is taken and released here for
 * each route that is removed.
 *
 * Returns the number of routes removed.  If resid is non-NULL, it
 * receives the number of routes left in the bucket.
 */
static uint32_t
flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
    boolean_t all, boolean_t early_expire)
{
#if SK_LOG
	char ss[MAX_IPv6_STR_LEN];      /* src */
	char ds[MAX_IPv6_STR_LEN];      /* dst */
	char gs[MAX_IPv6_STR_LEN];      /* gw */
#endif /* SK_LOG */
	struct flow_route *fr, *tfr;
	uint64_t now = net_uptime();
	uint32_t i = 0, tot = 0;        /* removed, visited */

	FRB_WLOCK_ASSERT_HELD(frb);

	/* the "safe" iterator is needed since entries are removed while walking */
	RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
		struct flow_route_id_bucket *frib =
		    __DECONST(struct flow_route_id_bucket *, fr->fr_frib);

		++tot;
		/*
		 * We're not holding fr_lock here, since this is a
		 * best-effort check.  If there's a race and we miss
		 * it now, we'll come back again shortly.
		 */
		lck_spin_lock(&fr->fr_reflock);
		if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
		    (fr->fr_expire > now && !early_expire &&
		    !(fr->fr_flags & FLOWRTF_DELETED)))) {
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
			    "refcnt %u expire %llu", SK_KVA(fr),
			    fr->fr_usecnt, fr->fr_expire);
			continue;
		}
		lck_spin_unlock(&fr->fr_reflock);

		/*
		 * If "all" is set, flow entries must be gone by now, as
		 * we must be called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
		 * must be at its minimum value now.
		 */
		if (!all) {
			FRIB_WLOCK(frib);
		}
		FRIB_WLOCK_ASSERT_HELD(frib);

		_CASSERT(FLOW_ROUTE_MINREF == 2);
		ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);

		RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
		RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);

		/* from here on, releasing the last reference frees the route */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);

#if SK_LOG
		if (fr->fr_flags & FLOWRTF_GATEWAY) {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
			    SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
			    (int64_t)(fr->fr_expire - now));
		} else {
			SK_DF(SK_VERB_FLOW_ROUTE,
			    "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
			    sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
			    sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
			    (int64_t)(fr->fr_expire - now));
		}
#endif /* SK_LOG */

		/* for the trees */
		flow_route_release(fr);
		flow_route_release(fr);
		++i;

		if (!all) {
			FRIB_WUNLOCK(frib);
		}
	}

	if (resid != NULL) {
		*resid = (tot - i);
	}

	return i;
}
928
929 void
flow_route_bucket_purge_all(struct flow_route_bucket * frb)930 flow_route_bucket_purge_all(struct flow_route_bucket *frb)
931 {
932 (void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
933 }
934
935 static uint32_t
flow_route_bucket_prune(struct flow_route_bucket * frb,struct ifnet * ifp,uint32_t * resid)936 flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
937 uint32_t *resid)
938 {
939 uint64_t now = net_uptime();
940 struct flow_route *fr;
941 uint32_t i = 0, tot = 0;
942 boolean_t ifdown = !(ifp->if_flags & IFF_UP);
943
944 FRB_RLOCK(frb);
945 RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
946 ++tot;
947 /* loose check; do this without holding fr_reflock */
948 if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
949 (fr->fr_expire > now && !ifdown &&
950 !(fr->fr_flags & FLOWRTF_DELETED))) {
951 continue;
952 }
953 ++i;
954 }
955
956 /*
957 * If there's nothing to prune or there's a writer, we're done.
958 * Note that if we failed to upgrade to writer, the lock would
959 * have been released automatically.
960 */
961 if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
962 if (i == 0) {
963 FRB_RUNLOCK(frb);
964 }
965 if (resid != NULL) {
966 *resid = (tot - i);
967 }
968 return 0;
969 }
970
971 SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
972 i, ifp->if_xname);
973
974 /* purge idle ones */
975 i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
976 FRB_WUNLOCK(frb);
977
978 return i;
979 }
980
981 uint32_t
flow_route_prune(struct flow_mgr * fm,struct ifnet * ifp,uint32_t * tot_resid)982 flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
983 uint32_t *tot_resid)
984 {
985 uint32_t pruned = 0;
986 uint32_t resid;
987 uint32_t i;
988
989 for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
990 struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
991 pruned += flow_route_bucket_prune(frb, ifp, &resid);
992 if (tot_resid != NULL) {
993 *tot_resid += resid;
994 }
995 }
996
997 return pruned;
998 }
999
1000 /*
1001 * This runs in the context of eventhandler invocation routine which loops
1002 * through all the registered callbacks. Care must be taken to not call
1003 * any primitives here that would lead to routing changes in the same context
1004 * as it would lead to deadlock in eventhandler code.
1005 */
1006 static void
flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,struct sockaddr * dst,int route_ev,struct sockaddr * gw_addr_orig,int flags)1007 flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
1008 struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr_orig, int flags)
1009 {
1010 #pragma unused(dst, flags)
1011 #if SK_LOG
1012 char dst_s[MAX_IPv6_STR_LEN];
1013 #endif /* SK_LOG */
1014 struct flow_route_id_bucket *frib = NULL;
1015 struct flow_route *fr = NULL;
1016 struct flow_mgr *fm;
1017 boolean_t renew_fr = true;
1018
1019 VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
1020 VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
1021
1022 evhlog(debug, "%s: eventhandler saw event type=route_event event_code=%s",
1023 __func__, route_event2str(route_ev));
1024
1025 /*
1026 * Upon success, callee will hold flow manager lock as reader,
1027 * and we'll need to unlock it below. Otherwise there's no
1028 * need to unlock here and just return.
1029 */
1030 fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
1031 if (fm == NULL) {
1032 SK_ERR("Event %s for dst %s ignored; flow manager not found",
1033 route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
1034 sizeof(dst_s)));
1035 return;
1036 }
1037
1038 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
1039 sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
1040
1041 do {
1042 frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
1043
1044 FRIB_RLOCK(frib);
1045 /* callee returns a reference that we need to release below */
1046 fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1047 if (fr == NULL) {
1048 SK_ERR("%s: dst %s flow route not found", fm->fm_name,
1049 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1050 break;
1051 }
1052
1053 /*
1054 * Grab fr_lock to prevent flow route configuration or
1055 * resolver from using stale info while we are updating.
1056 */
1057 FR_LOCK(fr);
1058
1059 switch (route_ev) {
1060 case ROUTE_ENTRY_REFRESH:
1061 /*
1062 * This is the case where the route entry has been
1063 * updated (for example through RTM_CHANGE). Some
1064 * of it may not warrant a lookup again and some of
1065 * it may. For now, mark flow to perform a look-up
1066 * again as the gateway may have changed.
1067 */
1068 os_atomic_inc(&fr->fr_want_configure, relaxed);
1069 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1070 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
1071 fm->fm_name, sk_sa_ntop(dst, dst_s,
1072 sizeof(dst_s)));
1073 break;
1074
1075 case ROUTE_ENTRY_DELETED:
1076 /*
1077 * NOTE: flow_route_cleanup() should not be called
1078 * to de-register eventhandler in the context of
1079 * eventhandler callback to avoid deadlock in
1080 * eventhandler code. Instead, just mark the flow
1081 * route un-resolved. When it is being used again
1082 * or being deleted the old eventhandler must be
1083 * de-registered.
1084 */
1085 os_atomic_inc(&fr->fr_want_configure, relaxed);
1086 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1087 os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
1088 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
1089 fm->fm_name, sk_sa_ntop(dst, dst_s,
1090 sizeof(dst_s)));
1091 break;
1092
1093 case ROUTE_LLENTRY_STALE:
1094 /*
1095 * When the route entry is deemed unreliable or old
1096 * enough to trigger a route lookup again. Don't
1097 * reconfigure the flow route, but simply attempt
1098 * to resolve it next time to trigger a probe.
1099 */
1100 os_atomic_inc(&fr->fr_want_probe, relaxed);
1101 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1102 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
1103 fm->fm_name, sk_sa_ntop(dst, dst_s,
1104 sizeof(dst_s)));
1105 renew_fr = false;
1106 break;
1107
1108 case ROUTE_LLENTRY_CHANGED:
1109 /*
1110 * When the link-layer info has changed; replace
1111 * cached llinfo in the flow route (treat this
1112 * as ROUTE_LLENTRY_RESOLVED).
1113 */
1114 OS_FALLTHROUGH;
1115
1116 case ROUTE_LLENTRY_RESOLVED:
1117 {
1118 /*
1119 * SDL address length may be 0 for cellular.
1120 * If Ethernet, copy into flow route and mark
1121 * it as cached. In all cases, mark the flow
1122 * route as resolved.
1123 */
1124 /*
1125 * XXX Remove explicit __bidi_indexable once
1126 * rdar://119193012 lands
1127 */
1128 struct sockaddr_dl *__bidi_indexable gw_addr =
1129 (struct sockaddr_dl *__bidi_indexable) SDL(gw_addr_orig);
1130 ASSERT(gw_addr->sdl_family == AF_LINK);
1131 if (gw_addr->sdl_alen == ETHER_ADDR_LEN) {
1132 FLOWRT_UPD_ETH_DST(fr, LLADDR(gw_addr));
1133 SK_DF(SK_VERB_FLOW_ROUTE,
1134 "%s: dst %s llentry %s", fm->fm_name,
1135 sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
1136 (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
1137 "resolved" : "changed"));
1138 os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1139 } else {
1140 os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1141 }
1142 os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1143 #if SK_LOG
1144 if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
1145 0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
1146 SK_DF(SK_VERB_FLOW_ROUTE,
1147 "%s: fr 0x%llx eth_type 0x%x "
1148 "eth_src %x:%x:%x:%x:%x:%x "
1149 "eth_dst %x:%x:%x:%x:%x:%x [%s])",
1150 fm->fm_name, SK_KVA(fr),
1151 ntohs(fr->fr_eth.ether_type),
1152 fr->fr_eth.ether_shost[0],
1153 fr->fr_eth.ether_shost[1],
1154 fr->fr_eth.ether_shost[2],
1155 fr->fr_eth.ether_shost[3],
1156 fr->fr_eth.ether_shost[4],
1157 fr->fr_eth.ether_shost[5],
1158 fr->fr_eth.ether_dhost[0],
1159 fr->fr_eth.ether_dhost[1],
1160 fr->fr_eth.ether_dhost[2],
1161 fr->fr_eth.ether_dhost[3],
1162 fr->fr_eth.ether_dhost[4],
1163 fr->fr_eth.ether_dhost[5],
1164 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1165 }
1166 #endif /* SK_LOG */
1167 break;
1168 }
1169 case ROUTE_LLENTRY_DELETED:
1170 /*
1171 * If the route entry points to a router and an
1172 * RTM_DELETE has been issued on it; force the
1173 * flow route to be reconfigured.
1174 */
1175 os_atomic_inc(&fr->fr_want_configure, relaxed);
1176 os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed);
1177 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
1178 fm->fm_name, sk_sa_ntop(dst, dst_s,
1179 sizeof(dst_s)));
1180 break;
1181
1182 case ROUTE_LLENTRY_PROBED:
1183 /*
1184 * When the resolver has begun probing the target;
1185 * nothing to do here.
1186 */
1187 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
1188 fm->fm_name, sk_sa_ntop(dst, dst_s,
1189 sizeof(dst_s)));
1190 break;
1191
1192 case ROUTE_LLENTRY_UNREACH:
1193 /*
1194 * When the route entry is marked with RTF_REJECT
1195 * or the probes have timed out, reconfigure.
1196 */
1197 os_atomic_inc(&fr->fr_want_configure, relaxed);
1198 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1199 SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
1200 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1201 break;
1202
1203 default:
1204 break;
1205 }
1206 } while (0);
1207
1208 if (fr != NULL) {
1209 __flow_route_release(fr, renew_fr);
1210 FR_UNLOCK(fr);
1211 }
1212
1213 if (frib != NULL) {
1214 FRIB_UNLOCK(frib);
1215 }
1216
1217 if (fm != NULL) {
1218 flow_mgr_unlock();
1219 }
1220 }
1221
1222 int
flow_route_select_laddr(union sockaddr_in_4_6 * src,union sockaddr_in_4_6 * dst,struct ifnet * ifp,struct rtentry * rt,uint32_t * ipaddr_gencnt,int use_stable_address)1223 flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
1224 struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
1225 int use_stable_address)
1226 {
1227 #if SK_LOG
1228 char src_s[MAX_IPv6_STR_LEN]; /* src */
1229 char dst_s[MAX_IPv6_STR_LEN]; /* dst */
1230 #endif /* SK_LOG */
1231 sa_family_t af = SA(dst)->sa_family;
1232 struct ifnet *__single src_ifp = NULL;
1233 struct ifaddr *__single ifa = NULL;
1234 int err = 0;
1235
1236 /* see comments in flow_route_configure() regarding loopback */
1237 ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);
1238
1239 switch (af) {
1240 case AF_INET: {
1241 ifnet_lock_shared(ifp);
1242 if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
1243 err = EHOSTUNREACH;
1244 SK_ERR("route to %s has src address marked detaching "
1245 "(err %d)", inet_ntop(AF_INET,
1246 &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
1247 ifnet_lock_done(ifp);
1248 break;
1249 }
1250 SIN(src)->sin_len = sizeof(struct sockaddr_in);
1251 SIN(src)->sin_family = AF_INET;
1252 SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
1253 ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
1254 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1255 ifnet_lock_done(ifp);
1256 break;
1257 }
1258
1259 case AF_INET6: {
1260 struct in6_addr src_storage, *in6;
1261 struct route_in6 ro = {};
1262 uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
1263 ro.ro_rt = rt;
1264
1265 if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
1266 ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro, FALSE)) == NULL) {
1267 if (err == 0) {
1268 err = EADDRNOTAVAIL;
1269 }
1270 VERIFY(src_ifp == NULL);
1271 SK_ERR("src address to dst %s on %s not available "
1272 "(err %d)", inet_ntop(AF_INET6,
1273 &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
1274 ifp->if_xname, err);
1275 break;
1276 }
1277
1278 VERIFY(src_ifp != NULL);
1279 VERIFY(ifa != NULL);
1280
1281 if (__improbable(src_ifp != ifp)) {
1282 if (err == 0) {
1283 err = ENETUNREACH;
1284 }
1285 SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
1286 inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
1287 dst_s, sizeof(dst_s)),
1288 inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
1289 src_s, sizeof(src_s)),
1290 src_ifp->if_xname, ifp->if_xname, err);
1291 break;
1292 }
1293
1294 ifnet_lock_shared(ifp);
1295 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1296 err = EHOSTUNREACH;
1297 SK_ERR("IPv6 address selected is marked to be "
1298 "detached (err %d)", err);
1299 ifnet_lock_done(ifp);
1300 break;
1301 }
1302
1303 /* clear embedded scope if link-local src */
1304 if (IN6_IS_SCOPE_EMBED(in6)) {
1305 if (in6_embedded_scope) {
1306 SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
1307 in6->s6_addr16[1] = 0;
1308 } else {
1309 SIN6(src)->sin6_scope_id = src_ifp->if_index;
1310 }
1311 }
1312 SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
1313 SIN6(src)->sin6_family = AF_INET6;
1314 SIN6(src)->sin6_addr = *in6;
1315 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
1316 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1317 ifnet_lock_done(ifp);
1318 break;
1319 }
1320
1321 default:
1322 VERIFY(0);
1323 /* NOTREACHED */
1324 __builtin_unreachable();
1325 }
1326
1327 if (ifa != NULL) {
1328 ifa_remref(ifa);
1329 }
1330
1331 if (src_ifp != NULL) {
1332 ifnet_release(src_ifp);
1333 }
1334
1335 #if SK_LOG
1336 if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
1337 SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
1338 sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
1339 sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
1340 ifp->if_xname);
1341 }
1342 #endif /* SK_LOG */
1343
1344 return err;
1345 }
1346
1347 void
flow_route_cleanup(struct flow_route * fr)1348 flow_route_cleanup(struct flow_route *fr)
1349 {
1350 #if SK_LOG
1351 char ss[MAX_IPv6_STR_LEN]; /* dst */
1352 char ds[MAX_IPv6_STR_LEN]; /* dst */
1353 char gs[MAX_IPv6_STR_LEN]; /* gw */
1354 #endif /* SK_LOG */
1355
1356 FR_LOCK_ASSERT_HELD(fr);
1357
1358 if (fr->fr_rt_evhdlr_tag != NULL) {
1359 ASSERT(fr->fr_rt_dst != NULL);
1360 route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
1361 ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
1362 fr->fr_rt_evhdlr_tag = NULL;
1363 fr->fr_rt_dst = NULL;
1364 }
1365 ASSERT(fr->fr_rt_dst == NULL);
1366 if (fr->fr_rt_gw != NULL) {
1367 rtfree(fr->fr_rt_gw);
1368 fr->fr_rt_gw = NULL;
1369 }
1370
1371 #if SK_LOG
1372 if (fr->fr_flags & FLOWRTF_GATEWAY) {
1373 SK_DF(SK_VERB_FLOW_ROUTE,
1374 "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
1375 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1376 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
1377 sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
1378 } else if (fr->fr_flags & FLOWRTF_ONLINK) {
1379 SK_DF(SK_VERB_FLOW_ROUTE,
1380 "clean fr 0x%llx %s -> %s", SK_KVA(fr),
1381 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1382 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
1383 }
1384 #endif /* SK_LOG */
1385
1386 os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed);
1387 }
1388
1389 static boolean_t
_flow_route_laddr_validate(struct flow_ip_addr * src_ip0,uint8_t ip_v,struct ifnet * ifp,uint32_t * gencnt)1390 _flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
1391 struct ifnet *ifp, uint32_t *gencnt)
1392 {
1393 boolean_t address_found = TRUE;
1394 struct ifaddr *ifa = NULL;
1395 struct flow_ip_addr src_ip = {};
1396 uint32_t scope = ifp->if_index;
1397
1398 VERIFY(gencnt != NULL);
1399 VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
1400
1401 if (ip_v == IPVERSION) {
1402 memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
1403
1404 ifa = (struct ifaddr *)ifa_foraddr_scoped(
1405 src_ip._v4.s_addr, scope);
1406 } else {
1407 memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
1408
1409 if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
1410 src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
1411 }
1412 ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
1413 scope);
1414 }
1415
1416 if (__improbable(ifa == NULL)) {
1417 address_found = FALSE;
1418 goto done;
1419 }
1420
1421 ifnet_lock_shared(ifp);
1422 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1423 address_found = FALSE;
1424 ifnet_lock_done(ifp);
1425 goto done;
1426 }
1427
1428 if (ip_v == IPV6_VERSION) {
1429 /*
1430 * -fbounds-safety: ia6 (in6_ifaddr) overlays ifa (ifaddr)
1431 */
1432 struct in6_ifaddr *ia6 = __container_of(ifa, struct in6_ifaddr,
1433 ia_ifa);
1434
1435 /*
1436 * Fail if IPv6 address is not ready or if the address
1437 * is reserved * for CLAT46.
1438 */
1439 if (__improbable(ia6->ia6_flags &
1440 (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
1441 address_found = FALSE;
1442 ifnet_lock_done(ifp);
1443 goto done;
1444 }
1445 } else {
1446 /*
1447 * If interface has CLAT46 enabled, fail IPv4 bind.
1448 * Since this implies network is NAT64/DNS64, Internet
1449 * effectively becomes reachable over IPv6. If on
1450 * system IPv4 to IPv6 translation is required, that
1451 * should be handled solely through bump in the API.
1452 * The in kernel translation is only done for apps
1453 * directly using low level networking APIs.
1454 */
1455 if (__improbable(IS_INTF_CLAT46(ifp))) {
1456 address_found = FALSE;
1457 ifnet_lock_done(ifp);
1458 goto done;
1459 }
1460 }
1461
1462 *gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1463 ifnet_lock_done(ifp);
1464 done:
1465 if (ifa != NULL) {
1466 ifa_remref(ifa);
1467 }
1468
1469 return address_found;
1470 }
1471
1472 boolean_t
flow_route_laddr_validate(union sockaddr_in_4_6 * saddr,struct ifnet * ifp,uint32_t * gencnt)1473 flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1474 uint32_t *gencnt)
1475 {
1476 VERIFY(saddr->sa.sa_family == AF_INET ||
1477 saddr->sa.sa_family == AF_INET6);
1478
1479 struct flow_ip_addr *ipa;
1480 uint8_t ipv;
1481 if (saddr->sa.sa_family == AF_INET) {
1482 ipv = IPVERSION;
1483 ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1484 } else {
1485 ipv = IPV6_VERSION;
1486 ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1487 }
1488
1489 return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1490 }
1491
1492 boolean_t
flow_route_key_validate(struct flow_key * fk,struct ifnet * ifp,uint32_t * gencnt)1493 flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1494 uint32_t *gencnt)
1495 {
1496 return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
1497 gencnt);
1498 }
1499