xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/os_skywalk.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
33 #include <netinet/in.h>
34 #include <netinet/in_var.h>
35 #include <netinet6/ip6_var.h>
36 #include <netkey/key.h>
37 #include <netinet/udp.h>
38 
39 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
40 
41 #if CONFIG_MACF
42 #include <security/mac_framework.h>
43 #endif /* CONFIG_MACF */
44 
45 #include <net/net_api_stats.h>
46 
47 #define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
48 static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);
49 
50 static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
51 static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);
52 
53 static int fm_cmp(const struct flow_mgr *,
54     const struct flow_mgr *);
55 
56 RB_HEAD(flow_mgr_tree, flow_mgr);
57 RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
58 RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
59 
60 /* protected by the global lock flow_mgr_lock */
61 static struct flow_mgr_tree flow_mgr_head;
62 
63 static int __flow_mgr_inited = 0;
64 
65 void
flow_mgr_init(void)66 flow_mgr_init(void)
67 {
68 	ASSERT(!__flow_mgr_inited);
69 
70 	RB_INIT(&flow_mgr_head);
71 	__flow_mgr_inited = 1;
72 }
73 
74 void
flow_mgr_fini(void)75 flow_mgr_fini(void)
76 {
77 	if (__flow_mgr_inited) {
78 		VERIFY(RB_EMPTY(&flow_mgr_head));
79 
80 		__flow_mgr_inited = 0;
81 	}
82 }
83 
84 static int
__fe_cuckoo_cmp(struct cuckoo_node * node,void * key0)85 __fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
86 {
87 	struct flow_entry *__single fe = __container_of(node, struct flow_entry,
88 	    fe_cnode);
89 	struct flow_key *__single key = key0;
90 	const struct flow_key *mask;
91 
92 	/*
93 	 * This can probably be made more efficient by having "mask" be
94 	 * set by the original caller at the time the key is initialized,
95 	 * though that needs to be done carefully to ensure there is no
96 	 * mismatch between fk_mask value and "mask" itself.
97 	 */
98 	switch (key->fk_mask) {
99 	case FKMASK_5TUPLE:
100 		mask = &fk_mask_5tuple;
101 		break;
102 	case FKMASK_4TUPLE:
103 		mask = &fk_mask_4tuple;
104 		break;
105 	case FKMASK_3TUPLE:
106 		mask = &fk_mask_3tuple;
107 		break;
108 	case FKMASK_2TUPLE:
109 		mask = &fk_mask_2tuple;
110 		break;
111 	case FKMASK_IPFLOW3:
112 		mask = &fk_mask_ipflow3;
113 		break;
114 	case FKMASK_IPFLOW2:
115 		mask = &fk_mask_ipflow2;
116 		break;
117 	case FKMASK_IPFLOW1:
118 		mask = &fk_mask_ipflow1;
119 		break;
120 	default:
121 		return flow_key_cmp(&fe->fe_key, key);
122 	}
123 
124 	return flow_key_cmp_mask(&fe->fe_key, key, mask);
125 }
126 
127 static void
__fe_cuckoo_retain(struct cuckoo_node * node)128 __fe_cuckoo_retain(struct cuckoo_node *node)
129 {
130 	struct flow_entry *__single fe = __container_of(node, struct flow_entry,
131 	    fe_cnode);
132 	return flow_entry_retain(fe);
133 }
134 
135 static void
__fe_cuckoo_release(struct cuckoo_node * node)136 __fe_cuckoo_release(struct cuckoo_node *node)
137 {
138 #pragma unused(node)
139 	struct flow_entry *__single fe =
140 	    __container_of(node, struct flow_entry, fe_cnode);
141 	flow_entry_release(&fe);
142 }
143 
/*
 * Allocate and initialize a flow manager.
 *
 * fe_cnt:   capacity hint for the cuckoo flow-entry table.
 * fob_cnt:  number of flow_owner buckets.
 * frb_cnt:  number of flow_route buckets (must be a power of two).
 * frib_cnt: number of flow_route_id buckets.
 *
 * On success the manager is given a fresh UUID, inserted into the
 * global flow_mgr tree and returned; on failure NULL is returned.
 */
struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure {fb,frb}_cnt is a power of two */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);
	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		/*
		 * NOTE(review): on this and the failure paths below, fm_uuid
		 * is still null and fm was never inserted into flow_mgr_head,
		 * yet flow_mgr_destroy() asserts a non-null UUID and calls
		 * RB_REMOVE unconditionally -- confirm partial-teardown
		 * safety.
		 */
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	fm->fm_owner_bucket_tot_sz = fob_tot_sz;
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_owner_buckets_cnt = fob_cnt;
	fm->fm_owner_bucket_sz = fob_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	fm->fm_route_bucket_tot_sz = frb_tot_sz;
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_route_buckets_cnt = frb_cnt;
	fm->fm_route_bucket_sz = frb_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	fm->fm_route_id_bucket_tot_sz = frib_tot_sz;
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_route_id_buckets_cnt = frib_cnt;
	fm->fm_route_id_bucket_sz = frib_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	/* identity used as the RB-tree key (see fm_cmp) */
	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	/* mask lookup order: most-specific (5-tuple) first */
	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}
258 
/*
 * Tear down a flow manager: free the cuckoo flow table and all three
 * bucket arrays, remove the manager from the global tree, then free it.
 * The global flow_mgr_lock is held exclusively for the whole teardown.
 * Also invoked on partially-constructed managers from the failure paths
 * of flow_mgr_create() -- NOTE(review): on those paths fm_uuid is null
 * and fm was never RB_INSERTed, which conflicts with the ASSERT and
 * RB_REMOVE below; confirm this is benign.
 */
void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	/* destroy and free the flow_owner buckets */
	if (fm->fm_owner_buckets != NULL) {
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		fm->fm_owner_bucket_tot_sz = 0;
		fm->fm_owner_buckets_cnt = 0;
		fm->fm_owner_bucket_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	/* destroy and free the flow_route buckets */
	if (fm->fm_route_buckets != NULL) {
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		fm->fm_route_bucket_tot_sz = 0;
		fm->fm_route_buckets_cnt = 0;
		fm->fm_route_bucket_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	/* destroy and free the flow_route_id buckets */
	if (fm->fm_route_id_buckets != NULL) {
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		fm->fm_route_id_bucket_tot_sz = 0;
		fm->fm_route_id_buckets_cnt = 0;
		fm->fm_route_id_bucket_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	/* drop the identity and unlink from the global tree */
	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}
331 
/*
 * Quiesce a flow manager before destruction: mark every flow_owner
 * bucket dead and purge all flow entries, then purge all flow routes.
 * For each phase, every bucket lock is taken before any purging starts
 * and released only after all purging is done, so no new state can be
 * created concurrently.  Lock order within the route phase is
 * frb (route buckets) before frib (route-id buckets).
 */
void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		/* mark dead while holding the lock; unlocked below */
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob %p [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb %p [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	/* unlock in reverse of acquisition order */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}
383 
384 /*
385  * Must be matched with a call to flow_mgr_unlock().  Upon success will
386  * return the flow manager address of the specified UUID, and will acquire
387  * the global flow_mgr_lock as reader.  The caller is then expected to release
388  * the lock.
389  */
390 struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)391 flow_mgr_find_lock(uuid_t uuid)
392 {
393 	struct flow_mgr *fm, find;
394 
395 	uuid_copy(find.fm_uuid, uuid);
396 
397 	lck_rw_lock_shared(&flow_mgr_lock);
398 
399 	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
400 	if (fm == NULL) {
401 		lck_rw_done(&flow_mgr_lock);
402 		return NULL;
403 	}
404 
405 	/* caller is expected to call flow_mgr_unlock() when done */
406 	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
407 	return fm;
408 }
409 
/*
 * Release the global flow_mgr_lock taken by a successful call to
 * flow_mgr_find_lock().  Must be matched 1:1 with that call.
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
418 
/*
 * RB-tree comparator for flow managers: total order on the manager
 * UUID (memcmp-style result from uuid_compare).
 */
static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}
424 
425 static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 * addr)426 flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
427 {
428 	struct in6_addr *in6;
429 	in6 = &addr->sin6_addr;
430 	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
431 		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
432 		in6->s6_addr16[1] = 0;
433 	}
434 }
435 
#if CONFIG_MACF
/*
 * MAC-framework admission check for a flow request.  TCP maps to
 * SOCK_STREAM, UDP to SOCK_DGRAM; other IP protocols are exempt.
 * Returns 0 if allowed, otherwise the errno reported by the MAC
 * policy.  (Return type changed from bool to int: the mac_* checks
 * return errno values, and a bool return truncated them to 0/1 for
 * the caller's error reporting.)
 */
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;

	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* Custom IP protocol: exempt from the MAC check */
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		           SA(&req->nfr_saddr.sa), socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		           SA(&req->nfr_daddr.sa), socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */
465 
466 static bool
flow_req_needs_netns_reservation(struct nx_flow_req * req)467 flow_req_needs_netns_reservation(struct nx_flow_req *req)
468 {
469 	uint8_t proto = req->nfr_ip_protocol;
470 	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
471 }
472 
473 static bool
flow_req_needs_protons_reservation(struct nx_flow_req * req)474 flow_req_needs_protons_reservation(struct nx_flow_req *req)
475 {
476 	uint8_t proto = req->nfr_ip_protocol;
477 	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
478 	       proto != IPPROTO_ESP && proto != IPPROTO_AH;
479 }
480 
481 static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req * req)482 flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
483 {
484 	uint8_t proto = req->nfr_ip_protocol;
485 	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
486 }
487 
488 static void
flow_set_port_info(struct ns_flow_info * nfi,struct nx_flow_req * req)489 flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
490 {
491 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
492 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
493 
494 	bzero(nfi, sizeof(struct ns_flow_info));
495 
496 	nfi->nfi_ifp = req->nfr_ifp;
497 
498 	nfi->nfi_laddr = *saddr;
499 	nfi->nfi_faddr = *daddr;
500 
501 	nfi->nfi_protocol = req->nfr_ip_protocol;
502 
503 	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
504 	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));
505 
506 	nfi->nfi_owner_pid = req->nfr_pid;
507 	if (req->nfr_epid != -1) {
508 		nfi->nfi_effective_pid = req->nfr_epid;
509 		proc_name(req->nfr_epid, nfi->nfi_effective_name,
510 		    sizeof(nfi->nfi_effective_name));
511 	} else {
512 		nfi->nfi_effective_pid = -1;
513 	}
514 
515 	proc_name(req->nfr_pid, nfi->nfi_owner_name,
516 	    sizeof(nfi->nfi_owner_name));
517 }
518 
519 static int
flow_req_prepare_namespace(struct nx_flow_req * req)520 flow_req_prepare_namespace(struct nx_flow_req *req)
521 {
522 	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
523 	int err = 0;
524 
525 	if (flow_req_needs_netns_reservation(req)) {
526 		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
527 			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
528 			struct ns_flow_info nfi;
529 			netns_token __single ns_token;
530 			flow_set_port_info(&nfi, req);
531 			err = flow_namespace_create(saddr,
532 			    req->nfr_ip_protocol, &ns_token,
533 			    req->nfr_flags, &nfi);
534 			if (err != 0) {
535 				SK_ERR("netns for %s.%u failed",
536 				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
537 				    sk_sa_get_port(SA(saddr)));
538 				goto fail;
539 			}
540 			req->nfr_port_reservation = ns_token;
541 			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
542 		} else {
543 			/* Validate PID associated with provided reservation */
544 			struct ns_flow_info nfi = {};
545 			err = netns_get_flow_info(&req->nfr_port_reservation,
546 			    &nfi);
547 			/* flow info could be NULL for socket flow */
548 			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
549 			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
550 			    req->nfr_epid))) {
551 				SK_ERR("netns flow info mismatch, "
552 				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
553 				    req->nfr_pid, req->nfr_epid,
554 				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
555 				err = EPERM;
556 				goto fail;
557 			}
558 			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
559 		}
560 	}
561 
562 	if (flow_req_needs_ipsec_reservation(req)) {
563 		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
564 		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
565 		/*
566 		 * XXX -fbounds-safety: Currently, ke_reserve_custom_ipsec does
567 		 * not return any size information for the first argument
568 		 * (ipsec_token). Even though it takes a void **, it looks like
569 		 * only struct secashead * is used.
570 		 */
571 		void *__single ipsec_token = NULL;
572 		ASSERT(req->nfr_ipsec_reservation == NULL);
573 		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
574 		    daddr, req->nfr_ip_protocol);
575 		if (err != 0) {
576 			SK_ERR("custom ipsec %u reserve %s failed",
577 			    req->nfr_ip_protocol,
578 			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
579 			goto fail;
580 		}
581 		req->nfr_ipsec_reservation = ipsec_token;
582 	}
583 
584 	if (flow_req_needs_protons_reservation(req)) {
585 		struct protons_token *__single ns_token = NULL;
586 		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
587 			err = protons_reserve(&ns_token, req->nfr_pid,
588 			    req->nfr_epid, req->nfr_ip_protocol);
589 			if (err != 0) {
590 				SK_ERR("protocol %u namespace failed",
591 				    req->nfr_ip_protocol);
592 				goto fail;
593 			}
594 			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
595 			req->nfr_proto_reservation = ns_token;
596 		} else {
597 			/* Validate PID associated with provided reservation */
598 			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
599 			    req->nfr_pid, req->nfr_epid)) {
600 				SK_ERR("protons token pid mismatch");
601 				err = EPERM;
602 				goto fail;
603 			}
604 			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
605 		}
606 	}
607 
608 	return 0;
609 
610 fail:
611 	VERIFY(err != 0);
612 	SK_ERR("perparation failed (err %d)", err);
613 	return err;
614 }
615 
616 static int
flow_req_prepare(struct nx_flow_req * req,struct kern_nexus * nx,struct flow_mgr * fm,struct ifnet * ifp,flow_route_ctor_fn_t fr_ctor,flow_route_resolve_fn_t fr_resolve,void * fr_arg)617 flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
618     struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
619     flow_route_resolve_fn_t fr_resolve, void *fr_arg)
620 {
621 	int err = 0;
622 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
623 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
624 	uint8_t protocol = req->nfr_ip_protocol;
625 
626 	sa_family_t saf, daf, xaf, af;
627 
628 	saf = SA(saddr)->sa_family;
629 	daf = SA(daddr)->sa_family;
630 	xaf = saf ^ daf;
631 	if (xaf != 0 && xaf != saf && xaf != daf) {
632 		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
633 		return EINVAL;
634 	}
635 	af = (xaf == 0) ? saf : xaf;
636 
637 	bool has_saddr = false, has_daddr = false;
638 	bool has_sport = false, has_dport = false;
639 	uint16_t sport, dport;
640 	uint8_t sa_len;
641 	switch (af) {
642 	case AF_INET:
643 		sa_len = sizeof(struct sockaddr_in);
644 		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
645 		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
646 		sport = SIN(saddr)->sin_port;
647 		dport = SIN(daddr)->sin_port;
648 		has_sport = (sport != 0);
649 		has_dport = (dport != 0);
650 
651 		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
652 		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
653 			SK_ERR("sin_len invalid");
654 			err = EINVAL;
655 			goto fail;
656 		}
657 		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
658 		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
659 			SK_ERR("multicast flow not yet supported");
660 			err = EADDRNOTAVAIL;
661 			goto fail;
662 		}
663 		if (__probable(protocol == IPPROTO_TCP)) {
664 			INC_ATOMIC_INT64_LIM(
665 				net_api_stats.nas_nx_flow_inet6_stream_total);
666 		} else {
667 			INC_ATOMIC_INT64_LIM(
668 				net_api_stats.nas_nx_flow_inet6_dgram_total);
669 		}
670 		break;
671 
672 	case AF_INET6:
673 		sa_len = sizeof(struct sockaddr_in6);
674 		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
675 		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
676 		sport = SIN6(saddr)->sin6_port;
677 		dport = SIN6(daddr)->sin6_port;
678 		has_sport = (sport != 0);
679 		has_dport = (dport != 0);
680 		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
681 		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
682 			SK_ERR("sin_len invalid");
683 			err = EINVAL;
684 			goto fail;
685 		}
686 		/* clear embedded scope if link-local src */
687 		if (has_saddr) {
688 			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
689 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
690 				SIN6(saddr)->sin6_scope_id = ifp->if_index;
691 			}
692 		}
693 		if (has_daddr) {
694 			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
695 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
696 				SIN6(daddr)->sin6_scope_id = ifp->if_index;
697 			}
698 		}
699 		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
700 		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
701 			SK_ERR("multicast flow not yet supported");
702 			err = EADDRNOTAVAIL;
703 			goto fail;
704 		}
705 		if (__probable(protocol == IPPROTO_TCP)) {
706 			INC_ATOMIC_INT64_LIM(
707 				net_api_stats.nas_nx_flow_inet_stream_total);
708 		} else {
709 			INC_ATOMIC_INT64_LIM(
710 				net_api_stats.nas_nx_flow_inet_dgram_total);
711 		}
712 		break;
713 
714 	default:
715 		SK_ERR("unknown address families saf %d daf %d", saf, daf);
716 		err = EINVAL;
717 		goto fail;
718 	}
719 
720 	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
721 	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;
722 
723 	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
724 	    &req->nfr_saddr_gencnt))) {
725 		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
726 		SK_ERR("src address %s is not valid",
727 		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
728 		err = EADDRNOTAVAIL;
729 		goto fail;
730 	}
731 
732 	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
733 	if (!is_tcp_udp) {
734 		if (has_sport || has_dport) {
735 			SK_ERR("non-zero port for IP flow");
736 			return EINVAL;
737 		}
738 	} else {
739 		/* dst:dport as connected, 0:0 as listener, but not partial */
740 		if (has_daddr != has_dport) {
741 			err = EINVAL;
742 			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
743 			goto fail;
744 		}
745 	}
746 
747 	if (!has_daddr && !has_dport) {
748 		req->nfr_flags |= NXFLOWREQF_LISTENER;
749 	}
750 
751 	if (req->nfr_transport_protocol == 0) {
752 		req->nfr_transport_protocol = req->nfr_ip_protocol;
753 	}
754 
755 	bool is_child_flow = !uuid_is_null(req->nfr_parent_flow_uuid);
756 	if ((is_child_flow && req->nfr_flow_demux_count == 0) ||
757 	    (!is_child_flow && req->nfr_flow_demux_count > 0)) {
758 		err = EINVAL;
759 		SK_ERR("invalid flow demux count");
760 		goto fail;
761 	}
762 
763 	if (req->nfr_flow_demux_count > 0) {
764 		if (req->nfr_ip_protocol != IPPROTO_UDP) {
765 			err = EINVAL;
766 			SK_ERR("invalid ip protocol(%u) for flow demux",
767 			    req->nfr_ip_protocol);
768 			goto fail;
769 		}
770 
771 		for (int i = 0; i < req->nfr_flow_demux_count; i++) {
772 			if (req->nfr_flow_demux_patterns[i].fdp_len > FLOW_DEMUX_MAX_LEN ||
773 			    req->nfr_flow_demux_patterns[i].fdp_len == 0) {
774 				err = EINVAL;
775 				SK_ERR("invalid flow demux pattern len %u",
776 				    req->nfr_flow_demux_patterns[i].fdp_len);
777 				goto fail;
778 			}
779 			if (req->nfr_flow_demux_patterns[i].fdp_offset +
780 			    req->nfr_flow_demux_patterns[i].fdp_len > MAX_PKT_DEMUX_LIMIT) {
781 				err = EINVAL;
782 				SK_ERR("invalid demux offset plus length(%u > %d)",
783 				    req->nfr_flow_demux_patterns[i].fdp_offset +
784 				    req->nfr_flow_demux_patterns[i].fdp_len, MAX_PKT_DEMUX_LIMIT);
785 				goto fail;
786 			}
787 		}
788 	}
789 
790 	req->nfr_ifp = ifp;
791 
792 #if CONFIG_MACF
793 	err = flow_req_check_mac_allowed(req);
794 	if (err != 0) {
795 		SK_ERR("flow req failed MAC check");
796 		goto fail;
797 	}
798 #endif /* CONFIG_MACF */
799 
800 	/* setup flow route and prepare saddr if needed */
801 	if (__probable(has_daddr || has_dport)) {
802 		struct flow_route *__single fr = NULL;
803 		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
804 		    fr_resolve, fr_arg, &fr);
805 		if (__improbable(err != 0)) {
806 			SK_ERR("flow route lookup failed");
807 			ASSERT(fr == NULL);
808 			goto fail;
809 		}
810 		ASSERT(fr != NULL);
811 		/* Pick up the default source address from flow route. */
812 		if (!has_saddr) {
813 			*saddr = fr->fr_laddr;
814 			SIN(saddr)->sin_port = sport;
815 		}
816 		req->nfr_route = fr;
817 		fr = NULL;
818 	}
819 
820 	/* child flow do not hold namespace references */
821 	if (__probable(uuid_is_null(req->nfr_parent_flow_uuid))) {
822 		err = flow_req_prepare_namespace(req);
823 		if (err != 0) {
824 			goto fail;
825 		}
826 	}
827 
828 	return 0;
829 
830 fail:
831 	VERIFY(err != 0);
832 	if (req->nfr_route != NULL) {
833 		flow_route_release(req->nfr_route);
834 		req->nfr_route = NULL;
835 	}
836 	SK_ERR("preparation failed (err %d)", err);
837 	return err;
838 }
839 
840 static void
flow_req_cleanup(struct nx_flow_req * req)841 flow_req_cleanup(struct nx_flow_req *req)
842 {
843 	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
844 	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
845 		netns_release(&req->nfr_port_reservation);
846 	}
847 
848 	if (protons_token_is_valid(req->nfr_proto_reservation) &&
849 	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
850 		protons_release(&req->nfr_proto_reservation);
851 	}
852 	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
853 		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
854 	}
855 }
856 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	/*
	 * Debug-log a flow request (addresses, ports, protocol, UUID,
	 * nexus port, flags).  Only active when SK_VERB_FLOW verbosity
	 * is enabled.
	 */
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) sk_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) sk_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) sk_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		(void) sk_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		/* fixed copy-paste: was sizeof(src_s) for dst_s */
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s.%u,dst=%s.%u,proto=%d "
	    "nx_port=%u,flags 0x%x", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, sport, dst_s, dport, protocol,
	    req->nfr_nx_port, req->nfr_flags);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */
915 
/*
 * Add a flow entry for "req" on behalf of flow owner "fo".  Unless the
 * request is marked NXFLOWREQF_ASIS, it is first validated and prepared
 * (flow route lookup, namespace reservations).  Returns 0 on success or
 * an errno; on failure the reservations taken during preparation are
 * released.  Called with the owner's bucket lock held.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *__single fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, flow entry adds a
	 * retain count on the flow route (we'll always need to release the
	 * refcnt from flow_route_find), and the local address:port of the
	 * flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS ||
	    (fe->fe_flags & FLOWENTF_CHILD));
	/*
	 * NOTE(review): the "^" below is a bitwise XOR between a flag bit
	 * and a 0/1 boolean, so it only ever fails when the FLOWADV flag
	 * is clear and an index is present -- confirm whether a logical
	 * exclusive-or was intended.
	 */
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	/* report the allocated flow-advisory index and flow id back */
	req->nfr_flowadv_idx = fe->fe_adv_idx;
	req->nfr_flowid = fe->fe_flowid;

	flow_req_dump("added ", req);

	/* drop our local reference; the flowswitch table holds its own */
	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	/* set the v6 scope id to the flowswitch interface for scoped addrs */
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}
989 
990 struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr * fm,pid_t pid)991 flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
992 {
993 	return flow_mgr_get_fob_at_idx(fm,
994 	           (pid % fm->fm_owner_buckets_cnt));
995 }
996 
997 struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr * fm,uuid_t uuid)998 flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
999 {
1000 	uint32_t i;
1001 	struct flow_owner_bucket *fob;
1002 	struct flow_owner *fo;
1003 	struct flow_entry *fe;
1004 
1005 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
1006 		fob = flow_mgr_get_fob_at_idx(fm, i);
1007 		FOB_LOCK_SPIN(fob);
1008 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1009 			fe = flow_entry_find_by_uuid(fo, uuid);
1010 			if (fe != NULL) {
1011 				FOB_LOCK_CONVERT(fob);
1012 				FOB_UNLOCK(fob);
1013 				return fe;
1014 			}
1015 		}
1016 		FOB_UNLOCK(fob);
1017 	}
1018 	return NULL;
1019 }
1020 
/*
 * Hash the destination address `daddr' (IPv4 or IPv6 only) into a
 * flow route bucket.  Uses a Jenkins-style 3-word mix seeded with
 * flow_seed; the mix arithmetic is order-sensitive — do not reorder.
 */
struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	/* 0x9e3779b9 is the 32-bit golden-ratio constant; c carries the seed */
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		/* fold the 4 address bytes into a and b */
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		/* fold all four 32-bit words of the v6 address */
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		/* caller guarantees an AF_INET/AF_INET6 sockaddr */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	/* bucket count is a power of two, so mask instead of modulo */
	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}
1066 
1067 struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr * fm,uuid_t fr_uuid)1068 flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
1069 {
1070 	union {
1071 		uuid_t   uuid __sk_aligned(8);
1072 		uint64_t u64[2];
1073 	} u;
1074 	uint64_t key;
1075 
1076 	static_assert(sizeof(u.uuid) == sizeof(u.u64));
1077 	uuid_copy(u.uuid, fr_uuid);
1078 
1079 	/* XOR fold UUID down to 4-bytes */
1080 	key = (u.u64[0] ^ u.u64[1]);
1081 	key = ((key >> 32) ^ (key & 0xffffffff));
1082 
1083 	/* add some offset to get more entropy */
1084 	return flow_mgr_get_frib_at_idx(fm,
1085 	           ((uint32_t)key % fm->fm_route_id_buckets_cnt));
1086 }
1087 
1088 static int
flow_hash_mask_add(struct flow_mgr * fm,uint32_t mask,int32_t v)1089 flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
1090 {
1091 	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
1092 		if (fm->fm_flow_hash_masks[i] == mask) {
1093 			os_atomic_add(&fm->fm_flow_hash_count[i], v, relaxed);
1094 			return 0;
1095 		}
1096 	}
1097 	SK_ERR("unkown hash mask 0x%x", mask);
1098 	return ENOTSUP;
1099 }
1100 
1101 int
flow_mgr_flow_hash_mask_add(struct flow_mgr * fm,uint32_t mask)1102 flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
1103 {
1104 	return flow_hash_mask_add(fm, mask, 1);
1105 }
1106 
1107 int
flow_mgr_flow_hash_mask_del(struct flow_mgr * fm,uint32_t mask)1108 flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
1109 {
1110 	return flow_hash_mask_add(fm, mask, -1);
1111 }
1112 
#if SK_LOG
/* Log the lookup key before a flow-table search (SK_LOG builds only). */
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk2str(key, dbgbuf, sizeof(dbgbuf)));
}

/* Log the outcome of a flow-table search (SK_LOG builds only). */
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe \"%s\"",
		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
/* No-op stubs when skywalk logging is compiled out. */
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */
1139 
1140 struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr * fm,struct flow_key * key)1141 flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
1142 {
1143 	struct cuckoo_node *node = NULL;
1144 	struct flow_entry *__single fe = NULL;
1145 	uint32_t hash = 0;
1146 	uint16_t saved_mask = key->fk_mask;
1147 
1148 	__flow_mgr_find_fe_by_key_prelog(key);
1149 
1150 	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
1151 		size_t count = fm->fm_flow_hash_count[i];
1152 		uint16_t mask = fm->fm_flow_hash_masks[i];
1153 		if (count == 0 || mask == 0) {
1154 			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1155 			    "[%d] mask=%08x count=%zu skiped",
1156 			    i, mask, count);
1157 			continue;
1158 		}
1159 		key->fk_mask = mask;
1160 		hash = flow_key_hash(key);
1161 		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1162 		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1163 		    "[%d] mask=%08x hash %08x node %p", i, mask, hash,
1164 		    SK_KVA(node));
1165 		if (node != NULL) {
1166 			fe = __container_of(node, struct flow_entry, fe_cnode);
1167 			/* v4 only listener fe shouldn't get v6 connection */
1168 			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
1169 			    fe->fe_key.fk_ipver == IPVERSION &&
1170 			    key->fk_ipver == IPV6_VERSION)) {
1171 				flow_entry_release(&fe);
1172 				ASSERT(fe == NULL);
1173 				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1174 				    "\tskip v4 only fe");
1175 				continue;
1176 			}
1177 			break;
1178 		}
1179 	}
1180 
1181 	key->fk_mask = saved_mask;
1182 
1183 	__flow_mgr_find_fe_by_key_epilog(fe);
1184 
1185 	return fe;
1186 }
1187 
1188 struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr * fm,struct flow_key * key)1189 flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
1190 {
1191 	struct cuckoo_node *node = NULL;
1192 	struct flow_entry *__single fe = NULL;
1193 	uint32_t hash = 0;
1194 
1195 	hash = flow_key_hash(key);
1196 	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1197 	if (node != NULL) {
1198 		fe = __container_of(node, struct flow_entry, fe_cnode);
1199 		return fe;
1200 	}
1201 
1202 	/* listener flow confliction will be checked at netns reservation */
1203 	return fe;
1204 }
1205 
1206 void
1207 flow_mgr_foreach_flow(struct flow_mgr *fm,
1208     void (^flow_handler)(struct flow_entry *fe))
1209 {
1210 	cuckoo_hashtable_foreach(fm->fm_flow_table,
1211 	    ^(struct cuckoo_node *node, uint32_t hv) {
1212 		#pragma unused(hv)
1213 		struct flow_entry *__single fe;
1214 		fe = __container_of(node, struct flow_entry, fe_cnode);
1215 		flow_handler(fe);
1216 
1217 		if (fe->fe_flags & FLOWENTF_PARENT) {
1218 		        struct flow_entry *child_fe;
1219 		        lck_rw_lock_shared(&fe->fe_child_list_lock);
1220 		        TAILQ_FOREACH(child_fe, &fe->fe_child_list, fe_child_link) {
1221 		                flow_handler(child_fe);
1222 			}
1223 		        lck_rw_unlock_shared(&fe->fe_child_list_lock);
1224 		}
1225 	}
1226 	    );
1227 }
1228 
/*
 * Check whether UDP packet `pkt' matches any demux pattern of child
 * flow `fe'.  Returns true on the first pattern match; false for
 * torn-down/nonviable flows, non-UDP packets, empty payloads, or no
 * match.
 */
bool
rx_flow_demux_match(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct udphdr *uh;
	uint8_t *pkt_buf;
	uint32_t bdlen, bdlim, bdoff, pkt_payload_len;
	uint8_t *demux_data;

	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(fe->fe_demux_pattern_count > 0);

	/* never match a flow that is being torn down or is nonviable */
	if (fe->fe_flags & (FLOWENTF_TORN_DOWN | FLOWENTF_NONVIABLE)) {
		return false;
	}

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return false;
	}

	/* forge a bounded pointer spanning the UDP header plus payload */
	uh = __unsafe_forge_bidi_indexable(struct udphdr *,
	    (struct udphdr *)pkt->pkt_flow_udp_hdr, sizeof(*uh) + pkt->pkt_flow_ulen);
	if (__improbable(uh == NULL || pkt->pkt_flow_ulen == 0)) {
		return false;
	}

	/* byte offset of the UDP payload from the start of the packet */
	int udp_payload_offset = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen + sizeof(*uh);

	/* how much of the packet is available in the first buflet */
	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = MIN(pkt_payload_len, pkt->pkt_length);
	/*
	 * NOTE(review): pkt_payload_len is unsigned; if the buflet holds
	 * fewer bytes than udp_payload_offset this subtraction wraps to a
	 * huge value.  Presumably the L2/IP/UDP headers always fit in the
	 * first buflet — confirm.
	 */
	pkt_payload_len -= udp_payload_offset;

	for (int index = 0; index < fe->fe_demux_pattern_count; index++) {
		struct flow_demux_pattern *demux_pattern = &fe->fe_demux_patterns[index].fdp_demux_pattern;
		ASSERT(demux_pattern->fdp_len > 0);

		/* the pattern must lie entirely within the UDP payload */
		if (pkt->pkt_flow_ulen >= demux_pattern->fdp_offset + demux_pattern->fdp_len) {
			if (__probable(pkt_payload_len >= demux_pattern->fdp_offset + demux_pattern->fdp_len)) {
				/* pattern bytes are contiguous after the UDP header */
				demux_data = (uint8_t *)(uh + 1) + demux_pattern->fdp_offset;
			} else {
				if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
					/* payload is short in the buflet; copy from the mbuf */
					m_copydata(pkt->pkt_mbuf, udp_payload_offset + demux_pattern->fdp_offset,
					    demux_pattern->fdp_len, fe->fe_demux_pkt_data);
					demux_data = fe->fe_demux_pkt_data;
				} else {
					/* not enough bytes available anywhere; count and bail */
					FSW_STATS_INC(FSW_STATS_RX_DEMUX_SHORT_ERR);
					return false;
				}
			}

			int result = -1;
			/* prefer the pattern's specialized masked-compare when set */
			if (fe->fe_demux_patterns[index].fdp_memcmp_mask != NULL) {
				result = fe->fe_demux_patterns[index].fdp_memcmp_mask(demux_data,
				    demux_pattern->fdp_value, demux_pattern->fdp_mask);
			} else {
				result = sk_memcmp_mask(demux_data, demux_pattern->fdp_value,
				    demux_pattern->fdp_mask, demux_pattern->fdp_len);
			}

			if (result == 0) {
				return true;
			}
		}
	}

	return false;
}
1299 
1300 struct flow_entry *
rx_lookup_child_flow(struct nx_flowswitch * fsw,struct flow_entry * parent_fe,struct __kern_packet * pkt)1301 rx_lookup_child_flow(struct nx_flowswitch *fsw, struct flow_entry *parent_fe,
1302     struct __kern_packet *pkt)
1303 {
1304 	struct flow_entry *child_fe;
1305 
1306 	/*
1307 	 * Demux only supported for UDP packets with payload
1308 	 */
1309 	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
1310 		return NULL;
1311 	}
1312 
1313 	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
1314 
1315 	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
1316 		if (rx_flow_demux_match(fsw, child_fe, pkt)) {
1317 			flow_entry_retain(child_fe);
1318 			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
1319 			return child_fe;
1320 		}
1321 	}
1322 
1323 	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
1324 	return NULL;
1325 }
1326 
1327 struct flow_entry *
tx_lookup_child_flow(struct flow_entry * parent_fe,uuid_t flow_id)1328 tx_lookup_child_flow(struct flow_entry *parent_fe, uuid_t flow_id)
1329 {
1330 	struct flow_entry *child_fe;
1331 
1332 	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);
1333 
1334 	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
1335 	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
1336 		if (_UUID_MATCH(flow_id, child_fe->fe_uuid)) {
1337 			flow_entry_retain(child_fe);
1338 			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
1339 			return child_fe;
1340 		}
1341 	}
1342 
1343 	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
1344 	return NULL;
1345 }
1346