xref: /xnu-8020.121.3/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/os_skywalk.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
33 #include <netinet/in.h>
34 #include <netinet/in_var.h>
35 #include <netinet6/ip6_var.h>
36 #include <netkey/key.h>
37 
38 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
39 
40 #if CONFIG_MACF
41 #include <security/mac_framework.h>
42 #endif /* CONFIG_MACF */
43 
44 #include <net/net_api_stats.h>
45 
/* Memory tag under which all flow-manager allocations are accounted. */
#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);

/* Global rwlock guarding the flow-manager red-black tree below. */
static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

/* Tree comparator; orders flow managers by fm_uuid (see fm_cmp below). */
static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

/* Set once by flow_mgr_init(); cleared by flow_mgr_fini(). */
static int __flow_mgr_inited = 0;
63 
64 void
flow_mgr_init(void)65 flow_mgr_init(void)
66 {
67 	ASSERT(!__flow_mgr_inited);
68 
69 	RB_INIT(&flow_mgr_head);
70 	__flow_mgr_inited = 1;
71 }
72 
73 void
flow_mgr_fini(void)74 flow_mgr_fini(void)
75 {
76 	if (__flow_mgr_inited) {
77 		VERIFY(RB_EMPTY(&flow_mgr_head));
78 
79 		__flow_mgr_inited = 0;
80 	}
81 }
82 
83 static int
__fe_cuckoo_cmp(struct cuckoo_node * node,void * key0)84 __fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
85 {
86 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
87 	struct flow_key *key = key0;
88 	const struct flow_key *mask;
89 
90 	/*
91 	 * This can probably be made more efficient by having "mask" be
92 	 * set by the original caller at the time the key is initialized,
93 	 * though that needs to be done carefully to ensure there is no
94 	 * mismatch between fk_mask value and "mask" itself.
95 	 */
96 	switch (key->fk_mask) {
97 	case FKMASK_5TUPLE:
98 		mask = &fk_mask_5tuple;
99 		break;
100 	case FKMASK_4TUPLE:
101 		mask = &fk_mask_4tuple;
102 		break;
103 	case FKMASK_3TUPLE:
104 		mask = &fk_mask_3tuple;
105 		break;
106 	case FKMASK_2TUPLE:
107 		mask = &fk_mask_2tuple;
108 		break;
109 	case FKMASK_IPFLOW3:
110 		mask = &fk_mask_ipflow3;
111 		break;
112 	case FKMASK_IPFLOW2:
113 		mask = &fk_mask_ipflow2;
114 		break;
115 	case FKMASK_IPFLOW1:
116 		mask = &fk_mask_ipflow1;
117 		break;
118 	default:
119 		return flow_key_cmp(&fe->fe_key, key);
120 	}
121 
122 	return flow_key_cmp_mask(&fe->fe_key, key, mask);
123 }
124 
125 static void
__fe_cuckoo_retain(struct cuckoo_node * node)126 __fe_cuckoo_retain(struct cuckoo_node *node)
127 {
128 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
129 	return flow_entry_retain(fe);
130 }
131 
132 static void
__fe_cuckoo_release(struct cuckoo_node * node)133 __fe_cuckoo_release(struct cuckoo_node *node)
134 {
135 #pragma unused(node)
136 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
137 	flow_entry_release(&fe);
138 }
139 
/*
 * Allocate and initialize a flow manager instance and register it in the
 * global flow_mgr tree under a freshly generated UUID.
 *
 * fe_cnt:   capacity of the cuckoo flow table.
 * fob_cnt:  number of flow-owner buckets.
 * frb_cnt:  number of flow-route buckets; must be a power of two.
 * frib_cnt: number of flow-route-id buckets.
 *
 * Returns the new flow manager on success, or NULL if the flow table or
 * any bucket array could not be allocated (partially constructed state
 * is torn down via flow_mgr_destroy()).
 */
struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure {fb,frb}_cnt is a power of two */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	/* Z_NOFAIL: allocation cannot fail, no NULL check needed */
	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);

	/* cuckoo table holds all flow entries, keyed by flow_key */
	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides: these fields are immutable after construction */
	*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = fob_cnt;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = fob_sz;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = fob_tot_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = frb_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = frb_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = frb_tot_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = frib_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = frib_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = frib_tot_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	/* identity used as the key in the global flow_mgr tree */
	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	/* hash-mask lookup order: most-specific (5-tuple) first */
	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}
258 
/*
 * Tear down a flow manager: free the flow table and all three bucket
 * arrays (tolerating a partially constructed manager from a failed
 * flow_mgr_create()), unlink it from the global tree, and free it.
 *
 * NOTE(review): the const counters/sizes are zeroed here through
 * (uint32_t *) casts, while flow_mgr_create() writes them through
 * (size_t *) casts — if the underlying fields are size_t this only
 * clears the low 32 bits on LP64; confirm the field widths and make
 * both sides consistent.
 */
void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	if (fm->fm_owner_buckets != NULL) {
		/* destroy each owner bucket, then free the whole array */
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	if (fm->fm_route_buckets != NULL) {
		/* same pattern for the route buckets */
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_route_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	if (fm->fm_route_id_buckets != NULL) {
		/* same pattern for the route-id buckets */
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	/* unlink from the global tree while still holding the writer lock */
	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}
331 
/*
 * Quiesce a flow manager: purge every flow entry, then every flow route.
 * For each bucket class, ALL buckets are locked before any purge begins
 * so the purge observes a consistent global view; locks are dropped only
 * after the purge completes.
 */
void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	/* pass 1: lock every owner bucket and mark it dead */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	/* pass 2: purge with all owner buckets held */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	/* pass 3: release the owner bucket locks */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	/* write-lock every route bucket, then every route-id bucket */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	/* unlock in reverse of the acquisition order above */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}
383 
384 void
flow_mgr_setup_host_flow(struct flow_mgr * fm,struct nx_flowswitch * fsw)385 flow_mgr_setup_host_flow(struct flow_mgr *fm, struct nx_flowswitch *fsw)
386 {
387 	struct flow_entry *host_fe = fe_alloc(true);
388 	host_fe->fe_key.fk_mask = 0;
389 	host_fe->fe_nx_port = FSW_VP_HOST;
390 	*(struct nx_flowswitch **)(uintptr_t)&host_fe->fe_fsw = fsw;
391 	host_fe->fe_svc_class = KPKT_SC_BE;
392 	host_fe->fe_pid = proc_getpid(kernproc);
393 	host_fe->fe_rx_process = fsw_host_rx;
394 	(void) snprintf(host_fe->fe_proc_name, sizeof(host_fe->fe_proc_name),
395 	    "%s", proc_name_address(kernproc));
396 	flow_entry_retain(host_fe);
397 	fm->fm_host_fe = host_fe;
398 	KPKTQ_INIT(&host_fe->fe_rx_pktq);
399 	KPKTQ_INIT(&host_fe->fe_rx_pktq);
400 }
401 
/*
 * Release the reference on the host flow entry taken by
 * flow_mgr_setup_host_flow(); fm_host_fe is cleared by the callee
 * (flow_entry_release takes the pointer by reference).
 */
void
flow_mgr_teardown_host_flow(struct flow_mgr *fm)
{
	flow_entry_release(&fm->fm_host_fe);
}
407 
408 /*
409  * Must be matched with a call to flow_mgr_unlock().  Upon success will
410  * return the flow manager address of the specified UUID, and will acquire
411  * the global flow_mgr_lock as reader.  The caller is then expected to release
412  * the lock.
413  */
414 struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)415 flow_mgr_find_lock(uuid_t uuid)
416 {
417 	struct flow_mgr *fm, find;
418 
419 	uuid_copy(find.fm_uuid, uuid);
420 
421 	lck_rw_lock_shared(&flow_mgr_lock);
422 
423 	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
424 	if (fm == NULL) {
425 		lck_rw_done(&flow_mgr_lock);
426 		return NULL;
427 	}
428 
429 	/* caller is expected to call flow_mgr_unlock() when done */
430 	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
431 	return fm;
432 }
433 
/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 * Drops the shared hold on the global flow_mgr_lock.
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
442 
/*
 * Red-black tree comparator for flow managers: total order on fm_uuid.
 * Returns <0, 0, or >0 per uuid_compare().
 */
static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}
448 
449 static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 * addr)450 flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
451 {
452 	struct in6_addr *in6;
453 	in6 = &addr->sin6_addr;
454 	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
455 		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
456 		in6->s6_addr16[1] = 0;
457 	}
458 }
459 
#if CONFIG_MACF
/*
 * MAC-framework admission check for a flow request.  Returns 0 when the
 * flow is permitted, or the non-zero errno produced by the MAC policy.
 *
 * Fixes: the function was declared to return bool even though its caller
 * stores the result in an int error variable — the MAC policy's errno was
 * truncated to 0/1; it now returns int so the code propagates intact.
 * Also removed a dead "socktype = SOCK_DGRAM" store before the default
 * return and corrected a comment typo ("diagram" -> "datagram").
 */
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;

	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* Custom IP protocol, treated as IP datagram: no MAC hook */
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		           &req->nfr_saddr.sa, socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		           &req->nfr_daddr.sa, socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */
489 
490 static bool
flow_req_needs_netns_reservation(struct nx_flow_req * req)491 flow_req_needs_netns_reservation(struct nx_flow_req *req)
492 {
493 	uint8_t proto = req->nfr_ip_protocol;
494 	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
495 }
496 
497 static bool
flow_req_needs_protons_reservation(struct nx_flow_req * req)498 flow_req_needs_protons_reservation(struct nx_flow_req *req)
499 {
500 	uint8_t proto = req->nfr_ip_protocol;
501 	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
502 	       proto != IPPROTO_ESP && proto != IPPROTO_AH;
503 }
504 
505 static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req * req)506 flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
507 {
508 	uint8_t proto = req->nfr_ip_protocol;
509 	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
510 }
511 
512 static void
flow_set_port_info(struct ns_flow_info * nfi,struct nx_flow_req * req)513 flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
514 {
515 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
516 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
517 
518 	bzero(nfi, sizeof(struct ns_flow_info));
519 
520 	nfi->nfi_ifp = req->nfr_ifp;
521 
522 	nfi->nfi_laddr = *saddr;
523 	nfi->nfi_faddr = *daddr;
524 
525 	nfi->nfi_protocol = req->nfr_ip_protocol;
526 
527 	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
528 	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));
529 
530 	nfi->nfi_owner_pid = req->nfr_pid;
531 	if (req->nfr_epid != -1) {
532 		nfi->nfi_effective_pid = req->nfr_epid;
533 		proc_name(req->nfr_epid, nfi->nfi_effective_name,
534 		    sizeof(nfi->nfi_effective_name));
535 	} else {
536 		nfi->nfi_effective_pid = -1;
537 	}
538 
539 	proc_name(req->nfr_pid, nfi->nfi_owner_name,
540 	    sizeof(nfi->nfi_owner_name));
541 }
542 
543 static int
flow_req_prepare_namespace(struct nx_flow_req * req)544 flow_req_prepare_namespace(struct nx_flow_req *req)
545 {
546 	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
547 	int err = 0;
548 
549 	if (flow_req_needs_netns_reservation(req)) {
550 		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
551 			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
552 			struct ns_flow_info nfi;
553 			netns_token ns_token;
554 			flow_set_port_info(&nfi, req);
555 			err = flow_namespace_create(saddr,
556 			    req->nfr_ip_protocol, &ns_token,
557 			    req->nfr_flags, &nfi);
558 			if (err != 0) {
559 				SK_ERR("netns for %s.%u failed",
560 				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
561 				    sk_sa_get_port(SA(saddr)));
562 				goto fail;
563 			}
564 			req->nfr_port_reservation = ns_token;
565 			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
566 		} else {
567 			/* Validate PID associated with provided reservation */
568 			struct ns_flow_info nfi = {};
569 			err = netns_get_flow_info(&req->nfr_port_reservation,
570 			    &nfi);
571 			/* flow info could be NULL for socket flow */
572 			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
573 			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
574 			    req->nfr_epid))) {
575 				SK_ERR("netns flow info mismatch, "
576 				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
577 				    req->nfr_pid, req->nfr_epid,
578 				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
579 				err = EPERM;
580 				goto fail;
581 			}
582 			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
583 		}
584 	}
585 
586 	if (flow_req_needs_ipsec_reservation(req)) {
587 		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
588 		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
589 		void *ipsec_token = NULL;
590 		ASSERT(req->nfr_ipsec_reservation == NULL);
591 		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
592 		    daddr, req->nfr_ip_protocol);
593 		if (err != 0) {
594 			SK_ERR("custom ipsec %u reserve %s failed",
595 			    req->nfr_ip_protocol,
596 			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
597 			goto fail;
598 		}
599 		req->nfr_ipsec_reservation = ipsec_token;
600 	}
601 
602 	if (flow_req_needs_protons_reservation(req)) {
603 		struct protons_token *ns_token = NULL;
604 		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
605 			err = protons_reserve(&ns_token, req->nfr_pid,
606 			    req->nfr_epid, req->nfr_ip_protocol);
607 			if (err != 0) {
608 				SK_ERR("protocol %u namespace failed",
609 				    req->nfr_ip_protocol);
610 				goto fail;
611 			}
612 			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
613 			req->nfr_proto_reservation = ns_token;
614 		} else {
615 			/* Validate PID associated with provided reservation */
616 			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
617 			    req->nfr_pid, req->nfr_epid)) {
618 				SK_ERR("protons token pid mismatch");
619 				err = EPERM;
620 				goto fail;
621 			}
622 			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
623 		}
624 	}
625 
626 	return 0;
627 
628 fail:
629 	VERIFY(err != 0);
630 	SK_ERR("perparation failed (err %d)", err);
631 	return err;
632 }
633 
634 static int
flow_req_prepare(struct nx_flow_req * req,struct kern_nexus * nx,struct flow_mgr * fm,struct ifnet * ifp,flow_route_ctor_fn_t fr_ctor,flow_route_resolve_fn_t fr_resolve,void * fr_arg)635 flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
636     struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
637     flow_route_resolve_fn_t fr_resolve, void *fr_arg)
638 {
639 	int err = 0;
640 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
641 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
642 	uint8_t protocol = req->nfr_ip_protocol;
643 
644 	sa_family_t saf, daf, xaf, af;
645 
646 	saf = SA(saddr)->sa_family;
647 	daf = SA(daddr)->sa_family;
648 	xaf = saf ^ daf;
649 	if (xaf != 0 && xaf != saf && xaf != daf) {
650 		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
651 		return EINVAL;
652 	}
653 	af = (xaf == 0) ? saf : xaf;
654 
655 	bool has_saddr = false, has_daddr = false;
656 	bool has_sport = false, has_dport = false;
657 	uint16_t sport, dport;
658 	uint8_t sa_len;
659 	switch (af) {
660 	case AF_INET:
661 		sa_len = sizeof(struct sockaddr_in);
662 		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
663 		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
664 		sport = SIN(saddr)->sin_port;
665 		dport = SIN(daddr)->sin_port;
666 		has_sport = (sport != 0);
667 		has_dport = (dport != 0);
668 
669 		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
670 		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
671 			SK_ERR("sin_len invalid");
672 			err = EINVAL;
673 			goto fail;
674 		}
675 		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
676 		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
677 			SK_ERR("multicast flow not yet supported");
678 			err = EADDRNOTAVAIL;
679 			goto fail;
680 		}
681 		if (__probable(protocol == IPPROTO_TCP)) {
682 			INC_ATOMIC_INT64_LIM(
683 				net_api_stats.nas_nx_flow_inet6_stream_total);
684 		} else {
685 			INC_ATOMIC_INT64_LIM(
686 				net_api_stats.nas_nx_flow_inet6_dgram_total);
687 		}
688 		break;
689 
690 	case AF_INET6:
691 		sa_len = sizeof(struct sockaddr_in6);
692 		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
693 		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
694 		sport = SIN6(saddr)->sin6_port;
695 		dport = SIN6(daddr)->sin6_port;
696 		has_sport = (sport != 0);
697 		has_dport = (dport != 0);
698 		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
699 		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
700 			SK_ERR("sin_len invalid");
701 			err = EINVAL;
702 			goto fail;
703 		}
704 		/* clear embedded scope if link-local src */
705 		if (has_saddr) {
706 			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
707 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
708 				SIN6(saddr)->sin6_scope_id = ifp->if_index;
709 			}
710 		}
711 		if (has_daddr) {
712 			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
713 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
714 				SIN6(daddr)->sin6_scope_id = ifp->if_index;
715 			}
716 		}
717 		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
718 		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
719 			SK_ERR("multicast flow not yet supported");
720 			err = EADDRNOTAVAIL;
721 			goto fail;
722 		}
723 		if (__probable(protocol == IPPROTO_TCP)) {
724 			INC_ATOMIC_INT64_LIM(
725 				net_api_stats.nas_nx_flow_inet_stream_total);
726 		} else {
727 			INC_ATOMIC_INT64_LIM(
728 				net_api_stats.nas_nx_flow_inet_dgram_total);
729 		}
730 		break;
731 
732 	default:
733 		SK_ERR("unknown address families saf %d daf %d", saf, daf);
734 		err = EINVAL;
735 		goto fail;
736 	}
737 
738 	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
739 	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;
740 
741 	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
742 	    &req->nfr_saddr_gencnt))) {
743 		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
744 		SK_ERR("src address %s is not valid",
745 		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
746 		err = EADDRNOTAVAIL;
747 		goto fail;
748 	}
749 
750 	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
751 	if (!is_tcp_udp) {
752 		if (has_sport || has_dport) {
753 			SK_ERR("non-zero port for IP flow");
754 			return EINVAL;
755 		}
756 	} else {
757 		/* dst:dport as connected, 0:0 as listener, but not partial */
758 		if (has_daddr != has_dport) {
759 			err = EINVAL;
760 			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
761 			goto fail;
762 		}
763 	}
764 
765 	if (!has_daddr && !has_dport) {
766 		req->nfr_flags |= NXFLOWREQF_LISTENER;
767 	}
768 
769 	if (req->nfr_transport_protocol == 0) {
770 		req->nfr_transport_protocol = req->nfr_ip_protocol;
771 	}
772 
773 	req->nfr_ifp = ifp;
774 
775 #if CONFIG_MACF
776 	err = flow_req_check_mac_allowed(req);
777 	if (err != 0) {
778 		SK_ERR("flow req failed MAC check");
779 		goto fail;
780 	}
781 #endif /* CONFIG_MACF */
782 
783 	/* setup flow route and prepare saddr if needed */
784 	if (__probable(has_daddr || has_dport)) {
785 		struct flow_route *fr = NULL;
786 		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
787 		    fr_resolve, fr_arg, &fr);
788 		if (__improbable(err != 0)) {
789 			SK_ERR("flow route lookup failed");
790 			ASSERT(fr == NULL);
791 			goto fail;
792 		}
793 		ASSERT(fr != NULL);
794 		/* Pick up the default source address from flow route. */
795 		if (!has_saddr) {
796 			*saddr = fr->fr_laddr;
797 			SIN(saddr)->sin_port = sport;
798 		}
799 		req->nfr_route = fr;
800 		fr = NULL;
801 	}
802 
803 	err = flow_req_prepare_namespace(req);
804 	if (err != 0) {
805 		goto fail;
806 	}
807 
808 	return 0;
809 
810 fail:
811 	VERIFY(err != 0);
812 	if (req->nfr_route != NULL) {
813 		flow_route_release(req->nfr_route);
814 		req->nfr_route = NULL;
815 	}
816 	SK_ERR("preparation failed (err %d)", err);
817 	return err;
818 }
819 
/*
 * Release any namespace reservations acquired for a failed flow request.
 * Reservations supplied externally (EXT_*_RSV flags set) belong to their
 * provider and are left intact; only reservations we created are freed.
 */
static void
flow_req_cleanup(struct nx_flow_req *req)
{
	/* TCP/UDP port reservation, if we created it */
	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
		netns_release(&req->nfr_port_reservation);
	}

	/* protocol-namespace reservation, if we created it */
	if (protons_token_is_valid(req->nfr_proto_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
		protons_release(&req->nfr_proto_reservation);
	}

	/* custom IPsec (ESP/AH) reservations are always created by us */
	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
	}
}
837 
#if SK_LOG
/*
 * Log a flow request under SK_VERB_FLOW.  Hoisted out of line to reduce
 * kernel stack footprint.
 *
 * Fixes: the AF_INET6 destination branch formatted the *source* address
 * into dst_s (copy/paste of the saddr branch), and the invalid-family
 * fallback bounded dst_s by sizeof(src_s); both now reference daddr/dst_s.
 */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		/* bug fix: format the destination address, not the source */
		(void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		/* bug fix: bound by dst_s's own size */
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%d"
	    " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport,
	    req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */
896 
/*
 * Add a flow entry to the flowswitch on behalf of a flow owner.
 * Returns 0 on success (with req->nfr_flowadv_idx filled in and any
 * IPv6 scope ids rewritten to the flowswitch interface), or an errno.
 * The caller must hold the owner's bucket lock.
 *
 * NOTE(review): the previous header comment ("returns a non-NULL fb that
 * is (writer) locked") did not match this function's int return; it looks
 * copied from elsewhere — verify against callers.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	/* ASIS requests bypass sanitization/route/namespace preparation */
	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, flow entry adds a
	 * retain count on the flow route (we'll always need to release the
	 * refcnt from flow_route_find), and the local address:port of the
	 * flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS);
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	/* hand the flow-advisory index back to the requester */
	req->nfr_flowadv_idx = fe->fe_adv_idx;

	flow_req_dump("added ", req);

	/* fe is always non-NULL here; drop our alloc reference */
	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	/* rewrite scoped (link-local) addresses to this flowswitch's ifp */
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	/* release any reservations flow_req_prepare() acquired */
	flow_req_cleanup(req);

	return err;
}
968 
969 struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr * fm,pid_t pid)970 flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
971 {
972 	return flow_mgr_get_fob_at_idx(fm,
973 	           (pid % fm->fm_owner_buckets_cnt));
974 }
975 
/*
 * Find a flow entry by UUID, scanning every flow owner in every owner
 * bucket.  O(owners) — intended for slow-path (control) use.
 *
 * On a hit, the bucket lock is dropped before returning; the entry is
 * returned as obtained from flow_entry_find_by_uuid (presumably with a
 * reference held — confirm against that function's contract).
 */
struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
{
	uint32_t i;
	struct flow_owner_bucket *fob;
	struct flow_owner *fo;
	struct flow_entry *fe;

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		fob = flow_mgr_get_fob_at_idx(fm, i);
		/* spin-mode hold is enough for a read-only tree walk */
		FOB_LOCK_SPIN(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			fe = flow_entry_find_by_uuid(fo, uuid);
			if (fe != NULL) {
				/* convert out of spin mode before unlocking */
				FOB_LOCK_CONVERT(fob);
				FOB_UNLOCK(fob);
				return fe;
			}
		}
		FOB_UNLOCK(fob);
	}
	return NULL;
}
999 
/*
 * Hash a destination address (IPv4 or IPv6) to its flow route bucket.
 *
 * Uses a Bob Jenkins lookup2-style 96-bit mix seeded with flow_seed.
 * The final index is taken by masking, which assumes
 * fm_route_buckets_cnt is a power of two — NOTE(review): confirm the
 * bucket count is constrained to a power of two at allocation time.
 */
struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	/* golden-ratio constants; c carries the per-boot random seed */
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		/* fold the 4 address bytes into the a/b accumulators */
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		/* fold all four 32-bit words of the v6 address */
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		/* caller must supply an AF_INET or AF_INET6 address */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	/* power-of-two mask selects the bucket index */
	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}
1045 
1046 struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr * fm,uuid_t fr_uuid)1047 flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
1048 {
1049 	union {
1050 		uuid_t   uuid __sk_aligned(8);
1051 		uint64_t u64[2];
1052 	} u;
1053 	uint64_t key;
1054 
1055 	_CASSERT(sizeof(u.uuid) == sizeof(u.u64));
1056 	uuid_copy(u.uuid, fr_uuid);
1057 
1058 	/* XOR fold UUID down to 4-bytes */
1059 	key = (u.u64[0] ^ u.u64[1]);
1060 	key = ((key >> 32) ^ (key & 0xffffffff));
1061 
1062 	/* add some offset to get more entropy */
1063 	return flow_mgr_get_frib_at_idx(fm,
1064 	           ((uint32_t)key % fm->fm_route_id_buckets_cnt));
1065 }
1066 
1067 static int
flow_hash_mask_add(struct flow_mgr * fm,uint32_t mask,int32_t v)1068 flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
1069 {
1070 	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
1071 		if (fm->fm_flow_hash_masks[i] == mask) {
1072 			atomic_add_32(&fm->fm_flow_hash_count[i], v);
1073 			return 0;
1074 		}
1075 	}
1076 	SK_ERR("unkown hash mask 0x%x", mask);
1077 	return ENOTSUP;
1078 }
1079 
/* Register one more flow under `mask'; 0 or ENOTSUP for unknown masks. */
int
flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, 1);
}
1085 
/* Unregister one flow under `mask'; 0 or ENOTSUP for unknown masks. */
int
flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, -1);
}
1091 
#if SK_LOG
/* Log the lookup key before a flow-table search (debug logging only). */
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk_as_string(key, dbgbuf, sizeof(dbgbuf)));
}

/* Log the search result — the entry found, or a miss (debug logging only). */
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe 0x%llx \"%s\"",
		    SK_KVA(fe), fe_as_string(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
/* No-op stubs when skywalk logging is compiled out. */
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */
1118 
1119 struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr * fm,struct flow_key * key)1120 flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
1121 {
1122 	struct cuckoo_node *node = NULL;
1123 	struct flow_entry *fe = NULL;
1124 	uint32_t hash = 0;
1125 	uint16_t saved_mask = key->fk_mask;
1126 
1127 	__flow_mgr_find_fe_by_key_prelog(key);
1128 
1129 	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
1130 		size_t count = fm->fm_flow_hash_count[i];
1131 		uint16_t mask = fm->fm_flow_hash_masks[i];
1132 		if (count == 0 || mask == 0) {
1133 			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1134 			    "[%d] mask=%08x count=%zu skiped",
1135 			    i, mask, count);
1136 			continue;
1137 		}
1138 		key->fk_mask = mask;
1139 		hash = flow_key_hash(key);
1140 		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1141 		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1142 		    "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash,
1143 		    SK_KVA(node));
1144 		if (node != NULL) {
1145 			fe = container_of(node, struct flow_entry, fe_cnode);
1146 			/* v4 only listener fe shouldn't get v6 connection */
1147 			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
1148 			    fe->fe_key.fk_ipver == IPVERSION &&
1149 			    key->fk_ipver == IPV6_VERSION)) {
1150 				flow_entry_release(&fe);
1151 				ASSERT(fe == NULL);
1152 				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1153 				    "\tskip v4 only fe");
1154 				continue;
1155 			}
1156 			break;
1157 		}
1158 	}
1159 
1160 	key->fk_mask = saved_mask;
1161 
1162 	__flow_mgr_find_fe_by_key_epilog(fe);
1163 
1164 	return fe;
1165 }
1166 
1167 struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr * fm,struct flow_key * key)1168 flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
1169 {
1170 	struct cuckoo_node *node = NULL;
1171 	struct flow_entry *fe = NULL;
1172 	uint32_t hash = 0;
1173 
1174 	hash = flow_key_hash(key);
1175 	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1176 	if (node != NULL) {
1177 		fe = container_of(node, struct flow_entry, fe_cnode);
1178 		return fe;
1179 	}
1180 
1181 	/* listener flow confliction will be checked at netns reservation */
1182 	return fe;
1183 }
1184 
1185 void
1186 flow_mgr_foreach_flow(struct flow_mgr *fm,
1187     void (^flow_handler)(struct flow_entry *fe))
1188 {
1189 	cuckoo_hashtable_foreach(fm->fm_flow_table,
1190 	    ^(struct cuckoo_node *node, uint32_t hv) {
1191 		#pragma unused(hv)
1192 		struct flow_entry *fe;
1193 		fe = container_of(node, struct flow_entry, fe_cnode);
1194 		flow_handler(fe);
1195 	}
1196 	    );
1197 }
1198 
1199 struct flow_entry *
flow_mgr_get_host_fe(struct flow_mgr * fm)1200 flow_mgr_get_host_fe(struct flow_mgr *fm)
1201 {
1202 	struct flow_entry *fe;
1203 	fe = fm->fm_host_fe;
1204 	flow_entry_retain(fe);
1205 	return fe;
1206 }
1207