xref: /xnu-8019.80.24/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <skywalk/os_skywalk.h>
31 #include <skywalk/nexus/flowswitch/fsw_var.h>
32 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
33 #include <netinet/in.h>
34 #include <netinet/in_var.h>
35 #include <netinet6/ip6_var.h>
36 #include <netkey/key.h>
37 
38 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
39 
40 #if CONFIG_MACF
41 #include <security/mac_framework.h>
42 #endif /* CONFIG_MACF */
43 
44 #include <net/net_api_stats.h>
45 
/* allocation tag name for all flow-manager related allocations */
#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static kern_allocation_name_t skmem_tag_fsw_flow_mgr;

/* global rwlock guarding flow_mgr_head (the tree of all flow managers) */
static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

/* RB-tree comparator; orders flow managers by their UUIDs */
static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

/* set by flow_mgr_init(); cleared by flow_mgr_fini() */
static int __flow_mgr_inited = 0;
63 
64 void
flow_mgr_init(void)65 flow_mgr_init(void)
66 {
67 	ASSERT(!__flow_mgr_inited);
68 
69 	ASSERT(skmem_tag_fsw_flow_mgr == NULL);
70 	skmem_tag_fsw_flow_mgr =
71 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_FLOW_MGR, 0);
72 	ASSERT(skmem_tag_fsw_flow_mgr != NULL);
73 
74 	RB_INIT(&flow_mgr_head);
75 	__flow_mgr_inited = 1;
76 }
77 
78 void
flow_mgr_fini(void)79 flow_mgr_fini(void)
80 {
81 	if (__flow_mgr_inited) {
82 		VERIFY(RB_EMPTY(&flow_mgr_head));
83 
84 		if (skmem_tag_fsw_flow_mgr != NULL) {
85 			kern_allocation_name_release(skmem_tag_fsw_flow_mgr);
86 			skmem_tag_fsw_flow_mgr = NULL;
87 		}
88 
89 		__flow_mgr_inited = 0;
90 	}
91 }
92 
93 static int
__fe_cuckoo_cmp(struct cuckoo_node * node,void * key0)94 __fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
95 {
96 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
97 	struct flow_key *key = key0;
98 	const struct flow_key *mask;
99 
100 	/*
101 	 * This can probably be made more efficient by having "mask" be
102 	 * set by the original caller at the time the key is initialized,
103 	 * though that needs to be done carefully to ensure there is no
104 	 * mismatch between fk_mask value and "mask" itself.
105 	 */
106 	switch (key->fk_mask) {
107 	case FKMASK_5TUPLE:
108 		mask = &fk_mask_5tuple;
109 		break;
110 	case FKMASK_4TUPLE:
111 		mask = &fk_mask_4tuple;
112 		break;
113 	case FKMASK_3TUPLE:
114 		mask = &fk_mask_3tuple;
115 		break;
116 	case FKMASK_2TUPLE:
117 		mask = &fk_mask_2tuple;
118 		break;
119 	case FKMASK_IPFLOW3:
120 		mask = &fk_mask_ipflow3;
121 		break;
122 	case FKMASK_IPFLOW2:
123 		mask = &fk_mask_ipflow2;
124 		break;
125 	case FKMASK_IPFLOW1:
126 		mask = &fk_mask_ipflow1;
127 		break;
128 	default:
129 		return flow_key_cmp(&fe->fe_key, key);
130 	}
131 
132 	return flow_key_cmp_mask(&fe->fe_key, key, mask);
133 }
134 
135 static void
__fe_cuckoo_retain(struct cuckoo_node * node)136 __fe_cuckoo_retain(struct cuckoo_node *node)
137 {
138 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
139 	return flow_entry_retain(fe);
140 }
141 
142 static void
__fe_cuckoo_release(struct cuckoo_node * node)143 __fe_cuckoo_release(struct cuckoo_node *node)
144 {
145 #pragma unused(node)
146 	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
147 	flow_entry_release(&fe);
148 }
149 
/*
 * Allocate and initialize a flow manager: a cuckoo hash table for
 * flow entries plus three arrays of cache-aligned buckets (flow
 * owners, flow routes, flow route IDs).  On success the manager is
 * given a fresh UUID and inserted into the global flow_mgr_head tree.
 * Returns NULL if any sub-allocation fails.
 *
 * Caller must guarantee frb_cnt is a power of two; all counts must
 * be non-zero.
 */
struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure {fb,frb}_cnt is a power of two */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);

	/* flow entry table keyed by flow key (masked comparison) */
	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		/*
		 * NOTE(review): on this and the following early failure
		 * paths, fm has not yet had its UUID generated nor been
		 * inserted into flow_mgr_head, yet flow_mgr_destroy()
		 * unconditionally asserts a non-null UUID and RB_REMOVEs
		 * the node — confirm these error paths are safe.
		 */
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides (fields are declared const; cast to initialize) */
	*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = fob_cnt;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = fob_sz;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = fob_tot_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = frb_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = frb_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = frb_tot_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = frib_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = frib_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = frib_tot_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	/* key the manager by a fresh UUID and publish it in the tree */
	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	/* mask preference order used when classifying flows */
	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}
268 
269 void
flow_mgr_destroy(struct flow_mgr * fm)270 flow_mgr_destroy(struct flow_mgr *fm)
271 {
272 	uint32_t i;
273 
274 	lck_rw_lock_exclusive(&flow_mgr_lock);
275 	ASSERT(!uuid_is_null(fm->fm_uuid));
276 
277 	if (fm->fm_flow_table != NULL) {
278 		cuckoo_hashtable_free(fm->fm_flow_table);
279 	}
280 
281 	if (fm->fm_owner_buckets != NULL) {
282 		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
283 			struct flow_owner_bucket *fob =
284 			    flow_mgr_get_fob_at_idx(fm, i);
285 			ASSERT(fob->fob_idx == i);
286 			flow_owner_bucket_destroy(fob);
287 		}
288 		flow_owner_buckets_free(fm->fm_owner_buckets,
289 		    fm->fm_owner_bucket_tot_sz);
290 		fm->fm_owner_buckets = NULL;
291 		*(uint32_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = 0;
292 		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_sz = 0;
293 		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = 0;
294 	}
295 	ASSERT(fm->fm_owner_buckets_cnt == 0);
296 	ASSERT(fm->fm_owner_bucket_sz == 0);
297 	ASSERT(fm->fm_owner_bucket_tot_sz == 0);
298 
299 	if (fm->fm_route_buckets != NULL) {
300 		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
301 			struct flow_route_bucket *frb =
302 			    flow_mgr_get_frb_at_idx(fm, i);
303 			ASSERT(frb->frb_idx == i);
304 			flow_route_bucket_destroy(frb);
305 		}
306 		flow_route_buckets_free(fm->fm_route_buckets,
307 		    fm->fm_route_bucket_tot_sz);
308 		fm->fm_route_buckets = NULL;
309 		*(uint32_t *)(uintptr_t)&fm->fm_route_buckets_cnt = 0;
310 		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_sz = 0;
311 		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = 0;
312 	}
313 	ASSERT(fm->fm_route_buckets_cnt == 0);
314 	ASSERT(fm->fm_route_bucket_sz == 0);
315 	ASSERT(fm->fm_route_bucket_tot_sz == 0);
316 
317 	if (fm->fm_route_id_buckets != NULL) {
318 		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
319 			struct flow_route_id_bucket *frib =
320 			    flow_mgr_get_frib_at_idx(fm, i);
321 			ASSERT(frib->frib_idx == i);
322 			flow_route_id_bucket_destroy(frib);
323 		}
324 		flow_route_id_buckets_free(fm->fm_route_id_buckets,
325 		    fm->fm_route_id_bucket_tot_sz);
326 		fm->fm_route_id_buckets = NULL;
327 		*(uint32_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = 0;
328 		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = 0;
329 		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = 0;
330 	}
331 	ASSERT(fm->fm_route_id_buckets_cnt == 0);
332 	ASSERT(fm->fm_route_id_bucket_sz == 0);
333 	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);
334 
335 	uuid_clear(fm->fm_uuid);
336 	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
337 	lck_rw_done(&flow_mgr_lock);
338 
339 	sk_free_type(struct flow_mgr, fm);
340 }
341 
/*
 * Quiesce a flow manager prior to destruction: mark every flow owner
 * bucket dead and purge all flow entries, then purge all flow routes.
 * All buckets of a given kind are locked before any purge of that
 * kind begins, so purging observes a consistent cross-bucket state.
 */
void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	/* pass 1: lock every owner bucket and mark it dead */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	/* pass 2: purge with all owner bucket locks held */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	/* write-lock every route bucket, then every route-ID bucket */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	/* unlock in reverse order of acquisition */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}
393 
394 void
flow_mgr_setup_host_flow(struct flow_mgr * fm,struct nx_flowswitch * fsw)395 flow_mgr_setup_host_flow(struct flow_mgr *fm, struct nx_flowswitch *fsw)
396 {
397 	struct flow_entry *host_fe = fe_alloc(true);
398 	host_fe->fe_key.fk_mask = 0;
399 	host_fe->fe_nx_port = FSW_VP_HOST;
400 	*(struct nx_flowswitch **)(uintptr_t)&host_fe->fe_fsw = fsw;
401 	host_fe->fe_svc_class = KPKT_SC_BE;
402 	host_fe->fe_pid = proc_getpid(kernproc);
403 	host_fe->fe_rx_process = fsw_host_rx;
404 	(void) snprintf(host_fe->fe_proc_name, sizeof(host_fe->fe_proc_name),
405 	    "%s", proc_name_address(kernproc));
406 	flow_entry_retain(host_fe);
407 	fm->fm_host_fe = host_fe;
408 	KPKTQ_INIT(&host_fe->fe_rx_pktq);
409 	KPKTQ_INIT(&host_fe->fe_rx_pktq);
410 }
411 
/*
 * Drop the reference on the host flow entry taken by
 * flow_mgr_setup_host_flow().  The field's address is passed so the
 * release routine can clear fm->fm_host_fe itself (same pattern as
 * the local pointer in __fe_cuckoo_release()).
 */
void
flow_mgr_teardown_host_flow(struct flow_mgr *fm)
{
	flow_entry_release(&fm->fm_host_fe);
}
417 
418 /*
419  * Must be matched with a call to flow_mgr_unlock().  Upon success will
420  * return the flow manager address of the specified UUID, and will acquire
421  * the global flow_mgr_lock as reader.  The caller is then expected to release
422  * the lock.
423  */
424 struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)425 flow_mgr_find_lock(uuid_t uuid)
426 {
427 	struct flow_mgr *fm, find;
428 
429 	uuid_copy(find.fm_uuid, uuid);
430 
431 	lck_rw_lock_shared(&flow_mgr_lock);
432 
433 	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
434 	if (fm == NULL) {
435 		lck_rw_done(&flow_mgr_lock);
436 		return NULL;
437 	}
438 
439 	/* caller is expected to call flow_mgr_unlock() when done */
440 	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
441 	return fm;
442 }
443 
/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 * Drops the global flow_mgr_lock that the finder left held (shared).
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
452 
453 static inline int
fm_cmp(const struct flow_mgr * a,const struct flow_mgr * b)454 fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
455 {
456 	return uuid_compare(a->fm_uuid, b->fm_uuid);
457 }
458 
459 static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 * addr)460 flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
461 {
462 	struct in6_addr *in6;
463 	in6 = &addr->sin6_addr;
464 	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
465 		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
466 		in6->s6_addr16[1] = 0;
467 	}
468 }
469 
#if CONFIG_MACF
/*
 * Run the MAC framework policy check for a flow request: the listen
 * hook for listener flows, the connect hook otherwise.  Returns 0 if
 * the flow is allowed (custom IP protocols are always allowed — no
 * MAC hook applies to them).
 *
 * NOTE(review): the return type is bool but the mac_* hooks return
 * int error codes; the caller treats the result as an errno-style
 * int — confirm whether the truncation to bool is intended.
 */
static bool
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;

	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/*
		 * Custom IP protocol, treated as IP datagram type; no
		 * MAC check applies.  (Fixed: removed a dead store to
		 * socktype that preceded this return.)
		 */
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		           &req->nfr_saddr.sa, socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		           &req->nfr_daddr.sa, socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */
499 
500 static bool
flow_req_needs_netns_reservation(struct nx_flow_req * req)501 flow_req_needs_netns_reservation(struct nx_flow_req *req)
502 {
503 	uint8_t proto = req->nfr_ip_protocol;
504 	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
505 }
506 
507 static bool
flow_req_needs_protons_reservation(struct nx_flow_req * req)508 flow_req_needs_protons_reservation(struct nx_flow_req *req)
509 {
510 	uint8_t proto = req->nfr_ip_protocol;
511 	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
512 	       proto != IPPROTO_ESP && proto != IPPROTO_AH;
513 }
514 
515 static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req * req)516 flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
517 {
518 	uint8_t proto = req->nfr_ip_protocol;
519 	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
520 }
521 
522 static void
flow_set_port_info(struct ns_flow_info * nfi,struct nx_flow_req * req)523 flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
524 {
525 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
526 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
527 
528 	bzero(nfi, sizeof(struct ns_flow_info));
529 
530 	nfi->nfi_ifp = req->nfr_ifp;
531 
532 	nfi->nfi_laddr = *saddr;
533 	nfi->nfi_faddr = *daddr;
534 
535 	nfi->nfi_protocol = req->nfr_ip_protocol;
536 
537 	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
538 	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));
539 
540 	nfi->nfi_owner_pid = req->nfr_pid;
541 	if (req->nfr_epid != -1) {
542 		nfi->nfi_effective_pid = req->nfr_epid;
543 		proc_name(req->nfr_epid, nfi->nfi_effective_name,
544 		    sizeof(nfi->nfi_effective_name));
545 	} else {
546 		nfi->nfi_effective_pid = -1;
547 	}
548 
549 	proc_name(req->nfr_pid, nfi->nfi_owner_name,
550 	    sizeof(nfi->nfi_owner_name));
551 }
552 
553 static int
flow_req_prepare_namespace(struct nx_flow_req * req)554 flow_req_prepare_namespace(struct nx_flow_req *req)
555 {
556 #if SK_LOG
557 	char src_s[MAX_IPv6_STR_LEN];
558 #endif /* SK_LOG */
559 	int err = 0;
560 
561 	if (flow_req_needs_netns_reservation(req)) {
562 		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
563 			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
564 			struct ns_flow_info nfi;
565 			netns_token ns_token;
566 			flow_set_port_info(&nfi, req);
567 			err = flow_namespace_create(saddr,
568 			    req->nfr_ip_protocol, &ns_token,
569 			    req->nfr_flags & NXFLOWREQF_LISTENER, &nfi);
570 			if (err != 0) {
571 				SK_ERR("netns for %s.%u failed",
572 				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
573 				    sk_sa_get_port(SA(saddr)));
574 				goto fail;
575 			}
576 			req->nfr_port_reservation = ns_token;
577 			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
578 		} else {
579 			/* Validate PID associated with provided reservation */
580 			struct ns_flow_info nfi = {};
581 			err = netns_get_flow_info(&req->nfr_port_reservation,
582 			    &nfi);
583 			/* flow info could be NULL for socket flow */
584 			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
585 			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
586 			    req->nfr_epid))) {
587 				SK_ERR("netns flow info mismatch, "
588 				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
589 				    req->nfr_pid, req->nfr_epid,
590 				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
591 				err = EPERM;
592 				goto fail;
593 			}
594 			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
595 		}
596 	}
597 
598 	if (flow_req_needs_ipsec_reservation(req)) {
599 		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
600 		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
601 		void *ipsec_token = NULL;
602 		ASSERT(req->nfr_ipsec_reservation == NULL);
603 		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
604 		    daddr, req->nfr_ip_protocol);
605 		if (err != 0) {
606 			SK_ERR("custom ipsec %u reserve %s failed",
607 			    req->nfr_ip_protocol,
608 			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
609 			goto fail;
610 		}
611 		req->nfr_ipsec_reservation = ipsec_token;
612 	}
613 
614 	if (flow_req_needs_protons_reservation(req)) {
615 		struct protons_token *ns_token = NULL;
616 		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
617 			err = protons_reserve(&ns_token, req->nfr_pid,
618 			    req->nfr_epid, req->nfr_ip_protocol);
619 			if (err != 0) {
620 				SK_ERR("protocol %u namespace failed",
621 				    req->nfr_ip_protocol);
622 				goto fail;
623 			}
624 			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
625 			req->nfr_proto_reservation = ns_token;
626 		} else {
627 			/* Validate PID associated with provided reservation */
628 			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
629 			    req->nfr_pid, req->nfr_epid)) {
630 				SK_ERR("protons token pid mismatch");
631 				err = EPERM;
632 				goto fail;
633 			}
634 			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
635 		}
636 	}
637 
638 	return 0;
639 
640 fail:
641 	VERIFY(err != 0);
642 	SK_ERR("perparation failed (err %d)", err);
643 	return err;
644 }
645 
646 static int
flow_req_prepare(struct nx_flow_req * req,struct kern_nexus * nx,struct flow_mgr * fm,struct ifnet * ifp,flow_route_ctor_fn_t fr_ctor,flow_route_resolve_fn_t fr_resolve,void * fr_arg)647 flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
648     struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
649     flow_route_resolve_fn_t fr_resolve, void *fr_arg)
650 {
651 	int err = 0;
652 	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
653 	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
654 	uint8_t protocol = req->nfr_ip_protocol;
655 
656 	sa_family_t saf, daf, xaf, af;
657 
658 	saf = SA(saddr)->sa_family;
659 	daf = SA(daddr)->sa_family;
660 	xaf = saf ^ daf;
661 	if (xaf != 0 && xaf != saf && xaf != daf) {
662 		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
663 		return EINVAL;
664 	}
665 	af = (xaf == 0) ? saf : xaf;
666 
667 	bool has_saddr = false, has_daddr = false;
668 	bool has_sport = false, has_dport = false;
669 	uint16_t sport, dport;
670 	uint8_t sa_len;
671 	switch (af) {
672 	case AF_INET:
673 		sa_len = sizeof(struct sockaddr_in);
674 		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
675 		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
676 		sport = SIN(saddr)->sin_port;
677 		dport = SIN(daddr)->sin_port;
678 		has_sport = (sport != 0);
679 		has_dport = (dport != 0);
680 
681 		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
682 		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
683 			SK_ERR("sin_len invalid");
684 			err = EINVAL;
685 			goto fail;
686 		}
687 		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
688 		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
689 			SK_ERR("multicast flow not yet supported");
690 			err = EADDRNOTAVAIL;
691 			goto fail;
692 		}
693 		if (__probable(protocol == IPPROTO_TCP)) {
694 			INC_ATOMIC_INT64_LIM(
695 				net_api_stats.nas_nx_flow_inet6_stream_total);
696 		} else {
697 			INC_ATOMIC_INT64_LIM(
698 				net_api_stats.nas_nx_flow_inet6_dgram_total);
699 		}
700 		break;
701 
702 	case AF_INET6:
703 		sa_len = sizeof(struct sockaddr_in6);
704 		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
705 		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
706 		sport = SIN6(saddr)->sin6_port;
707 		dport = SIN6(daddr)->sin6_port;
708 		has_sport = (sport != 0);
709 		has_dport = (dport != 0);
710 		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
711 		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
712 			SK_ERR("sin_len invalid");
713 			err = EINVAL;
714 			goto fail;
715 		}
716 		/* clear embedded scope if link-local src */
717 		if (has_saddr) {
718 			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
719 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
720 				SIN6(saddr)->sin6_scope_id = ifp->if_index;
721 			}
722 		}
723 		if (has_daddr) {
724 			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
725 			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
726 				SIN6(daddr)->sin6_scope_id = ifp->if_index;
727 			}
728 		}
729 		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
730 		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
731 			SK_ERR("multicast flow not yet supported");
732 			err = EADDRNOTAVAIL;
733 			goto fail;
734 		}
735 		if (__probable(protocol == IPPROTO_TCP)) {
736 			INC_ATOMIC_INT64_LIM(
737 				net_api_stats.nas_nx_flow_inet_stream_total);
738 		} else {
739 			INC_ATOMIC_INT64_LIM(
740 				net_api_stats.nas_nx_flow_inet_dgram_total);
741 		}
742 		break;
743 
744 	default:
745 		SK_ERR("unknown address families saf %d daf %d", saf, daf);
746 		err = EINVAL;
747 		goto fail;
748 	}
749 
750 	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
751 	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;
752 
753 	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
754 	    &req->nfr_saddr_gencnt))) {
755 #if SK_LOG
756 		char src_s[MAX_IPv6_STR_LEN];
757 #endif /* SK_LOG */
758 		SK_ERR("src address %s is not valid",
759 		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
760 		err = EADDRNOTAVAIL;
761 		goto fail;
762 	}
763 
764 	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
765 	if (!is_tcp_udp) {
766 		if (has_sport || has_dport) {
767 			SK_ERR("non-zero port for IP flow");
768 			return EINVAL;
769 		}
770 	} else {
771 		/* dst:dport as connected, 0:0 as listener, but not partial */
772 		if (has_daddr != has_dport) {
773 			err = EINVAL;
774 			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
775 			goto fail;
776 		}
777 	}
778 
779 	if (!has_daddr && !has_dport) {
780 		req->nfr_flags |= NXFLOWREQF_LISTENER;
781 	}
782 
783 	if (req->nfr_transport_protocol == 0) {
784 		req->nfr_transport_protocol = req->nfr_ip_protocol;
785 	}
786 
787 	req->nfr_ifp = ifp;
788 
789 #if CONFIG_MACF
790 	err = flow_req_check_mac_allowed(req);
791 	if (err != 0) {
792 		SK_ERR("flow req failed MAC check");
793 		goto fail;
794 	}
795 #endif /* CONFIG_MACF */
796 
797 	/* setup flow route and prepare saddr if needed */
798 	if (__probable(has_daddr || has_dport)) {
799 		struct flow_route *fr = NULL;
800 		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
801 		    fr_resolve, fr_arg, &fr);
802 		if (__improbable(err != 0)) {
803 			SK_ERR("flow route lookup failed");
804 			ASSERT(fr == NULL);
805 			goto fail;
806 		}
807 		ASSERT(fr != NULL);
808 		/* Pick up the default source address from flow route. */
809 		if (!has_saddr) {
810 			*saddr = fr->fr_laddr;
811 			SIN(saddr)->sin_port = sport;
812 		}
813 		req->nfr_route = fr;
814 		fr = NULL;
815 	}
816 
817 	err = flow_req_prepare_namespace(req);
818 	if (err != 0) {
819 		goto fail;
820 	}
821 
822 	return 0;
823 
824 fail:
825 	VERIFY(err != 0);
826 	if (req->nfr_route != NULL) {
827 		flow_route_release(req->nfr_route);
828 		req->nfr_route = NULL;
829 	}
830 	SK_ERR("preparation failed (err %d)", err);
831 	return err;
832 }
833 
834 static void
flow_req_cleanup(struct nx_flow_req * req)835 flow_req_cleanup(struct nx_flow_req *req)
836 {
837 	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
838 	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
839 		netns_release(&req->nfr_port_reservation);
840 	}
841 
842 	if (protons_token_is_valid(req->nfr_proto_reservation) &&
843 	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
844 		protons_release(&req->nfr_proto_reservation);
845 	}
846 
847 	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
848 		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
849 	}
850 }
851 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		/* fixed copy/paste: format daddr here, not saddr */
		(void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		/* fixed: bound the copy by dst_s, not src_s */
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%d"
	    " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport,
	    req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */
910 
/*
 * Create and register a flow entry for the given request.  The owner
 * bucket lock for "fo" must be held by the caller.  On success,
 * returns 0 with the request updated (nfr_flowadv_idx, and scope IDs
 * for scoped IPv6 addresses); on failure, releases any reservations
 * taken during preparation and returns errno.
 *
 * NOTE(review): the previous header comment here ("Upon success,
 * returns a non-NULL fb that is (writer) locked") did not match this
 * function's int return — confirm the intended contract.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	/* unless the request is to be taken as-is, sanitize it first */
	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, flow entry adds a
	 * retain count on the flow route (we'll always need to release the
	 * refcnt from flow_route_find), and the local address:port of the
	 * flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	/* a TCP/UDP flow with a source port must hold a netns token */
	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS);
	/* flow-advisory flag and index must agree (exactly one holds) */
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	req->nfr_flowadv_idx = fe->fe_adv_idx;

	flow_req_dump("added ", req);

	/* drop the local reference from flow_entry_alloc() */
	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	/* report scope IDs back for scoped (e.g. link-local) v6 addrs */
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}
982 
983 struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr * fm,pid_t pid)984 flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
985 {
986 	return flow_mgr_get_fob_at_idx(fm,
987 	           (pid % fm->fm_owner_buckets_cnt));
988 }
989 
990 struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr * fm,uuid_t uuid)991 flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
992 {
993 	uint32_t i;
994 	struct flow_owner_bucket *fob;
995 	struct flow_owner *fo;
996 	struct flow_entry *fe;
997 
998 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
999 		fob = flow_mgr_get_fob_at_idx(fm, i);
1000 		FOB_LOCK_SPIN(fob);
1001 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1002 			fe = flow_entry_find_by_uuid(fo, uuid);
1003 			if (fe != NULL) {
1004 				FOB_LOCK_CONVERT(fob);
1005 				FOB_UNLOCK(fob);
1006 				return fe;
1007 			}
1008 		}
1009 		FOB_UNLOCK(fob);
1010 	}
1011 	return NULL;
1012 }
1013 
1014 struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr * fm,union sockaddr_in_4_6 * daddr)1015 flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
1016     union sockaddr_in_4_6 *daddr)
1017 {
1018 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;
1019 
1020 	switch (SA(daddr)->sa_family) {
1021 	case AF_INET: {
1022 		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
1023 		b += ((uint32_t)p[3]);
1024 		a += ((uint32_t)p[2]) << 24;
1025 		a += ((uint32_t)p[1]) << 16;
1026 		a += ((uint32_t)p[0]) << 8;
1027 		break;
1028 	}
1029 
1030 	case AF_INET6: {
1031 		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
1032 		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
1033 		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
1034 		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
1035 		break;
1036 	}
1037 
1038 	default:
1039 		VERIFY(0);
1040 		/* NOTREACHED */
1041 		__builtin_unreachable();
1042 	}
1043 
1044 	/* mix */
1045 	a -= b; a -= c; a ^= (c >> 13);
1046 	b -= c; b -= a; b ^= (a << 8);
1047 	c -= a; c -= b; c ^= (b >> 13);
1048 	a -= b; a -= c; a ^= (c >> 12);
1049 	b -= c; b -= a; b ^= (a << 16);
1050 	c -= a; c -= b; c ^= (b >> 5);
1051 	a -= b; a -= c; a ^= (c >> 3);
1052 	b -= c; b -= a; b ^= (a << 10);
1053 	c -= a; c -= b; c ^= (b >> 15);
1054 
1055 	c &= (fm->fm_route_buckets_cnt - 1);
1056 
1057 	return flow_mgr_get_frb_at_idx(fm, c);
1058 }
1059 
1060 struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr * fm,uuid_t fr_uuid)1061 flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
1062 {
1063 	union {
1064 		uuid_t   uuid __sk_aligned(8);
1065 		uint64_t u64[2];
1066 	} u;
1067 	uint64_t key;
1068 
1069 	_CASSERT(sizeof(u.uuid) == sizeof(u.u64));
1070 	uuid_copy(u.uuid, fr_uuid);
1071 
1072 	/* XOR fold UUID down to 4-bytes */
1073 	key = (u.u64[0] ^ u.u64[1]);
1074 	key = ((key >> 32) ^ (key & 0xffffffff));
1075 
1076 	/* add some offset to get more entropy */
1077 	return flow_mgr_get_frib_at_idx(fm,
1078 	           ((uint32_t)key % fm->fm_route_id_buckets_cnt));
1079 }
1080 
1081 static int
flow_hash_mask_add(struct flow_mgr * fm,uint32_t mask,int32_t v)1082 flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
1083 {
1084 	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
1085 		if (fm->fm_flow_hash_masks[i] == mask) {
1086 			atomic_add_32(&fm->fm_flow_hash_count[i], v);
1087 			return 0;
1088 		}
1089 	}
1090 	SK_ERR("unkown hash mask 0x%x", mask);
1091 	return ENOTSUP;
1092 }
1093 
1094 int
flow_mgr_flow_hash_mask_add(struct flow_mgr * fm,uint32_t mask)1095 flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
1096 {
1097 	return flow_hash_mask_add(fm, mask, 1);
1098 }
1099 
1100 int
flow_mgr_flow_hash_mask_del(struct flow_mgr * fm,uint32_t mask)1101 flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
1102 {
1103 	return flow_hash_mask_add(fm, mask, -1);
1104 }
1105 
1106 struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr * fm,struct flow_key * key)1107 flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
1108 {
1109 #if SK_LOG
1110 	char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; /* just for debug message */
1111 #endif /* SK_LOG */
1112 	struct cuckoo_node *node = NULL;
1113 	struct flow_entry *fe = NULL;
1114 	uint32_t hash = 0;
1115 	uint16_t saved_mask = key->fk_mask;
1116 
1117 	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
1118 	    fk_as_string(key, dbgbuf, sizeof(dbgbuf)));
1119 
1120 	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
1121 		size_t count = fm->fm_flow_hash_count[i];
1122 		uint16_t mask = fm->fm_flow_hash_masks[i];
1123 		if (count == 0 || mask == 0) {
1124 			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1125 			    "[%d] mask=%08x count=%zu skiped",
1126 			    i, mask, count);
1127 			continue;
1128 		}
1129 		key->fk_mask = mask;
1130 		hash = flow_key_hash(key);
1131 		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1132 		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1133 		    "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash,
1134 		    SK_KVA(node));
1135 		if (node != NULL) {
1136 			fe = container_of(node, struct flow_entry, fe_cnode);
1137 			/* v4 only listener fe shouldn't get v6 connection */
1138 			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
1139 			    fe->fe_key.fk_ipver == IPVERSION &&
1140 			    key->fk_ipver == IPV6_VERSION)) {
1141 				flow_entry_release(&fe);
1142 				ASSERT(fe == NULL);
1143 				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
1144 				    "\tskip v4 only fe");
1145 				continue;
1146 			}
1147 			break;
1148 		}
1149 	}
1150 
1151 	key->fk_mask = saved_mask;
1152 
1153 	return fe;
1154 }
1155 
1156 struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr * fm,struct flow_key * key)1157 flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
1158 {
1159 	struct cuckoo_node *node = NULL;
1160 	struct flow_entry *fe = NULL;
1161 	uint32_t hash = 0;
1162 
1163 	hash = flow_key_hash(key);
1164 	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
1165 	if (node != NULL) {
1166 		fe = container_of(node, struct flow_entry, fe_cnode);
1167 		return fe;
1168 	}
1169 
1170 	/* listener flow confliction will be checked at netns reservation */
1171 	return fe;
1172 }
1173 
1174 void
1175 flow_mgr_foreach_flow(struct flow_mgr *fm,
1176     void (^flow_handler)(struct flow_entry *fe))
1177 {
1178 	cuckoo_hashtable_foreach(fm->fm_flow_table,
1179 	    ^(struct cuckoo_node *node, uint32_t hv) {
1180 		#pragma unused(hv)
1181 		struct flow_entry *fe;
1182 		fe = container_of(node, struct flow_entry, fe_cnode);
1183 		flow_handler(fe);
1184 	}
1185 	    );
1186 }
1187 
1188 struct flow_entry *
flow_mgr_get_host_fe(struct flow_mgr * fm)1189 flow_mgr_get_host_fe(struct flow_mgr *fm)
1190 {
1191 	struct flow_entry *fe;
1192 	fe = fm->fm_host_fe;
1193 	flow_entry_retain(fe);
1194 	return fe;
1195 }
1196