/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/os_skywalk.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/ip6_var.h>
#include <netkey/key.h>
#include <netinet/udp.h>

#include <skywalk/nexus/flowswitch/flow/flow_var.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#include <net/net_api_stats.h>

#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);

static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

static int __flow_mgr_inited = 0;

void
flow_mgr_init(void)
{
	ASSERT(!__flow_mgr_inited);

	RB_INIT(&flow_mgr_head);
	__flow_mgr_inited = 1;
}

void
flow_mgr_fini(void)
{
	if (__flow_mgr_inited) {
		VERIFY(RB_EMPTY(&flow_mgr_head));

		__flow_mgr_inited = 0;
	}
}
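
/*
 * Cuckoo hashtable callbacks.  The flow table stores flow entries keyed
 * by flow key; a lookup key carries its class in fk_mask, and the
 * comparator below selects the matching precomputed mask so that only
 * the fields relevant to that class (e.g. 5-tuple vs. 3-tuple) take
 * part in the comparison.
 */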

static int
__fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	struct flow_key *key = key0;
	const struct flow_key *mask;

	/*
	 * This can probably be made more efficient by having "mask" be
	 * set by the original caller at the time the key is initialized,
	 * though that needs to be done carefully to ensure there is no
	 * mismatch between fk_mask value and "mask" itself.
	 */
	switch (key->fk_mask) {
	case FKMASK_5TUPLE:
		mask = &fk_mask_5tuple;
		break;
	case FKMASK_4TUPLE:
		mask = &fk_mask_4tuple;
		break;
	case FKMASK_3TUPLE:
		mask = &fk_mask_3tuple;
		break;
	case FKMASK_2TUPLE:
		mask = &fk_mask_2tuple;
		break;
	case FKMASK_IPFLOW3:
		mask = &fk_mask_ipflow3;
		break;
	case FKMASK_IPFLOW2:
		mask = &fk_mask_ipflow2;
		break;
	case FKMASK_IPFLOW1:
		mask = &fk_mask_ipflow1;
		break;
	default:
		return flow_key_cmp(&fe->fe_key, key);
	}

	return flow_key_cmp_mask(&fe->fe_key, key, mask);
}

static void
__fe_cuckoo_retain(struct cuckoo_node *node)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	flow_entry_retain(fe);
}

static void
__fe_cuckoo_release(struct cuckoo_node *node)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	flow_entry_release(&fe);
}
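
/*
 * Illustrative only: a flowswitch typically creates its flow manager at
 * instance-attach time with tunable table/bucket counts, e.g.
 *
 *	fm = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
 *	...
 *	flow_mgr_destroy(fm);
 *
 * where frb_cnt must be a power of 2 (see the index masking in
 * flow_mgr_get_frb_by_addr() below).
 */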

struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure frb_cnt is a power of 2 */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);

	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = fob_cnt;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = fob_sz;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = fob_tot_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = frb_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = frb_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = frb_tot_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = frib_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = frib_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = frib_tot_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}
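
/*
 * Note: the casts through (uintptr_t) above ("const overrides") write
 * through fields that are declared const in their structures; the
 * counts and sizes are set once here and treated as immutable for the
 * lifetime of the manager (cleared again only in flow_mgr_destroy()).
 */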

void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	if (fm->fm_owner_buckets != NULL) {
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	if (fm->fm_route_buckets != NULL) {
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	if (fm->fm_route_id_buckets != NULL) {
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}
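
/*
 * flow_mgr_terminate() is the teardown pass that runs while flows may
 * still be present: it purges all flow entries and flow routes.
 * flow_mgr_destroy() then frees the (by now empty) buckets and tables.
 */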

void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}
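
/*
 * Note on ordering above: every owner bucket is marked FOBF_DEAD under
 * its lock before any purge begins, so no new flow can slip in while
 * the sweep runs; for routes, all route buckets are write-locked before
 * all route-id buckets, and the two sets are unlocked in the reverse
 * order.
 */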

/*
 * Must be matched with a call to flow_mgr_unlock().  Upon success,
 * returns the flow manager for the specified UUID, with the global
 * flow_mgr_lock held as reader; the caller is then expected to release
 * the lock.
 */
struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)
{
	struct flow_mgr *fm, find;

	uuid_copy(find.fm_uuid, uuid);

	lck_rw_lock_shared(&flow_mgr_lock);

	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
	if (fm == NULL) {
		lck_rw_done(&flow_mgr_lock);
		return NULL;
	}

	/* caller is expected to call flow_mgr_unlock() when done */
	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
	return fm;
}

/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
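
/*
 * Example usage (illustrative sketch):
 *
 *	struct flow_mgr *fm = flow_mgr_find_lock(uuid);
 *	if (fm != NULL) {
 *		... flow_mgr_lock is held shared; fm is safe to use ...
 *		flow_mgr_unlock();
 *	}
 */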

static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}

static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
{
	struct in6_addr *in6;
	in6 = &addr->sin6_addr;
	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
		in6->s6_addr16[1] = 0;
	}
}
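
/*
 * Example: under the KAME convention, a link-local address such as
 * fe80::1 on interface index 4 is stored with the index embedded in the
 * second 16-bit word of the address (fe80:4::1).  The routine above
 * moves that index into sin6_scope_id and zeroes the embedded word.
 */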

#if CONFIG_MACF
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;
	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* Custom IP protocol, which is treated as IP datagram type */
		socktype = SOCK_DGRAM;
		break;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		           SA(&req->nfr_saddr.sa), socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		           SA(&req->nfr_daddr.sa), socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */

static bool
flow_req_needs_netns_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
}

static bool
flow_req_needs_protons_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
	       proto != IPPROTO_ESP && proto != IPPROTO_AH;
}

static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
}
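
/*
 * Taken together, the three predicates above partition flow requests by
 * IP protocol: TCP and UDP flows reserve their local port in the netns
 * namespace, ESP/AH flows reserve a custom IPsec entry, and every other
 * IP protocol reserves the protocol number itself via protons.
 */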

static void
flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
{
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;

	bzero(nfi, sizeof(struct ns_flow_info));

	nfi->nfi_ifp = req->nfr_ifp;

	nfi->nfi_laddr = *saddr;
	nfi->nfi_faddr = *daddr;

	nfi->nfi_protocol = req->nfr_ip_protocol;

	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));

	nfi->nfi_owner_pid = req->nfr_pid;
	if (req->nfr_epid != -1) {
		nfi->nfi_effective_pid = req->nfr_epid;
		proc_name(req->nfr_epid, nfi->nfi_effective_name,
		    sizeof(nfi->nfi_effective_name));
	} else {
		nfi->nfi_effective_pid = -1;
	}

	proc_name(req->nfr_pid, nfi->nfi_owner_name,
	    sizeof(nfi->nfi_owner_name));
}

static int
flow_req_prepare_namespace(struct nx_flow_req *req)
{
	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
	int err = 0;

	if (flow_req_needs_netns_reservation(req)) {
		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
			struct ns_flow_info nfi;
			netns_token ns_token;
			flow_set_port_info(&nfi, req);
			err = flow_namespace_create(saddr,
			    req->nfr_ip_protocol, &ns_token,
			    req->nfr_flags, &nfi);
			if (err != 0) {
				SK_ERR("netns for %s.%u failed",
				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
				    sk_sa_get_port(SA(saddr)));
				goto fail;
			}
			req->nfr_port_reservation = ns_token;
			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
		} else {
			/* Validate PID associated with provided reservation */
			struct ns_flow_info nfi = {};
			err = netns_get_flow_info(&req->nfr_port_reservation,
			    &nfi);
			/* flow info could be NULL for socket flow */
			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
			    req->nfr_epid))) {
				SK_ERR("netns flow info mismatch, "
				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
				    req->nfr_pid, req->nfr_epid,
				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
		}
	}

	if (flow_req_needs_ipsec_reservation(req)) {
		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
		void *ipsec_token = NULL;
		ASSERT(req->nfr_ipsec_reservation == NULL);
		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
		    daddr, req->nfr_ip_protocol);
		if (err != 0) {
			SK_ERR("custom ipsec %u reserve %s failed",
			    req->nfr_ip_protocol,
			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
			goto fail;
		}
		req->nfr_ipsec_reservation = ipsec_token;
	}

	if (flow_req_needs_protons_reservation(req)) {
		struct protons_token *ns_token = NULL;
		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
			err = protons_reserve(&ns_token, req->nfr_pid,
			    req->nfr_epid, req->nfr_ip_protocol);
			if (err != 0) {
				SK_ERR("protocol %u namespace failed",
				    req->nfr_ip_protocol);
				goto fail;
			}
			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
			req->nfr_proto_reservation = ns_token;
		} else {
			/* Validate PID associated with provided reservation */
			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
			    req->nfr_pid, req->nfr_epid)) {
				SK_ERR("protons token pid mismatch");
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

static int
flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
    struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
    flow_route_resolve_fn_t fr_resolve, void *fr_arg)
{
	int err = 0;
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;

	sa_family_t saf, daf, xaf, af;

	saf = SA(saddr)->sa_family;
	daf = SA(daddr)->sa_family;
	xaf = saf ^ daf;
	if (xaf != 0 && xaf != saf && xaf != daf) {
		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
		return EINVAL;
	}
	af = (xaf == 0) ? saf : xaf;

	bool has_saddr = false, has_daddr = false;
	bool has_sport = false, has_dport = false;
	uint16_t sport, dport;
	uint8_t sa_len;
	switch (af) {
	case AF_INET:
		sa_len = sizeof(struct sockaddr_in);
		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
		sport = SIN(saddr)->sin_port;
		dport = SIN(daddr)->sin_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);

		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
			SK_ERR("sin_len invalid");
			err = EINVAL;
			goto fail;
		}
		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_dgram_total);
		}
		break;

	case AF_INET6:
		sa_len = sizeof(struct sockaddr_in6);
		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
		sport = SIN6(saddr)->sin6_port;
		dport = SIN6(daddr)->sin6_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);
		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
			SK_ERR("sin6_len invalid");
			err = EINVAL;
			goto fail;
		}
		/* clear embedded scope if link-local src */
		if (has_saddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
				SIN6(saddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if (has_daddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
				SIN6(daddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_dgram_total);
		}
		break;

	default:
		SK_ERR("unknown address families saf %d daf %d", saf, daf);
		err = EINVAL;
		goto fail;
	}

	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;

	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
	    &req->nfr_saddr_gencnt))) {
		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
		SK_ERR("src address %s is not valid",
		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
		err = EADDRNOTAVAIL;
		goto fail;
	}

	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
	if (!is_tcp_udp) {
		if (has_sport || has_dport) {
			SK_ERR("non-zero port for IP flow");
			err = EINVAL;
			goto fail;
		}
	} else {
		/* dst:dport as connected, 0:0 as listener, but not partial */
		if (has_daddr != has_dport) {
			err = EINVAL;
			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
			goto fail;
		}
	}

	if (!has_daddr && !has_dport) {
		req->nfr_flags |= NXFLOWREQF_LISTENER;
	}

	if (req->nfr_transport_protocol == 0) {
		req->nfr_transport_protocol = req->nfr_ip_protocol;
	}

	bool is_child_flow = !uuid_is_null(req->nfr_parent_flow_uuid);
	if ((is_child_flow && req->nfr_flow_demux_count == 0) ||
	    (!is_child_flow && req->nfr_flow_demux_count > 0)) {
		err = EINVAL;
		SK_ERR("invalid flow demux count");
		goto fail;
	}

	if (req->nfr_flow_demux_count > 0) {
		if (req->nfr_ip_protocol != IPPROTO_UDP) {
			err = EINVAL;
			SK_ERR("invalid ip protocol(%u) for flow demux",
			    req->nfr_ip_protocol);
			goto fail;
		}

		for (int i = 0; i < req->nfr_flow_demux_count; i++) {
			if (req->nfr_flow_demux_patterns[i].fdp_len > FLOW_DEMUX_MAX_LEN ||
			    req->nfr_flow_demux_patterns[i].fdp_len == 0) {
				err = EINVAL;
				SK_ERR("invalid flow demux pattern len %u",
				    req->nfr_flow_demux_patterns[i].fdp_len);
				goto fail;
			}
			if (req->nfr_flow_demux_patterns[i].fdp_offset +
			    req->nfr_flow_demux_patterns[i].fdp_len > MAX_PKT_DEMUX_LIMIT) {
				err = EINVAL;
				SK_ERR("invalid demux offset plus length(%u > %d)",
				    req->nfr_flow_demux_patterns[i].fdp_offset +
				    req->nfr_flow_demux_patterns[i].fdp_len, MAX_PKT_DEMUX_LIMIT);
				goto fail;
			}
		}
	}

	req->nfr_ifp = ifp;

#if CONFIG_MACF
	err = flow_req_check_mac_allowed(req);
	if (err != 0) {
		SK_ERR("flow req failed MAC check");
		goto fail;
	}
#endif /* CONFIG_MACF */

	/* setup flow route and prepare saddr if needed */
	if (__probable(has_daddr || has_dport)) {
		struct flow_route *fr = NULL;
		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
		    fr_resolve, fr_arg, &fr);
		if (__improbable(err != 0)) {
			SK_ERR("flow route lookup failed");
			ASSERT(fr == NULL);
			goto fail;
		}
		ASSERT(fr != NULL);
		/* Pick up the default source address from the flow route. */
		if (!has_saddr) {
			*saddr = fr->fr_laddr;
			SIN(saddr)->sin_port = sport;
		}
		req->nfr_route = fr;
		fr = NULL;
	}

	/* child flows do not hold namespace references */
	if (__probable(uuid_is_null(req->nfr_parent_flow_uuid))) {
		err = flow_req_prepare_namespace(req);
		if (err != 0) {
			goto fail;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	if (req->nfr_route != NULL) {
		flow_route_release(req->nfr_route);
		req->nfr_route = NULL;
	}
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

static void
flow_req_cleanup(struct nx_flow_req *req)
{
	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
		netns_release(&req->nfr_port_reservation);
	}

	if (protons_token_is_valid(req->nfr_proto_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
		protons_release(&req->nfr_proto_reservation);
	}

	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
	}
}
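
/*
 * Note: cleanup releases only reservations created on behalf of this
 * request; externally provided port/protocol reservations are tagged
 * with NXFLOWREQF_EXT_PORT_RSV/NXFLOWREQF_EXT_PROTO_RSV in
 * flow_req_prepare_namespace() and are left for their owner to release.
 */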

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	/* unsanitized req; treat source and destination AF separately */
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%u"
	    " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport,
	    req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */

/*
 * Add a flow entry on behalf of its flow owner; called with the owner
 * bucket (writer) locked, as asserted below.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, the flow entry adds
	 * a retain count on the flow route (we'll always need to release
	 * the refcnt from flow_route_find), and the local address:port of
	 * the flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS ||
	    (fe->fe_flags & FLOWENTF_CHILD));
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	req->nfr_flowadv_idx = fe->fe_adv_idx;

	flow_req_dump("added ", req);

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(
			fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}

struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
{
	return flow_mgr_get_fob_at_idx(fm,
	           (pid % fm->fm_owner_buckets_cnt));
}

struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
{
	uint32_t i;
	struct flow_owner_bucket *fob;
	struct flow_owner *fo;
	struct flow_entry *fe;

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK_SPIN(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			fe = flow_entry_find_by_uuid(fo, uuid);
			if (fe != NULL) {
				FOB_LOCK_CONVERT(fob);
				FOB_UNLOCK(fob);
				return fe;
			}
		}
		FOB_UNLOCK(fob);
	}
	return NULL;
}
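
/*
 * flow_mgr_get_frb_by_addr() below mixes the destination address with
 * flow_seed using a Jenkins-style hash (note the golden-ratio constant
 * 0x9e3779b9); the final index is derived with a bitwise AND, which is
 * why fm_route_buckets_cnt must be a power of 2.
 */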

struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}

struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
{
	union {
		uuid_t   uuid __sk_aligned(8);
		uint64_t u64[2];
	} u;
	uint64_t key;

	_CASSERT(sizeof(u.uuid) == sizeof(u.u64));
	uuid_copy(u.uuid, fr_uuid);

	/* XOR fold the UUID down to 4 bytes */
	key = (u.u64[0] ^ u.u64[1]);
	key = ((key >> 32) ^ (key & 0xffffffff));

	/* add some offset to get more entropy */
	return flow_mgr_get_frib_at_idx(fm,
	           ((uint32_t)key % fm->fm_route_id_buckets_cnt));
}

static int
flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
{
	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
		if (fm->fm_flow_hash_masks[i] == mask) {
			os_atomic_add(&fm->fm_flow_hash_count[i], v, relaxed);
			return 0;
		}
	}
	SK_ERR("unknown hash mask 0x%x", mask);
	return ENOTSUP;
}

int
flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, 1);
}

int
flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, -1);
}
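
/*
 * The per-mask counts maintained above let flow_mgr_find_fe_by_key()
 * skip lookups for key classes with no active flows: it probes masks
 * in fm_flow_hash_masks[] order, from most specific (5-tuple) to least
 * specific (1-tuple IP flow), considering only non-zero counts.
 */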

#if SK_LOG
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk_as_string(key, dbgbuf, sizeof(dbgbuf)));
}

SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe 0x%llx \"%s\"",
		    SK_KVA(fe), fe_as_string(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */

struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;
	uint16_t saved_mask = key->fk_mask;

	__flow_mgr_find_fe_by_key_prelog(key);

	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
		size_t count = fm->fm_flow_hash_count[i];
		uint16_t mask = fm->fm_flow_hash_masks[i];
		if (count == 0 || mask == 0) {
			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
			    "[%d] mask=%08x count=%zu skipped",
			    i, mask, count);
			continue;
		}
		key->fk_mask = mask;
		hash = flow_key_hash(key);
		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
		    "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash,
		    SK_KVA(node));
		if (node != NULL) {
			fe = container_of(node, struct flow_entry, fe_cnode);
			/* a v4-only listener fe shouldn't get a v6 connection */
			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
			    fe->fe_key.fk_ipver == IPVERSION &&
			    key->fk_ipver == IPV6_VERSION)) {
				flow_entry_release(&fe);
				ASSERT(fe == NULL);
				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
				    "\tskip v4-only fe");
				continue;
			}
			break;
		}
	}

	key->fk_mask = saved_mask;

	__flow_mgr_find_fe_by_key_epilog(fe);

	return fe;
}

struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;

	hash = flow_key_hash(key);
	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
	if (node != NULL) {
		fe = container_of(node, struct flow_entry, fe_cnode);
		return fe;
	}

	/* listener flow conflicts are checked at netns reservation time */
	return fe;
}

void
flow_mgr_foreach_flow(struct flow_mgr *fm,
    void (^flow_handler)(struct flow_entry *fe))
{
	cuckoo_hashtable_foreach(fm->fm_flow_table,
	    ^(struct cuckoo_node *node, uint32_t hv) {
		#pragma unused(hv)
		struct flow_entry *fe;
		fe = container_of(node, struct flow_entry, fe_cnode);
		flow_handler(fe);

		if (fe->fe_flags & FLOWENTF_PARENT) {
		        struct flow_entry *child_fe;
		        lck_rw_lock_shared(&fe->fe_child_list_lock);
		        TAILQ_FOREACH(child_fe, &fe->fe_child_list, fe_child_link) {
		                flow_handler(child_fe);
			}
		        lck_rw_unlock_shared(&fe->fe_child_list_lock);
		}
	}
	    );
}
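
/*
 * Flow demux example (illustrative values): a parent UDP flow can carry
 * several child flows distinguished by the leading payload bytes.  For
 * instance, a child pattern of { fdp_offset = 0, fdp_len = 1,
 * fdp_value = { 0x80 }, fdp_mask = { 0x80 } } would match QUIC
 * long-header packets, whose first payload byte has the high bit set.
 * rx_flow_demux_match() below compares masked payload bytes against
 * each child flow's patterns.
 */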

bool
rx_flow_demux_match(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct udphdr *uh;
	uint8_t *pkt_buf;
	uint32_t bdlen, bdlim, bdoff, pkt_payload_len;
	uint8_t *demux_data;

	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(fe->fe_demux_pattern_count > 0);

	if (fe->fe_flags & (FLOWENTF_TORN_DOWN | FLOWENTF_NONVIABLE)) {
		return false;
	}

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return false;
	}

	uh = (struct udphdr *)pkt->pkt_flow_udp_hdr;
	if (__improbable(uh == NULL || pkt->pkt_flow_ulen == 0)) {
		return false;
	}

	int udp_payload_offset = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen + sizeof(*uh);

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = MIN(pkt_payload_len, pkt->pkt_length);
	pkt_payload_len -= udp_payload_offset;

	for (int index = 0; index < fe->fe_demux_pattern_count; index++) {
		struct flow_demux_pattern *demux_pattern = &fe->fe_demux_patterns[index].fdp_demux_pattern;
		ASSERT(demux_pattern->fdp_len > 0);

		if (pkt->pkt_flow_ulen >= demux_pattern->fdp_offset + demux_pattern->fdp_len) {
			if (__probable(pkt_payload_len >= demux_pattern->fdp_offset + demux_pattern->fdp_len)) {
				demux_data = (uint8_t *)(uh + 1) + demux_pattern->fdp_offset;
			} else {
				if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
					m_copydata(pkt->pkt_mbuf, udp_payload_offset + demux_pattern->fdp_offset,
					    demux_pattern->fdp_len, fe->fe_demux_pkt_data);
					demux_data = fe->fe_demux_pkt_data;
				} else {
					FSW_STATS_INC(FSW_STATS_RX_DEMUX_SHORT_ERR);
					return false;
				}
			}

			int result = -1;
			if (fe->fe_demux_patterns[index].fdp_memcmp_mask != NULL) {
				result = fe->fe_demux_patterns[index].fdp_memcmp_mask(demux_data,
				    demux_pattern->fdp_value, demux_pattern->fdp_mask);
			} else {
				result = sk_memcmp_mask(demux_data, demux_pattern->fdp_value,
				    demux_pattern->fdp_mask, demux_pattern->fdp_len);
			}

			if (result == 0) {
				return true;
			}
		}
	}

	return false;
}

struct flow_entry *
rx_lookup_child_flow(struct nx_flowswitch *fsw, struct flow_entry *parent_fe,
    struct __kern_packet *pkt)
{
	struct flow_entry *child_fe;

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return NULL;
	}

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);

	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (rx_flow_demux_match(fsw, child_fe, pkt)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}

struct flow_entry *
tx_lookup_child_flow(struct flow_entry *parent_fe, uuid_t flow_id)
{
	struct flow_entry *child_fe;

	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (_UUID_MATCH(flow_id, child_fe->fe_uuid)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}