/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/os_skywalk.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/ip6_var.h>
#include <netkey/key.h>
#include <netinet/udp.h>

#include <skywalk/nexus/flowswitch/flow/flow_var.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#include <net/net_api_stats.h>

#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);

static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

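/*
 * All flow managers are linked into a global red-black tree, ordered by
 * fm_uuid via fm_cmp(), so flow_mgr_find_lock() can resolve a UUID in
 * O(log n) while holding flow_mgr_lock as reader.
 */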
RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

static int __flow_mgr_inited = 0;

void
flow_mgr_init(void)
{
	ASSERT(!__flow_mgr_inited);

	RB_INIT(&flow_mgr_head);
	__flow_mgr_inited = 1;
}

void
flow_mgr_fini(void)
{
	if (__flow_mgr_inited) {
		VERIFY(RB_EMPTY(&flow_mgr_head));

		__flow_mgr_inited = 0;
	}
}

static int
__fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
{
	struct flow_entry *__single fe = __container_of(node, struct flow_entry,
	    fe_cnode);
	struct flow_key *__single key = key0;
	const struct flow_key *mask;

	/*
	 * This can probably be made more efficient by having "mask" be
	 * set by the original caller at the time the key is initialized,
	 * though that needs to be done carefully to ensure there is no
	 * mismatch between fk_mask value and "mask" itself.
	 */
	switch (key->fk_mask) {
	case FKMASK_5TUPLE:
		mask = &fk_mask_5tuple;
		break;
	case FKMASK_4TUPLE:
		mask = &fk_mask_4tuple;
		break;
	case FKMASK_3TUPLE:
		mask = &fk_mask_3tuple;
		break;
	case FKMASK_2TUPLE:
		mask = &fk_mask_2tuple;
		break;
	case FKMASK_IPFLOW3:
		mask = &fk_mask_ipflow3;
		break;
	case FKMASK_IPFLOW2:
		mask = &fk_mask_ipflow2;
		break;
	case FKMASK_IPFLOW1:
		mask = &fk_mask_ipflow1;
		break;
	default:
		return flow_key_cmp(&fe->fe_key, key);
	}

	return flow_key_cmp_mask(&fe->fe_key, key, mask);
}

static void
__fe_cuckoo_retain(struct cuckoo_node *node)
{
	struct flow_entry *__single fe = __container_of(node, struct flow_entry,
	    fe_cnode);
	flow_entry_retain(fe);
}

static void
__fe_cuckoo_release(struct cuckoo_node *node)
{
#pragma unused(node)
	struct flow_entry *__single fe =
	    __container_of(node, struct flow_entry, fe_cnode);
	flow_entry_release(&fe);
}

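/*
 * Allocate and initialize a flow manager: a cuckoo flow table sized for
 * fe_cnt entries, plus fob_cnt owner buckets, frb_cnt route buckets and
 * frib_cnt route-ID buckets.  frb_cnt must be a power of two, since
 * flow_mgr_get_frb_by_addr() masks its hash with (frb_cnt - 1).
 */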
struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure frb_cnt is a power of two (hash mask) */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL,
	    skmem_tag_fsw_flow_mgr);
	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz,
	    &fob_tot_sz);
	fm->fm_owner_bucket_tot_sz = fob_tot_sz;
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_owner_buckets_cnt = fob_cnt;
	fm->fm_owner_bucket_sz = fob_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz,
	    &frb_tot_sz);
	fm->fm_route_bucket_tot_sz = frb_tot_sz;
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_route_buckets_cnt = frb_cnt;
	fm->fm_route_bucket_sz = frb_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	fm->fm_route_id_bucket_tot_sz = frib_tot_sz;
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	fm->fm_route_id_buckets_cnt = frib_cnt;
	fm->fm_route_id_bucket_sz = frib_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}

void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	if (fm->fm_owner_buckets != NULL) {
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		fm->fm_owner_bucket_tot_sz = 0;
		fm->fm_owner_buckets_cnt = 0;
		fm->fm_owner_bucket_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	if (fm->fm_route_buckets != NULL) {
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		fm->fm_route_bucket_tot_sz = 0;
		fm->fm_route_buckets_cnt = 0;
		fm->fm_route_bucket_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	if (fm->fm_route_id_buckets != NULL) {
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		fm->fm_route_id_bucket_tot_sz = 0;
		fm->fm_route_id_buckets_cnt = 0;
		fm->fm_route_id_bucket_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}

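/*
 * Quiesce a flow manager: every owner bucket is locked and marked
 * FOBF_DEAD before any purge runs, so no new flow can slip into a
 * bucket that has already been swept.  Flow routes are drained under
 * the same all-locks-first discipline, taking every route bucket
 * write-lock before the route-ID bucket write-locks.
 */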
void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob %p [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb %p [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}

/*
 * Look up a flow manager by UUID.  Upon success, returns the flow
 * manager and acquires the global flow_mgr_lock as reader; the caller
 * must release it with a matching flow_mgr_unlock() call.
 */
struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)
{
	struct flow_mgr *fm, find;

	uuid_copy(find.fm_uuid, uuid);

	lck_rw_lock_shared(&flow_mgr_lock);

	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
	if (fm == NULL) {
		lck_rw_done(&flow_mgr_lock);
		return NULL;
	}

	/* caller is expected to call flow_mgr_unlock() when done */
	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
	return fm;
}

/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
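
/*
 * Typical find/unlock pairing (illustrative sketch only; "nx_uuid" is a
 * hypothetical caller-supplied identifier):
 *
 *	struct flow_mgr *fm = flow_mgr_find_lock(nx_uuid);
 *	if (fm != NULL) {
 *		... inspect fm while flow_mgr_lock is held as reader ...
 *		flow_mgr_unlock();
 *	}
 */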

static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}

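/*
 * KAME-derived stacks embed the scope ID in bytes 2-3 of link-local
 * addresses, e.g. fe80:0004::1 for fe80::1 on scope 4.  This helper
 * moves the embedded value into sin6_scope_id and zeroes it in the
 * address, restoring the canonical split representation.
 */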
static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
{
	struct in6_addr *in6;
	in6 = &addr->sin6_addr;
	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
		in6->s6_addr16[1] = 0;
	}
}

#if CONFIG_MACF
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;
	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* Custom IP protocol, which is treated as IP datagram type */
		socktype = SOCK_DGRAM;
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		    SA(&req->nfr_saddr.sa), socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		    SA(&req->nfr_daddr.sa), socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */

static bool
flow_req_needs_netns_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
}

static bool
flow_req_needs_protons_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
	    proto != IPPROTO_ESP && proto != IPPROTO_AH;
}

static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
}
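
/*
 * The three predicates above partition the IP protocol space for
 * reservations: TCP/UDP flows reserve their port in netns, ESP/AH
 * flows reserve the custom IPsec namespace, and every other protocol
 * reserves the protocol namespace (protons).
 */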

static void
flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
{
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;

	bzero(nfi, sizeof(struct ns_flow_info));

	nfi->nfi_ifp = req->nfr_ifp;

	nfi->nfi_laddr = *saddr;
	nfi->nfi_faddr = *daddr;

	nfi->nfi_protocol = req->nfr_ip_protocol;

	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));

	nfi->nfi_owner_pid = req->nfr_pid;
	if (req->nfr_epid != -1) {
		nfi->nfi_effective_pid = req->nfr_epid;
		proc_name(req->nfr_epid, nfi->nfi_effective_name,
		    sizeof(nfi->nfi_effective_name));
	} else {
		nfi->nfi_effective_pid = -1;
	}

	proc_name(req->nfr_pid, nfi->nfi_owner_name,
	    sizeof(nfi->nfi_owner_name));
}

static int
flow_req_prepare_namespace(struct nx_flow_req *req)
{
	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
	int err = 0;

	if (flow_req_needs_netns_reservation(req)) {
		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
			struct ns_flow_info nfi;
			netns_token __single ns_token;
			flow_set_port_info(&nfi, req);
			err = flow_namespace_create(saddr,
			    req->nfr_ip_protocol, &ns_token,
			    req->nfr_flags, &nfi);
			if (err != 0) {
				SK_ERR("netns for %s.%u failed",
				    sk_sa_ntop(SA(saddr), src_s,
				    sizeof(src_s)),
				    sk_sa_get_port(SA(saddr)));
				goto fail;
			}
			req->nfr_port_reservation = ns_token;
			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
		} else {
			/* Validate PID associated with provided reservation */
			struct ns_flow_info nfi = {};
			err = netns_get_flow_info(&req->nfr_port_reservation,
			    &nfi);
			/* flow info could be NULL for a socket flow */
			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
			    req->nfr_epid))) {
				SK_ERR("netns flow info mismatch, "
				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
				    req->nfr_pid, req->nfr_epid,
				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
		}
	}

	if (flow_req_needs_ipsec_reservation(req)) {
		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
		/*
		 * XXX -fbounds-safety: Currently, key_reserve_custom_ipsec
		 * does not return any size information for the first argument
		 * (ipsec_token). Even though it takes a void **, it looks like
		 * only struct secashead * is used.
		 */
		void *__single ipsec_token = NULL;
		ASSERT(req->nfr_ipsec_reservation == NULL);
		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
		    daddr, req->nfr_ip_protocol);
		if (err != 0) {
			SK_ERR("custom ipsec %u reserve %s failed",
			    req->nfr_ip_protocol,
			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
			goto fail;
		}
		req->nfr_ipsec_reservation = ipsec_token;
	}

	if (flow_req_needs_protons_reservation(req)) {
		struct protons_token *__single ns_token = NULL;
		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
			err = protons_reserve(&ns_token, req->nfr_pid,
			    req->nfr_epid, req->nfr_ip_protocol);
			if (err != 0) {
				SK_ERR("protocol %u namespace failed",
				    req->nfr_ip_protocol);
				goto fail;
			}
			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
			req->nfr_proto_reservation = ns_token;
		} else {
			/* Validate PID associated with provided reservation */
			if (!protons_token_has_matching_pid(
			    req->nfr_proto_reservation,
			    req->nfr_pid, req->nfr_epid)) {
				SK_ERR("protons token pid mismatch");
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	SK_ERR("namespace preparation failed (err %d)", err);
	return err;
}

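/*
 * Validate and canonicalize a flow request before the entry is created:
 * reconcile the address families, reject multicast, normalize embedded
 * IPv6 scope IDs, vet demux patterns, run MAC checks, resolve the flow
 * route (which can supply a default source address), and finally take
 * the namespace reservations for non-child flows.
 */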
static int
flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
    struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
    flow_route_resolve_fn_t fr_resolve, void *fr_arg)
{
	int err = 0;
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;

	sa_family_t saf, daf, xaf, af;

	saf = SA(saddr)->sa_family;
	daf = SA(daddr)->sa_family;
	xaf = saf ^ daf;
	if (xaf != 0 && xaf != saf && xaf != daf) {
		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
		return EINVAL;
	}
	af = (xaf == 0) ? saf : xaf;

	bool has_saddr = false, has_daddr = false;
	bool has_sport = false, has_dport = false;
	uint16_t sport, dport;
	uint8_t sa_len;
	switch (af) {
	case AF_INET:
		sa_len = sizeof(struct sockaddr_in);
		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
		sport = SIN(saddr)->sin_port;
		dport = SIN(daddr)->sin_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);

		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
			SK_ERR("sin_len invalid");
			err = EINVAL;
			goto fail;
		}
		if ((has_saddr &&
		    IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
		    (has_daddr &&
		    IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_dgram_total);
		}
		break;

	case AF_INET6:
		sa_len = sizeof(struct sockaddr_in6);
		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
		sport = SIN6(saddr)->sin6_port;
		dport = SIN6(daddr)->sin6_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);
		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
			SK_ERR("sin6_len invalid");
			err = EINVAL;
			goto fail;
		}
		/* clear embedded scope if link-local src */
		if (has_saddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
			if (!in6_embedded_scope &&
			    IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
				SIN6(saddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if (has_daddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
			if (!in6_embedded_scope &&
			    IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
				SIN6(daddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if ((has_saddr &&
		    IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
		    (has_daddr &&
		    IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_dgram_total);
		}
		break;

	default:
		SK_ERR("unknown address families saf %d daf %d", saf, daf);
		err = EINVAL;
		goto fail;
	}

	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;

	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
	    &req->nfr_saddr_gencnt))) {
		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
		SK_ERR("src address %s is not valid",
		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
		err = EADDRNOTAVAIL;
		goto fail;
	}

	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
	if (!is_tcp_udp) {
		if (has_sport || has_dport) {
			SK_ERR("non-zero port for IP flow");
			return EINVAL;
		}
	} else {
		/* dst:dport as connected, 0:0 as listener, but not partial */
		if (has_daddr != has_dport) {
			err = EINVAL;
			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
			goto fail;
		}
	}

	if (!has_daddr && !has_dport) {
		req->nfr_flags |= NXFLOWREQF_LISTENER;
	}

	if (req->nfr_transport_protocol == 0) {
		req->nfr_transport_protocol = req->nfr_ip_protocol;
	}

	bool is_child_flow = !uuid_is_null(req->nfr_parent_flow_uuid);
	if ((is_child_flow && req->nfr_flow_demux_count == 0) ||
	    (!is_child_flow && req->nfr_flow_demux_count > 0)) {
		err = EINVAL;
		SK_ERR("invalid flow demux count");
		goto fail;
	}

	if (req->nfr_flow_demux_count > 0) {
		if (req->nfr_ip_protocol != IPPROTO_UDP) {
			err = EINVAL;
			SK_ERR("invalid ip protocol(%u) for flow demux",
			    req->nfr_ip_protocol);
			goto fail;
		}

		for (int i = 0; i < req->nfr_flow_demux_count; i++) {
			if (req->nfr_flow_demux_patterns[i].fdp_len >
			    FLOW_DEMUX_MAX_LEN ||
			    req->nfr_flow_demux_patterns[i].fdp_len == 0) {
				err = EINVAL;
				SK_ERR("invalid flow demux pattern len %u",
				    req->nfr_flow_demux_patterns[i].fdp_len);
				goto fail;
			}
			if (req->nfr_flow_demux_patterns[i].fdp_offset +
			    req->nfr_flow_demux_patterns[i].fdp_len >
			    MAX_PKT_DEMUX_LIMIT) {
				err = EINVAL;
				SK_ERR("invalid demux offset plus "
				    "length(%u > %d)",
				    req->nfr_flow_demux_patterns[i].fdp_offset +
				    req->nfr_flow_demux_patterns[i].fdp_len,
				    MAX_PKT_DEMUX_LIMIT);
				goto fail;
			}
		}
	}

	req->nfr_ifp = ifp;

#if CONFIG_MACF
	err = flow_req_check_mac_allowed(req);
	if (err != 0) {
		SK_ERR("flow req failed MAC check");
		goto fail;
	}
#endif /* CONFIG_MACF */

	/* set up flow route and prepare saddr if needed */
	if (__probable(has_daddr || has_dport)) {
		struct flow_route *__single fr = NULL;
		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
		    fr_resolve, fr_arg, &fr);
		if (__improbable(err != 0)) {
			SK_ERR("flow route lookup failed");
			ASSERT(fr == NULL);
			goto fail;
		}
		ASSERT(fr != NULL);
		/* pick up the default source address from the flow route */
		if (!has_saddr) {
			*saddr = fr->fr_laddr;
			SIN(saddr)->sin_port = sport;
		}
		req->nfr_route = fr;
		fr = NULL;
	}

	/* child flows do not hold namespace references */
	if (__probable(uuid_is_null(req->nfr_parent_flow_uuid))) {
		err = flow_req_prepare_namespace(req);
		if (err != 0) {
			goto fail;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	if (req->nfr_route != NULL) {
		flow_route_release(req->nfr_route);
		req->nfr_route = NULL;
	}
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

static void
flow_req_cleanup(struct nx_flow_req *req)
{
	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
		netns_release(&req->nfr_port_reservation);
	}

	if (protons_token_is_valid(req->nfr_proto_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
		protons_release(&req->nfr_proto_reservation);
	}
	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) sk_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) sk_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) sk_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		(void) sk_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s.%u,dst=%s.%u,proto=%d "
	    "nx_port=%u,flags 0x%x", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, sport, dst_s, dport, protocol,
	    req->nfr_nx_port, req->nfr_flags);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */

/*
 * Add a flow entry for the request to the flowswitch.  Returns 0 on
 * success, with the flow ID and flow advisory index filled in, or an
 * errno with any partially-taken reservations released.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *__single fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve,
		    fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, flow entry adds a
	 * retain count on the flow route (we'll always need to release the
	 * refcnt from flow_route_find), and the local address:port of the
	 * flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS ||
	    (fe->fe_flags & FLOWENTF_CHILD));
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	req->nfr_flowadv_idx = fe->fe_adv_idx;
	req->nfr_flowid = fe->fe_flowid;

	flow_req_dump("added ", req);

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}

struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
{
	return flow_mgr_get_fob_at_idx(fm,
	    (pid % fm->fm_owner_buckets_cnt));
}

struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
{
	uint32_t i;
	struct flow_owner_bucket *fob;
	struct flow_owner *fo;
	struct flow_entry *fe;

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK_SPIN(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			fe = flow_entry_find_by_uuid(fo, uuid);
			if (fe != NULL) {
				FOB_LOCK_CONVERT(fob);
				FOB_UNLOCK(fob);
				return fe;
			}
		}
		FOB_UNLOCK(fob);
	}
	return NULL;
}

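/*
 * Hash the remote address into a route bucket index.  The 96-bit mix
 * below follows the mix() step of Bob Jenkins' lookup2-style hash,
 * seeded with flow_seed; the result is masked with (frb_cnt - 1),
 * which is why the route bucket count must be a power of two.
 */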
struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}

struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
{
	union {
		uuid_t uuid __sk_aligned(8);
		uint64_t u64[2];
	} u;
	uint64_t key;

	static_assert(sizeof(u.uuid) == sizeof(u.u64));
	uuid_copy(u.uuid, fr_uuid);

	/* XOR-fold the UUID down to 4 bytes */
	key = (u.u64[0] ^ u.u64[1]);
	key = ((key >> 32) ^ (key & 0xffffffff));

	/* map the folded key onto a route-ID bucket */
	return flow_mgr_get_frib_at_idx(fm,
	    ((uint32_t)key % fm->fm_route_id_buckets_cnt));
}

static int
flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
{
	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
		if (fm->fm_flow_hash_masks[i] == mask) {
			os_atomic_add(&fm->fm_flow_hash_count[i], v, relaxed);
			return 0;
		}
	}
	SK_ERR("unknown hash mask 0x%x", mask);
	return ENOTSUP;
}

int
flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, 1);
}

int
flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, -1);
}

#if SK_LOG
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk2str(key, dbgbuf, sizeof(dbgbuf)));
}

SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe \"%s\"",
		    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */

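/*
 * Look up a flow entry by key, trying each installed mask from the
 * most specific (5-tuple) to the least specific (ipflow1), in the
 * order laid out in fm_flow_hash_masks[].  Masks with no active flows
 * are skipped, so a fully-connected entry wins over a listener that
 * would also match the same packet.
 */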
struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *__single fe = NULL;
	uint32_t hash = 0;
	uint16_t saved_mask = key->fk_mask;

	__flow_mgr_find_fe_by_key_prelog(key);

	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
		size_t count = fm->fm_flow_hash_count[i];
		uint16_t mask = fm->fm_flow_hash_masks[i];
		if (count == 0 || mask == 0) {
			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
			    "[%d] mask=%08x count=%zu skipped",
			    i, mask, count);
			continue;
		}
		key->fk_mask = mask;
		hash = flow_key_hash(key);
		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table,
		    key, hash);
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
		    "[%d] mask=%08x hash %08x node %p", i, mask, hash,
		    SK_KVA(node));
		if (node != NULL) {
			fe = __container_of(node, struct flow_entry, fe_cnode);
			/* a v4-only listener fe shouldn't get a v6 connection */
			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
			    fe->fe_key.fk_ipver == IPVERSION &&
			    key->fk_ipver == IPV6_VERSION)) {
				flow_entry_release(&fe);
				ASSERT(fe == NULL);
				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
				    "\tskip v4-only fe");
				continue;
			}
			break;
		}
	}

	key->fk_mask = saved_mask;

	__flow_mgr_find_fe_by_key_epilog(fe);

	return fe;
}

struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *__single fe = NULL;
	uint32_t hash = 0;

	hash = flow_key_hash(key);
	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
	if (node != NULL) {
		fe = __container_of(node, struct flow_entry, fe_cnode);
		return fe;
	}

	/* conflicts among listener flows are caught at netns reservation */
	return fe;
}

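/*
 * Invoke flow_handler on every entry in the flow table; for parent
 * flows, the demux children are also walked while holding
 * fe_child_list_lock shared.
 */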
void
flow_mgr_foreach_flow(struct flow_mgr *fm,
    void (^flow_handler)(struct flow_entry *fe))
{
	cuckoo_hashtable_foreach(fm->fm_flow_table,
	    ^(struct cuckoo_node *node, uint32_t hv) {
#pragma unused(hv)
		struct flow_entry *__single fe;
		fe = __container_of(node, struct flow_entry, fe_cnode);
		flow_handler(fe);

		if (fe->fe_flags & FLOWENTF_PARENT) {
			struct flow_entry *child_fe;
			lck_rw_lock_shared(&fe->fe_child_list_lock);
			TAILQ_FOREACH(child_fe, &fe->fe_child_list,
			    fe_child_link) {
				flow_handler(child_fe);
			}
			lck_rw_unlock_shared(&fe->fe_child_list_lock);
		}
	}
	);
}

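/*
 * Check whether a UDP packet's payload matches one of a child flow's
 * demux patterns: bytes [fdp_offset, fdp_offset + fdp_len) of the
 * payload are compared against fdp_value under fdp_mask.  For
 * instance (illustrative only), a pattern masking the first payload
 * byte can split two application sub-protocols sharing one UDP flow;
 * the actual patterns arrive in the nx_flow_req.
 */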
bool
rx_flow_demux_match(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	struct udphdr *uh;
	uint8_t *pkt_buf;
	uint32_t bdlen, bdlim, bdoff, pkt_payload_len;
	uint8_t *demux_data;

	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(fe->fe_demux_pattern_count > 0);

	if (fe->fe_flags & (FLOWENTF_TORN_DOWN | FLOWENTF_NONVIABLE)) {
		return false;
	}

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return false;
	}

	uh = __unsafe_forge_bidi_indexable(struct udphdr *,
	    (struct udphdr *)pkt->pkt_flow_udp_hdr,
	    sizeof(*uh) + pkt->pkt_flow_ulen);
	if (__improbable(uh == NULL || pkt->pkt_flow_ulen == 0)) {
		return false;
	}

	int udp_payload_offset = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
	    sizeof(*uh);

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = MIN(pkt_payload_len, pkt->pkt_length);
	pkt_payload_len -= udp_payload_offset;

	for (int index = 0; index < fe->fe_demux_pattern_count; index++) {
		struct flow_demux_pattern *demux_pattern =
		    &fe->fe_demux_patterns[index].fdp_demux_pattern;
		ASSERT(demux_pattern->fdp_len > 0);

		if (pkt->pkt_flow_ulen >= demux_pattern->fdp_offset +
		    demux_pattern->fdp_len) {
			if (__probable(pkt_payload_len >=
			    demux_pattern->fdp_offset +
			    demux_pattern->fdp_len)) {
				demux_data = (uint8_t *)(uh + 1) +
				    demux_pattern->fdp_offset;
			} else {
				if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
					m_copydata(pkt->pkt_mbuf,
					    udp_payload_offset +
					    demux_pattern->fdp_offset,
					    demux_pattern->fdp_len,
					    fe->fe_demux_pkt_data);
					demux_data = fe->fe_demux_pkt_data;
				} else {
					FSW_STATS_INC(
					    FSW_STATS_RX_DEMUX_SHORT_ERR);
					return false;
				}
			}

			int result = -1;
			if (fe->fe_demux_patterns[index].fdp_memcmp_mask !=
			    NULL) {
				result = fe->fe_demux_patterns[index].
				    fdp_memcmp_mask(demux_data,
				    demux_pattern->fdp_value,
				    demux_pattern->fdp_mask);
			} else {
				result = sk_memcmp_mask(demux_data,
				    demux_pattern->fdp_value,
				    demux_pattern->fdp_mask,
				    demux_pattern->fdp_len);
			}

			if (result == 0) {
				return true;
			}
		}
	}

	return false;
}

struct flow_entry *
rx_lookup_child_flow(struct nx_flowswitch *fsw, struct flow_entry *parent_fe,
    struct __kern_packet *pkt)
{
	struct flow_entry *child_fe;

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return NULL;
	}

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);

	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (rx_flow_demux_match(fsw, child_fe, pkt)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}

struct flow_entry *
tx_lookup_child_flow(struct flow_entry *parent_fe, uuid_t flow_id)
{
	struct flow_entry *child_fe;

	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (_UUID_MATCH(flow_id, child_fe->fe_uuid)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}