xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/flow/flow_entry.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 
31 #include <dev/random/randomdev.h>
32 #include <net/flowhash.h>
33 #include <netkey/key.h>
34 #include <netinet/tcp_timer.h>
35 #include <netinet/tcp_var.h>
36 
37 #include <skywalk/nexus/flowswitch/fsw_var.h>
38 #include <skywalk/nexus/flowswitch/flow/flow_var.h>
39 #include <skywalk/nexus/netif/nx_netif.h>
40 #include <skywalk/namespace/flowidns.h>
41 
42 
43 struct flow_entry *fe_alloc(boolean_t);
44 static void fe_free(struct flow_entry *);
45 static int fe_id_cmp(const struct flow_entry *, const struct flow_entry *);
46 static void fe_stats_init(struct flow_entry *);
47 void fe_stats_update(struct flow_entry *);
48 
49 RB_GENERATE_PREV(flow_entry_id_tree, flow_entry, fe_id_link, fe_id_cmp);
50 
51 os_refgrp_decl(static, flow_entry_refgrp, "flow_entry", NULL);
52 
53 static SKMEM_TYPE_DEFINE(sk_fed_zone, struct flow_entry_dead);
54 
55 const struct flow_key fk_mask_2tuple
56 __sk_aligned(16) =
57 {
58 	.fk_mask = FKMASK_2TUPLE,
59 	.fk_ipver = 0,
60 	.fk_proto = 0xff,
61 	.fk_sport = 0xffff,
62 	.fk_dport = 0,
63 	.fk_src._addr64[0] = 0,
64 	.fk_src._addr64[1] = 0,
65 	.fk_dst._addr64[0] = 0,
66 	.fk_dst._addr64[1] = 0,
67 	.fk_pad[0] = 0,
68 };
69 
70 const struct flow_key fk_mask_3tuple
71 __sk_aligned(16) =
72 {
73 	.fk_mask = FKMASK_3TUPLE,
74 	.fk_ipver = 0xff,
75 	.fk_proto = 0xff,
76 	.fk_sport = 0xffff,
77 	.fk_dport = 0,
78 	.fk_src._addr64[0] = 0xffffffffffffffffULL,
79 	.fk_src._addr64[1] = 0xffffffffffffffffULL,
80 	.fk_dst._addr64[0] = 0,
81 	.fk_dst._addr64[1] = 0,
82 	.fk_pad[0] = 0,
83 };
84 
85 const struct flow_key fk_mask_4tuple
86 __sk_aligned(16) =
87 {
88 	.fk_mask = FKMASK_4TUPLE,
89 	.fk_ipver = 0xff,
90 	.fk_proto = 0xff,
91 	.fk_sport = 0xffff,
92 	.fk_dport = 0xffff,
93 	.fk_src._addr64[0] = 0xffffffffffffffffULL,
94 	.fk_src._addr64[1] = 0xffffffffffffffffULL,
95 	.fk_dst._addr64[0] = 0,
96 	.fk_dst._addr64[1] = 0,
97 	.fk_pad[0] = 0,
98 };
99 
100 const struct flow_key fk_mask_5tuple
101 __sk_aligned(16) =
102 {
103 	.fk_mask = FKMASK_5TUPLE,
104 	.fk_ipver = 0xff,
105 	.fk_proto = 0xff,
106 	.fk_sport = 0xffff,
107 	.fk_dport = 0xffff,
108 	.fk_src._addr64[0] = 0xffffffffffffffffULL,
109 	.fk_src._addr64[1] = 0xffffffffffffffffULL,
110 	.fk_dst._addr64[0] = 0xffffffffffffffffULL,
111 	.fk_dst._addr64[1] = 0xffffffffffffffffULL,
112 	.fk_pad[0] = 0,
113 };
114 
115 const struct flow_key fk_mask_ipflow1
116 __sk_aligned(16) =
117 {
118 	.fk_mask = FKMASK_IPFLOW1,
119 	.fk_ipver = 0,
120 	.fk_proto = 0xff,
121 	.fk_sport = 0,
122 	.fk_dport = 0,
123 	.fk_src._addr64[0] = 0,
124 	.fk_src._addr64[1] = 0,
125 	.fk_dst._addr64[0] = 0,
126 	.fk_dst._addr64[1] = 0,
127 	.fk_pad[0] = 0,
128 };
129 
130 const struct flow_key fk_mask_ipflow2
131 __sk_aligned(16) =
132 {
133 	.fk_mask = FKMASK_IPFLOW2,
134 	.fk_ipver = 0xff,
135 	.fk_proto = 0xff,
136 	.fk_sport = 0,
137 	.fk_dport = 0,
138 	.fk_src._addr64[0] = 0xffffffffffffffffULL,
139 	.fk_src._addr64[1] = 0xffffffffffffffffULL,
140 	.fk_dst._addr64[0] = 0,
141 	.fk_dst._addr64[1] = 0,
142 	.fk_pad[0] = 0,
143 };
144 
145 const struct flow_key fk_mask_ipflow3
146 __sk_aligned(16) =
147 {
148 	.fk_mask = FKMASK_IPFLOW3,
149 	.fk_ipver = 0xff,
150 	.fk_proto = 0xff,
151 	.fk_sport = 0,
152 	.fk_dport = 0,
153 	.fk_src._addr64[0] = 0xffffffffffffffffULL,
154 	.fk_src._addr64[1] = 0xffffffffffffffffULL,
155 	.fk_dst._addr64[0] = 0xffffffffffffffffULL,
156 	.fk_dst._addr64[1] = 0xffffffffffffffffULL,
157 	.fk_pad[0] = 0,
158 };
159 
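/*
 * Look up a flow_owner in the given bucket by pid, context and
 * low-latency attribute; the bucket lock must be held.
 */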
160 struct flow_owner *
161 flow_owner_find_by_pid(struct flow_owner_bucket *fob, pid_t pid, void *context,
162     bool low_latency)
163 {
164 	struct flow_owner find = { .fo_context = context, .fo_pid = pid,
165 		                   .fo_low_latency = low_latency};
166 
167 	ASSERT(low_latency == true || low_latency == false);
168 	FOB_LOCK_ASSERT_HELD(fob);
169 	return RB_FIND(flow_owner_tree, &fob->fob_owner_head, &find);
170 }
171 
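/*
 * Find a flow entry by UUID in the owner's id tree; returns the entry
 * with an additional reference held, or NULL if not found.
 */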
172 struct flow_entry *
173 flow_entry_find_by_uuid(struct flow_owner *fo, uuid_t uuid)
174 {
175 	struct flow_entry find, *fe = NULL;
176 	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
177 
178 	uuid_copy(find.fe_uuid, uuid);
179 	fe = RB_FIND(flow_entry_id_tree, &fo->fo_flow_entry_id_head, &find);
180 	if (fe != NULL) {
181 		flow_entry_retain(fe);
182 	}
183 
184 	return fe;
185 }
186 
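/*
 * Derive a flow ID for the entry by building a flowidns flow key from
 * the flow's key tuple and allocating an ID from the flowswitch domain.
 */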
187 static uint32_t
188 flow_entry_calc_flowid(struct flow_entry *fe)
189 {
190 	uint32_t flowid;
191 	struct flowidns_flow_key fk;
192 
193 	bzero(&fk, sizeof(fk));
194 	static_assert(sizeof(fe->fe_key.fk_src) == sizeof(fk.ffk_laddr));
195 	static_assert(sizeof(fe->fe_key.fk_dst) == sizeof(fk.ffk_raddr));
196 	bcopy(&fe->fe_key.fk_src, &fk.ffk_laddr, sizeof(fk.ffk_laddr));
197 	bcopy(&fe->fe_key.fk_dst, &fk.ffk_raddr, sizeof(fk.ffk_raddr));
198 
199 	fk.ffk_lport = fe->fe_key.fk_sport;
200 	fk.ffk_rport = fe->fe_key.fk_dport;
201 	fk.ffk_af = (fe->fe_key.fk_ipver == 4) ? AF_INET : AF_INET6;
202 	fk.ffk_proto = fe->fe_key.fk_proto;
203 
204 	flowidns_allocate_flowid(FLOWIDNS_DOMAIN_FLOWSWITCH, &fk, &flowid);
205 	return flowid;
206 }
207 
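/*
 * Link a child flow entry to its parent.  Fails if the parent is
 * nonviable or the child is already present; nonviable children found
 * during the scan are unlinked and released.  On success the child is
 * retained and appended to the parent's child list.
 */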
208 static bool
209 flow_entry_add_child(struct flow_entry *parent_fe, struct flow_entry *child_fe)
210 {
211 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
212 	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);
213 
214 	lck_rw_lock_exclusive(&parent_fe->fe_child_list_lock);
215 
216 	if (parent_fe->fe_flags & FLOWENTF_NONVIABLE) {
217 		SK_ERR("child entry add failed, parent fe \"%s\" non viable",
218 		    fe2str(parent_fe, dbgbuf, sizeof(dbgbuf)));
219 		lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
220 		return false;
221 	}
222 
223 	struct flow_entry *__single fe, *__single tfe;
224 	TAILQ_FOREACH_SAFE(fe, &parent_fe->fe_child_list, fe_child_link, tfe) {
225 		if (!fe_id_cmp(fe, child_fe)) {
226 			lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
227 			SK_ERR("child entry \"%s\" already exists",
228 			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
229 			return false;
230 		}
231 
232 		if (fe->fe_flags & FLOWENTF_NONVIABLE) {
233 			TAILQ_REMOVE(&parent_fe->fe_child_list, fe, fe_child_link);
234 			ASSERT(--parent_fe->fe_child_count >= 0);
235 			flow_entry_release(&fe);
236 		}
237 	}
238 
239 	flow_entry_retain(child_fe);
240 	TAILQ_INSERT_TAIL(&parent_fe->fe_child_list, child_fe, fe_child_link);
241 	ASSERT(++parent_fe->fe_child_count > 0);
242 
243 	lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
244 
245 	return true;
246 }
247 
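/*
 * Unlink and release all children of a parent flow entry, marking each
 * still-viable child as wanting nonviable; schedules the reaper thread
 * if any child was newly marked.
 */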
248 static void
249 flow_entry_remove_all_children(struct flow_entry *parent_fe, struct nx_flowswitch *fsw)
250 {
251 	bool sched_reaper_thread = false;
252 
253 	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);
254 
255 	lck_rw_lock_exclusive(&parent_fe->fe_child_list_lock);
256 
257 	struct flow_entry *__single fe, *__single tfe;
258 	TAILQ_FOREACH_SAFE(fe, &parent_fe->fe_child_list, fe_child_link, tfe) {
259 		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
260 			/*
261 			 * fsw_pending_nonviable is a hint for the reaper thread;
262 			 * since setting fe_want_nonviable and incrementing the
263 			 * fsw_pending_nonviable counter is not atomic, let the
264 			 * increment happen first, and have the thread that loses
265 			 * the CAS do the decrement.
266 			 */
267 			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
268 			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
269 				sched_reaper_thread = true;
270 			} else {
271 				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
272 			}
273 		}
274 
275 		TAILQ_REMOVE(&parent_fe->fe_child_list, fe, fe_child_link);
276 		ASSERT(--parent_fe->fe_child_count >= 0);
277 		flow_entry_release(&fe);
278 	}
279 
280 	lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
281 
282 	if (sched_reaper_thread) {
283 		fsw_reap_sched(fsw);
284 	}
285 }
286 
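/*
 * Copy the demux patterns from the flow request into the child flow
 * entry, selecting an optimized 16- or 32-byte masked-compare routine
 * when the pattern length allows.
 */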
287 static void
288 flow_entry_set_demux_patterns(struct flow_entry *fe, struct nx_flow_req *req)
289 {
290 	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
291 	ASSERT(req->nfr_flow_demux_count > 0);
292 
293 	fe->fe_demux_patterns = sk_alloc_type_array(struct kern_flow_demux_pattern, req->nfr_flow_demux_count,
294 	    Z_WAITOK | Z_NOFAIL, skmem_tag_flow_demux);
295 	fe->fe_demux_pattern_count = req->nfr_flow_demux_count;
296 
297 	for (int i = 0; i < req->nfr_flow_demux_count; i++) {
298 		bcopy(&req->nfr_flow_demux_patterns[i], &fe->fe_demux_patterns[i].fdp_demux_pattern,
299 		    sizeof(struct flow_demux_pattern));
300 
301 		fe->fe_demux_patterns[i].fdp_memcmp_mask = NULL;
302 		if (req->nfr_flow_demux_patterns[i].fdp_len == 16) {
303 			fe->fe_demux_patterns[i].fdp_memcmp_mask = sk_memcmp_mask_16B;
304 		} else if (req->nfr_flow_demux_patterns[i].fdp_len == 32) {
305 			fe->fe_demux_patterns[i].fdp_memcmp_mask = sk_memcmp_mask_32B;
306 		} else if (req->nfr_flow_demux_patterns[i].fdp_len > 32) {
307 			VERIFY(0);
308 		}
309 	}
310 }
311 
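/*
 * Translate the fields present in a flow key (per fk_mask) into an
 * inet traffic descriptor used for traffic-rule matching.
 */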
312 static int
313 convert_flowkey_to_inet_td(struct flow_key *key,
314     struct ifnet_traffic_descriptor_inet *td)
315 {
316 	if ((key->fk_mask & FKMASK_IPVER) != 0) {
317 		td->inet_ipver = key->fk_ipver;
318 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER;
319 	}
320 	if ((key->fk_mask & FKMASK_PROTO) != 0) {
321 		td->inet_proto = key->fk_proto;
322 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO;
323 	}
324 	if ((key->fk_mask & FKMASK_SRC) != 0) {
325 		if (td->inet_ipver == IPVERSION) {
326 			bcopy(&key->fk_src4, &td->inet_laddr.iia_v4addr,
327 			    sizeof(key->fk_src4));
328 		} else {
329 			bcopy(&key->fk_src6, &td->inet_laddr,
330 			    sizeof(key->fk_src6));
331 		}
332 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR;
333 	}
334 	if ((key->fk_mask & FKMASK_DST) != 0) {
335 		if (td->inet_ipver == IPVERSION) {
336 			bcopy(&key->fk_dst4, &td->inet_raddr.iia_v4addr,
337 			    sizeof(key->fk_dst4));
338 		} else {
339 			bcopy(&key->fk_dst6, &td->inet_raddr,
340 			    sizeof(key->fk_dst6));
341 		}
342 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR;
343 	}
344 	if ((key->fk_mask & FKMASK_SPORT) != 0) {
345 		td->inet_lport = key->fk_sport;
346 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT;
347 	}
348 	if ((key->fk_mask & FKMASK_DPORT) != 0) {
349 		td->inet_rport = key->fk_dport;
350 		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT;
351 	}
352 	td->inet_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET;
353 	td->inet_common.itd_len = sizeof(*td);
354 	td->inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND |
355 	    IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND;
356 	return 0;
357 }
358 
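/*
 * Re-evaluate the flow's queue set selection against the interface's
 * ethernet or inet traffic rules (optionally skipping the work when the
 * rule generation id is unchanged); any previously selected qset is
 * released, and fe_qset is left unset if no rule matches.
 */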
359 void
360 flow_qset_select_dynamic(struct nx_flowswitch *fsw, struct flow_entry *fe,
361     boolean_t skip_if_no_change)
362 {
363 	struct ifnet_traffic_descriptor_inet td;
364 	struct ifnet *ifp;
365 	uint64_t qset_id;
366 	struct nx_netif *nif;
367 	int err;
368 
369 	ifp = fsw->fsw_ifp;
370 	if (ifp->if_traffic_rule_genid == fe->fe_tr_genid && skip_if_no_change) {
371 		return;
372 	}
373 	if (fe->fe_qset != NULL) {
374 		nx_netif_qset_release(&fe->fe_qset);
375 		ASSERT(fe->fe_qset == NULL);
376 	}
377 
378 	/*
379 	 * Note: ifp can have either eth traffc rules or inet traffc rules
380 	 * and not both.
381 	 */
382 	if (ifp->if_eth_traffic_rule_count > 0) {
383 		if (!fe->fe_route) {
384 			return;
385 		}
386 
387 		struct flow_route *fr = fe->fe_route;
388 		struct rtentry *rt = (fr->fr_flags & FLOWRTF_GATEWAY)
389 		    ? fr->fr_rt_gw : fr->fr_rt_dst;
390 		if (!rt) {
391 			return;
392 		}
393 
394 		/* If tr_genid is stale in the rtentry, run traffic rules again */
395 		ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid);
396 		if (rt->rt_tr_genid != fe->fe_tr_genid) {
397 			rt_lookup_qset_id(rt, true);
398 		}
399 
400 		qset_id = rt->rt_qset_id;
401 	} else if (ifp->if_inet_traffic_rule_count > 0) {
402 		ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid);
403 
404 		err = convert_flowkey_to_inet_td(&fe->fe_key, &td);
405 		ASSERT(err == 0);
406 		err = nxctl_inet_traffic_rule_find_qset_id(ifp->if_xname, &td, &qset_id);
407 		if (err != 0) {
408 			DTRACE_SKYWALK3(qset__id__not__found,
409 			    struct nx_flowswitch *, fsw,
410 			    struct flow_entry *, fe,
411 			    struct ifnet_traffic_descriptor_inet *, &td);
412 			return;
413 		}
414 	} else {
415 		DTRACE_SKYWALK2(no__rules, struct nx_flowswitch *, fsw,
416 		    struct flow_entry *, fe);
417 		return;
418 	}
419 
420 	DTRACE_SKYWALK4(qset__id__found, struct nx_flowswitch *, fsw,
421 	    struct flow_entry *, fe, struct ifnet_traffic_descriptor_inet *,
422 	    &td, uint64_t, qset_id);
423 	nif = NX_NETIF_PRIVATE(fsw->fsw_dev_ch->ch_na->na_nx);
424 	ASSERT(fe->fe_qset == NULL);
425 	fe->fe_qset = nx_netif_find_qset(nif, qset_id);
426 }
427 
428 /* the writer lock must be held for memory management functions */
429 struct flow_entry *
430 flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr)
431 {
432 	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
433 	nexus_port_t nx_port = req->nfr_nx_port;
434 	struct flow_entry *__single fe = NULL;
435 	struct flow_entry *__single parent_fe = NULL;
436 	flowadv_idx_t fadv_idx = FLOWADV_IDX_NONE;
437 	struct nexus_adapter *dev_na;
438 	struct nx_flowswitch *fsw;
439 	struct nx_netif *nif;
440 	int err;
441 
442 	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
443 	ASSERT(nx_port != NEXUS_PORT_ANY);
444 	ASSERT(!fo->fo_nx_port_destroyed);
445 
446 	*perr = 0;
447 
448 	struct flow_key key __sk_aligned(16);
449 	err = flow_req2key(req, &key);
450 	if (__improbable(err != 0)) {
451 		SK_ERR("invalid request (err %d)", err);
452 		goto done;
453 	}
454 
455 	fsw = fo->fo_fsw;
456 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
457 	fe = flow_mgr_find_conflicting_fe(fm, &key);
458 	if (fe != NULL) {
459 		if ((fe->fe_flags & FLOWENTF_PARENT) &&
460 		    uuid_compare(fe->fe_uuid, req->nfr_parent_flow_uuid) == 0) {
461 			parent_fe = fe;
462 			fe = NULL;
463 		} else {
464 			SK_ERR("entry \"%s\" already exists",
465 			    fe2str(fe, dbgbuf, sizeof(dbgbuf)));
466 			/* don't return it */
467 			flow_entry_release(&fe);
468 			err = EEXIST;
469 			goto done;
470 		}
471 	} else if (!uuid_is_null(req->nfr_parent_flow_uuid)) {
472 		uuid_string_t uuid_str;
473 		sk_uuid_unparse(req->nfr_parent_flow_uuid, uuid_str);
474 		SK_ERR("parent entry \"%s\" does not exist", uuid_str);
475 		err = ENOENT;
476 		goto done;
477 	}
478 
479 	if ((req->nfr_flags & NXFLOWREQF_FLOWADV) &&
480 	    (flow_owner_flowadv_index_alloc(fo, &fadv_idx) != 0)) {
481 		SK_ERR("failed to alloc flowadv index for flow %s",
482 		    sk_uuid_unparse(req->nfr_flow_uuid, dbgbuf));
483 		err = ENOMEM;
484 		goto done;
485 	}
486 
487 	fe = fe_alloc(TRUE);
488 	if (__improbable(fe == NULL)) {
489 		err = ENOMEM;
490 		goto done;
491 	}
492 
493 	fe->fe_key = key;
494 	if (req->nfr_route != NULL) {
495 		fe->fe_laddr_gencnt = req->nfr_route->fr_laddr_gencnt;
496 	} else {
497 		fe->fe_laddr_gencnt = req->nfr_saddr_gencnt;
498 	}
499 
500 	if (__improbable(req->nfr_flags & NXFLOWREQF_LISTENER)) {
501 		/* mark this as listener mode */
502 		os_atomic_or(&fe->fe_flags, FLOWENTF_LISTENER, relaxed);
503 	} else {
504 		ASSERT((fe->fe_key.fk_ipver == IPVERSION &&
505 		    fe->fe_key.fk_src4.s_addr != INADDR_ANY) ||
506 		    (fe->fe_key.fk_ipver == IPV6_VERSION &&
507 		    !IN6_IS_ADDR_UNSPECIFIED(&fe->fe_key.fk_src6)));
508 
509 		/* mark this as connected mode */
510 		os_atomic_or(&fe->fe_flags, FLOWENTF_CONNECTED, relaxed);
511 	}
512 
513 	if (req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP) {
514 		fe->fe_flags |= FLOWENTF_NOWAKEFROMSLEEP;
515 	}
516 	if (req->nfr_flags & NXFLOWREQF_CONNECTION_IDLE) {
517 		fe->fe_flags |= FLOWENTF_CONNECTION_IDLE;
518 	}
519 	fe->fe_port_reservation = req->nfr_port_reservation;
520 	req->nfr_port_reservation = NULL;
521 	if (req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV) {
522 		fe->fe_flags |= FLOWENTF_EXTRL_PORT;
523 	}
524 	fe->fe_proto_reservation = req->nfr_proto_reservation;
525 	req->nfr_proto_reservation = NULL;
526 	if (req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV) {
527 		fe->fe_flags |= FLOWENTF_EXTRL_PROTO;
528 	}
529 	fe->fe_ipsec_reservation = req->nfr_ipsec_reservation;
530 	req->nfr_ipsec_reservation = NULL;
531 
532 	fe->fe_tx_process = dp_flow_tx_process;
533 	fe->fe_rx_process = dp_flow_rx_process;
534 
535 	dev_na = fsw->fsw_dev_ch->ch_na;
536 	nif = NX_NETIF_PRIVATE(dev_na->na_nx);
537 	if (NX_LLINK_PROV(nif->nif_nx) &&
538 	    (fe->fe_key.fk_mask & (FKMASK_IPVER | FKMASK_PROTO | FKMASK_DST)) ==
539 	    (FKMASK_IPVER | FKMASK_PROTO | FKMASK_DST)) {
540 		if (req->nfr_qset_id != 0) {
541 			fe->fe_qset_select = FE_QSET_SELECT_FIXED;
542 			fe->fe_qset_id = req->nfr_qset_id;
543 			fe->fe_qset = nx_netif_find_qset(nif, req->nfr_qset_id);
544 		} else {
545 			fe->fe_qset_select = FE_QSET_SELECT_DYNAMIC;
546 			fe->fe_qset_id = 0;
547 			flow_qset_select_dynamic(fsw, fe, FALSE);
548 		}
549 	} else {
550 		fe->fe_qset_select = FE_QSET_SELECT_NONE;
551 	}
552 	if (req->nfr_flags & NXFLOWREQF_LOW_LATENCY) {
553 		os_atomic_or(&fe->fe_flags, FLOWENTF_LOW_LATENCY, relaxed);
554 	}
555 
556 	fe->fe_transport_protocol = req->nfr_transport_protocol;
557 	if (NX_FSW_TCP_RX_AGG_ENABLED() &&
558 	    (fsw->fsw_nx->nx_prov->nxprov_params->nxp_max_frags > 1) &&
559 	    (fe->fe_key.fk_proto == IPPROTO_TCP) &&
560 	    (fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
561 		fe->fe_rx_process = flow_rx_agg_tcp;
562 	}
563 	uuid_copy(fe->fe_uuid, req->nfr_flow_uuid);
564 	if ((req->nfr_flags & NXFLOWREQF_LISTENER) == 0 &&
565 	    (req->nfr_flags & NXFLOWREQF_TRACK) != 0) {
566 		switch (req->nfr_ip_protocol) {
567 		case IPPROTO_TCP:
568 		case IPPROTO_UDP:
569 			os_atomic_or(&fe->fe_flags, FLOWENTF_TRACK, relaxed);
570 			break;
571 		default:
572 			break;
573 		}
574 	}
575 
576 	if (req->nfr_flags & NXFLOWREQF_QOS_MARKING) {
577 		os_atomic_or(&fe->fe_flags, FLOWENTF_QOS_MARKING, relaxed);
578 	}
579 
580 	if (req->nfr_flags & NXFLOWREQF_PARENT) {
581 		os_atomic_or(&fe->fe_flags, FLOWENTF_PARENT, relaxed);
582 		TAILQ_INIT(&fe->fe_child_list);
583 		lck_rw_init(&fe->fe_child_list_lock, &nexus_lock_group, &nexus_lock_attr);
584 	}
585 
586 	if (req->nfr_route != NULL) {
587 		fe->fe_route = req->nfr_route;
588 		req->nfr_route = NULL;
589 	}
590 
591 	fe->fe_nx_port = nx_port;
592 	fe->fe_adv_idx = fadv_idx;
593 
594 	if (req->nfr_inp_flowhash != 0) {
595 		/*
596 		 * BSD flow, use the inpcb flow hash value
597 		 */
598 		fe->fe_flowid = req->nfr_inp_flowhash;
599 		fe->fe_flags |= FLOWENTF_EXTRL_FLOWID;
600 	} else {
601 		fe->fe_flowid = flow_entry_calc_flowid(fe);
602 	}
603 
604 	if (fe->fe_adv_idx != FLOWADV_IDX_NONE && fo->fo_nx_port_na != NULL) {
605 		na_flowadv_entry_alloc(fo->fo_nx_port_na, fe->fe_uuid,
606 		    fe->fe_adv_idx, fe->fe_flowid);
607 	}
608 
609 	if (KPKT_VALID_SVC(req->nfr_svc_class)) {
610 		fe->fe_svc_class = (kern_packet_svc_class_t)req->nfr_svc_class;
611 	} else {
612 		fe->fe_svc_class = KPKT_SC_BE;
613 	}
614 
615 	uuid_copy(fe->fe_eproc_uuid, req->nfr_euuid);
616 	fe->fe_policy_id = req->nfr_policy_id;
617 	fe->fe_skip_policy_id = req->nfr_skip_policy_id;
618 
619 	*(struct nx_flowswitch **)(uintptr_t)&fe->fe_fsw = fsw;
620 	fe->fe_pid = fo->fo_pid;
621 	if (req->nfr_epid != -1 && req->nfr_epid != fo->fo_pid) {
622 		fe->fe_epid = req->nfr_epid;
623 		proc_name(fe->fe_epid, fe->fe_eproc_name,
624 		    sizeof(fe->fe_eproc_name));
625 	} else {
626 		fe->fe_epid = -1;
627 	}
628 
629 	(void) snprintf(fe->fe_proc_name, sizeof(fe->fe_proc_name), "%s",
630 	    fo->fo_name);
631 
632 	fe_stats_init(fe);
633 	flow_stats_retain(fe->fe_stats);
634 	req->nfr_flow_stats = fe->fe_stats;
635 	fe->fe_rx_worker_tid = 0;
636 
637 	if (req->nfr_flags & NXFLOWREQF_AOP_OFFLOAD) {
638 		os_atomic_or(&fe->fe_flags, FLOWENTF_AOP_OFFLOAD, relaxed);
639 		/*
640 		 * For TCP flows over AOP, we will always linger in the kernel.
641 		 * We do not do TCP Time-Wait in AOP. This is so that we can
642 		 * clean up resources from AOP quickly.
643 		 */
644 		if (req->nfr_ip_protocol == IPPROTO_TCP) {
645 			os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
646 			fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
647 		}
648 	}
649 
650 	err = flow_mgr_flow_hash_mask_add(fm, fe->fe_key.fk_mask);
651 	ASSERT(err == 0);
652 
653 	if (parent_fe != NULL) {
654 		os_atomic_or(&fe->fe_flags, FLOWENTF_CHILD, relaxed);
655 		flow_entry_set_demux_patterns(fe, req);
656 		fe->fe_demux_pkt_data = sk_alloc_data(FLOW_DEMUX_MAX_LEN, Z_WAITOK | Z_NOFAIL, skmem_tag_flow_demux);
657 		if (!flow_entry_add_child(parent_fe, fe)) {
658 			goto done;
659 		}
660 	} else {
661 		fe->fe_key_hash = flow_key_hash(&fe->fe_key);
662 		err = cuckoo_hashtable_add_with_hash(fm->fm_flow_table, &fe->fe_cnode,
663 		    fe->fe_key_hash);
664 		if (err != 0) {
665 			SK_ERR("flow table add failed (err %d)", err);
666 			flow_mgr_flow_hash_mask_del(fm, fe->fe_key.fk_mask);
667 			goto done;
668 		}
669 	}
670 
671 	RB_INSERT(flow_entry_id_tree, &fo->fo_flow_entry_id_head, fe);
672 	flow_entry_retain(fe);  /* one refcnt in id_tree */
673 
674 	SK_D("fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf)));
675 
676 done:
677 	if (parent_fe != NULL) {
678 		flow_entry_release(&parent_fe);
679 	}
680 	if (err != 0) {
681 		if (fadv_idx != FLOWADV_IDX_NONE) {
682 			flow_owner_flowadv_index_free(fo, fadv_idx);
683 		}
684 		if (fe != NULL) {
685 			fe->fe_flags |= (FLOWENTF_TORN_DOWN | FLOWENTF_DESTROYED);
686 			flow_entry_release(&fe);
687 		}
688 	}
689 	*perr = err;
690 	return fe;
691 }
692 
693 /*
694  * Add an RX flow steering rule for the given flow entry.
695  *
696  * This function provides a high-level interface for configuring RX flow steering
697  * rules based on flow entry characteristics. It converts the flow key to a traffic
698  * descriptor and configures the underlying netif for hardware steering.
699  *
700  * Parameters:
701  *   fsw         - The flowswitch instance
702  *   fe          - The flow entry to configure steering for
703  *
704  * Returns:
705  *   0           - Success
706  *   ENOTSUP     - RX flow steering not supported
707  *   EINVAL      - Invalid parameters
708  *   ENXIO       - Device unavailable
709  *   Other       - Provider-specific error codes
710  */
711 int
712 flow_entry_add_rx_steering_rule(struct nx_flowswitch *fsw, struct flow_entry *fe)
713 {
714 	struct ifnet_traffic_descriptor_inet td;
715 	struct kern_nexus *nx;
716 	int err = 0;
717 
718 	if (__improbable(fsw == NULL || fe == NULL)) {
719 		SK_ERR("Invalid parameters: fsw=%p, fe=%p", SK_KVA(fsw), SK_KVA(fe));
720 		return EINVAL;
721 	}
722 
723 	/* RX steering is only for AOP offload flows */
724 	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
725 
726 	/* Check if device channel is available */
727 	if (__improbable(fsw->fsw_dev_ch == NULL)) {
728 		SK_ERR("Device channel not available for RX flow steering");
729 		FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE);
730 		return ENXIO;
731 	}
732 
733 	nx = fsw->fsw_dev_ch->ch_na->na_nx;
734 	if (__improbable(nx == NULL)) {
735 		SK_ERR("Nexus not available for RX flow steering");
736 		FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE);
737 		return ENXIO;
738 	}
739 
740 	/* Convert flow key to traffic descriptor */
741 	memset(&td, 0, sizeof(struct ifnet_traffic_descriptor_inet));
742 	err = convert_flowkey_to_inet_td(&fe->fe_key, &td);
743 	if (__improbable(err != 0)) {
744 		SK_ERR("Failed to convert flow key to traffic descriptor (err %d)", err);
745 		FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE);
746 		return err;
747 	}
748 
749 	/* Always set inbound flag for RX flow steering */
750 	td.inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND;
751 
752 	SK_DF(SK_VERB_NETIF,
753 	    "Adding RX flow steering rule: fsw=%p, fe=%p, flow_id=%u",
754 	    SK_KVA(fsw), SK_KVA(fe), fe->fe_flowid);
755 
756 	/* Configure the RX flow steering rule */
757 	err = nx_netif_configure_rx_flow_steering(nx, fe->fe_flowid,
758 	    (struct ifnet_traffic_descriptor_common *)&td,
759 	    RX_FLOW_STEERING_ACTION_ADD_AOP);
760 
761 	if (__improbable(err != 0)) {
762 		FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE);
763 		SK_ERR("RX flow steering rule add failed (err %d)", err);
764 		DTRACE_SKYWALK4(rx__flow__steering__rule__add__failed,
765 		    struct nx_flowswitch *, fsw, struct flow_entry *, fe,
766 		    uint32_t, fe->fe_flowid, int, err);
767 	} else {
768 		FSW_STATS_INC(FSW_STATS_RX_FS_ADD_SUCCESS);
769 		SK_DF(SK_VERB_NETIF,
770 		    "Successfully added RX flow steering rule: flow_id=%u",
771 		    fe->fe_flowid);
772 		DTRACE_SKYWALK3(rx__flow__steering__rule__add__success,
773 		    struct nx_flowswitch *, fsw, struct flow_entry *, fe,
774 		    uint32_t, fe->fe_flowid);
775 
776 		/* Mark the flow entry as having RX steering configured */
777 		os_atomic_or(&fe->fe_flags, FLOWENTF_RX_STEERING, relaxed);
778 	}
779 
780 	return err;
781 }
782 
783 void
784 flow_entry_rx_steering_rule_cleanup(struct nx_flowswitch *fsw, struct flow_entry *fe)
785 {
786 	struct kern_nexus *nx = NULL;
787 	int err = 0;
788 
789 	ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD);
790 
791 	/*
792 	 * We check for fsw->fsw_dev_ch here because the flow could be cleaned
793 	 * up after the flow-switch has detached. The race between flow-switch
794 	 * detach and flow cleanup is prevented because flow_entry_teardown() is
795 	 * called either with a SK_LOCK() or with fsw_detach_barrier_add().
796 	 */
797 	if (fsw->fsw_dev_ch != NULL) {
798 		nx = fsw->fsw_dev_ch->ch_na->na_nx;
799 		err = nx_netif_configure_rx_flow_steering(nx,
800 		    fe->fe_flowid, NULL, RX_FLOW_STEERING_ACTION_REMOVE_AOP);
801 		if (err != 0) {
802 			FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_FAILURE);
803 			SK_ERR("rx flow steering cleanup failed (err %d)", err);
804 		} else {
805 			FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_SUCCESS);
806 		}
807 	} else {
808 		FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_SKIPPED);
809 	}
810 }
811 
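/*
 * Tear down a flow entry: resolve any pending nonviable request,
 * withdraw the local port namespace reservation, free the flow advisory
 * index, and mark all child flows nonviable.  May be called more than
 * once for the same entry.
 */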
812 void
813 flow_entry_teardown(struct flow_owner *fo, struct flow_entry *fe)
814 {
815 #if SK_LOG
816 	char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
817 	SK_DF(SK_VERB_FLOW, "fe \"%s\" [fo %p] "
818 	    "non_via %d withdrawn %d", fe2str(fe, dbgbuf, sizeof(dbgbuf)),
819 	    SK_KVA(fo), fe->fe_want_nonviable, fe->fe_want_withdraw);
820 #endif /* SK_LOG */
821 	struct nx_flowswitch *fsw = fo->fo_fsw;
822 
823 	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
824 
825 	ASSERT(!(fe->fe_flags & FLOWENTF_DESTROYED));
826 	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
827 	ASSERT(fsw != NULL);
828 
829 	if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 1, 0, acq_rel)) {
830 		ASSERT(fsw->fsw_pending_nonviable != 0);
831 		os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
832 		os_atomic_or(&fe->fe_flags, FLOWENTF_NONVIABLE, relaxed);
833 	}
834 
835 	/* always withdraw namespace during tear down */
836 	if (!(fe->fe_flags & FLOWENTF_EXTRL_PORT) &&
837 	    !(fe->fe_flags & FLOWENTF_WITHDRAWN)) {
838 		os_atomic_or(&fe->fe_flags, FLOWENTF_WITHDRAWN, relaxed);
839 		os_atomic_store(&fe->fe_want_withdraw, 0, release);
840 		/* local port is now inactive; not eligible for offload */
841 		flow_namespace_withdraw(&fe->fe_port_reservation);
842 	}
843 
844 	/* we may get here multiple times, so check */
845 	if (!(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
846 		os_atomic_or(&fe->fe_flags, FLOWENTF_TORN_DOWN, relaxed);
847 		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
848 			if (fo->fo_nx_port_na != NULL) {
849 				na_flowadv_entry_free(fo->fo_nx_port_na,
850 				    fe->fe_uuid, fe->fe_adv_idx, fe->fe_flowid);
851 			}
852 			flow_owner_flowadv_index_free(fo, fe->fe_adv_idx);
853 			fe->fe_adv_idx = FLOWADV_IDX_NONE;
854 		}
855 	}
856 	ASSERT(fe->fe_adv_idx == FLOWADV_IDX_NONE);
857 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
858 
859 	/* mark child flows as nonviable and remove them */
860 	if (fe->fe_flags & FLOWENTF_PARENT) {
861 		flow_entry_remove_all_children(fe, fsw);
862 	}
863 }
864 
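/*
 * Destroy a flow entry: tear it down, remove it from the flow table
 * (regular/parent flows only) and from the owner's id tree, then either
 * release it immediately, hand it to the linger list, or abort a QUIC
 * flow using the supplied close parameters.
 */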
865 void
866 flow_entry_destroy(struct flow_owner *fo, struct flow_entry *fe, bool nolinger,
867     void *close_params)
868 {
869 	struct flow_mgr *fm = fo->fo_fsw->fsw_flow_mgr;
870 	int err;
871 
872 	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
873 
874 	/*
875 	 * regular flow: one in flow_table, one in id_tree, one here
876 	 * child flow: one in id_tree, one here
877 	 */
878 	ASSERT(flow_entry_refcnt(fe) > 2 ||
879 	    ((fe->fe_flags & FLOWENTF_CHILD) && flow_entry_refcnt(fe) > 1));
880 
881 	flow_entry_teardown(fo, fe);
882 
883 	err = flow_mgr_flow_hash_mask_del(fm, fe->fe_key.fk_mask);
884 	ASSERT(err == 0);
885 
886 	/* only regular or parent flows have entries in flow_table */
887 	if (__probable(!(fe->fe_flags & FLOWENTF_CHILD))) {
888 		uint32_t hash;
889 		hash = flow_key_hash(&fe->fe_key);
890 		cuckoo_hashtable_del(fm->fm_flow_table, &fe->fe_cnode, hash);
891 	}
892 
893 	RB_REMOVE(flow_entry_id_tree, &fo->fo_flow_entry_id_head, fe);
894 	struct flow_entry *__single tfe = fe;
895 	flow_entry_release(&tfe);
896 
897 	ASSERT(!(fe->fe_flags & FLOWENTF_DESTROYED));
898 	os_atomic_or(&fe->fe_flags, FLOWENTF_DESTROYED, relaxed);
899 
900 	if (fe->fe_flags & FLOWENTF_RX_STEERING) {
901 		fsw_rxstrc_insert(fe);
902 	}
903 
904 	if (fe->fe_transport_protocol == IPPROTO_QUIC) {
905 		if (!nolinger && close_params != NULL) {
906 			/*
907 			 * -fbounds-safety: We can't annotate close_params (last
908 			 * argument of this function) with
909 			 * __sized_by(QUIC_STATELESS_RESET_TOKEN_SIZE) because
910 			 * there are callsites that pass NULL to this. Until
911 			 * __sized_by_or_null is available (rdar://75598414),
912 			 * forge this for now.
913 			 */
914 			uint8_t *quic_close_params = __unsafe_forge_bidi_indexable(uint8_t *,
915 			    close_params, QUIC_STATELESS_RESET_TOKEN_SIZE);
916 			flow_track_abort_quic(fe, quic_close_params);
917 		}
918 		flow_entry_release(&fe);
919 	} else if (nolinger || !(fe->fe_flags & FLOWENTF_WAIT_CLOSE)) {
920 		flow_entry_release(&fe);
921 	} else {
922 		fsw_linger_insert(fe);
923 	}
924 }
925 
926 uint32_t
927 flow_entry_refcnt(struct flow_entry *fe)
928 {
929 	return os_ref_get_count(&fe->fe_refcnt);
930 }
931 
932 void
933 flow_entry_retain(struct flow_entry *fe)
934 {
935 	os_ref_retain(&fe->fe_refcnt);
936 }
937 
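/*
 * Drop the caller's reference; the final release also frees the route,
 * qset, demux patterns and demux packet data before freeing the entry.
 */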
938 void
939 flow_entry_release(struct flow_entry **pfe)
940 {
941 	struct flow_entry *fe = *pfe;
942 	ASSERT(fe != NULL);
943 	*pfe = NULL;    /* caller loses its reference */
944 
945 	if (__improbable(os_ref_release(&fe->fe_refcnt) == 0)) {
946 		fe->fe_nx_port = NEXUS_PORT_ANY;
947 		if (fe->fe_route != NULL) {
948 			flow_route_release(fe->fe_route);
949 			fe->fe_route = NULL;
950 		}
951 		if (fe->fe_qset != NULL) {
952 			nx_netif_qset_release(&fe->fe_qset);
953 			ASSERT(fe->fe_qset == NULL);
954 		}
955 		if (fe->fe_demux_patterns != NULL) {
956 			sk_free_type_array_counted_by(struct kern_flow_demux_pattern,
957 			    fe->fe_demux_pattern_count, fe->fe_demux_patterns);
958 			fe->fe_demux_patterns = NULL;
959 			fe->fe_demux_pattern_count = 0;
960 		}
961 		if (fe->fe_demux_pkt_data != NULL) {
962 			size_t demux_pkt_data_size = FLOW_DEMUX_MAX_LEN;
963 			sk_free_data_sized_by(fe->fe_demux_pkt_data, demux_pkt_data_size);
964 			fe->fe_demux_pkt_data = NULL;
965 		}
966 		fe_free(fe);
967 	}
968 }
969 
970 struct flow_entry_dead *
971 flow_entry_dead_alloc(zalloc_flags_t how)
972 {
973 	struct flow_entry_dead *fed;
974 
975 	fed = zalloc_flags(sk_fed_zone, how | Z_ZERO);
976 	if (fed != NULL) {
977 		SK_DF(SK_VERB_MEM, "fed %p ALLOC", SK_KVA(fed));
978 	}
979 	return fed;
980 }
981 
982 void
983 flow_entry_dead_free(struct flow_entry_dead *fed)
984 {
985 	SK_DF(SK_VERB_MEM, "fed %p FREE", SK_KVA(fed));
986 	zfree(sk_fed_zone, fed);
987 }
988 
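/*
 * Initialize the flow's sk_stats_flow record from the flow entry and
 * its owning flowswitch, then sync the dynamic state via fe_stats_update().
 */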
989 static void
990 fe_stats_init(struct flow_entry *fe)
991 {
992 	struct nx_flowswitch *fsw = fe->fe_fsw;
993 	struct sk_stats_flow *sf = &fe->fe_stats->fs_stats;
994 
995 	ASSERT(fe->fe_stats != NULL);
996 	ASSERT(os_ref_get_count(&fe->fe_stats->fs_refcnt) >= 1);
997 
998 	bzero(sf, sizeof(*sf));
999 	uuid_copy(sf->sf_nx_uuid, fsw->fsw_nx->nx_uuid);
1000 	uuid_copy(sf->sf_uuid, fe->fe_uuid);
1001 	(void) strbufcpy(sf->sf_if_name, fsw->fsw_flow_mgr->fm_name);
1002 	sf->sf_if_index = fsw->fsw_ifp->if_index;
1003 	sf->sf_pid = fe->fe_pid;
1004 	sf->sf_epid = fe->fe_epid;
1005 	(void) snprintf(sf->sf_proc_name, sizeof(sf->sf_proc_name), "%s",
1006 	    fe->fe_proc_name);
1007 	(void) snprintf(sf->sf_eproc_name, sizeof(sf->sf_eproc_name), "%s",
1008 	    fe->fe_eproc_name);
1009 
1010 	sf->sf_nx_port = fe->fe_nx_port;
1011 	sf->sf_key = fe->fe_key;
1012 	sf->sf_protocol = fe->fe_transport_protocol;
1013 	sf->sf_svc_class = (packet_svc_class_t)fe->fe_svc_class;
1014 	sf->sf_adv_idx = fe->fe_adv_idx;
1015 
1016 	if (fe->fe_flags & FLOWENTF_TRACK) {
1017 		sf->sf_flags |= SFLOWF_TRACK;
1018 	}
1019 	if (fe->fe_flags & FLOWENTF_LISTENER) {
1020 		sf->sf_flags |= SFLOWF_LISTENER;
1021 	}
1022 	if (fe->fe_route != NULL && fe->fe_route->fr_flags & FLOWRTF_ONLINK) {
1023 		sf->sf_flags |= SFLOWF_ONLINK;
1024 	}
1025 
1026 	fe_stats_update(fe);
1027 }
1028 
1029 void
1030 fe_stats_update(struct flow_entry *fe)
1031 {
1032 	struct sk_stats_flow *sf = &fe->fe_stats->fs_stats;
1033 
1034 	ASSERT(fe->fe_stats != NULL);
1035 	ASSERT(os_ref_get_count(&fe->fe_stats->fs_refcnt) >= 1);
1036 
1037 	if (fe->fe_flags & FLOWENTF_CONNECTED) {
1038 		sf->sf_flags |= SFLOWF_CONNECTED;
1039 	}
1040 	if (fe->fe_flags & FLOWENTF_QOS_MARKING) {
1041 		sf->sf_flags |= SFLOWF_QOS_MARKING;
1042 	}
1043 	if (fe->fe_flags & FLOWENTF_WAIT_CLOSE) {
1044 		sf->sf_flags |= SFLOWF_WAIT_CLOSE;
1045 	}
1046 	if (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) {
1047 		sf->sf_flags |= SFLOWF_CLOSE_NOTIFY;
1048 	}
1049 	if (fe->fe_flags & FLOWENTF_ABORTED) {
1050 		sf->sf_flags |= SFLOWF_ABORTED;
1051 	}
1052 	if (fe->fe_flags & FLOWENTF_NONVIABLE) {
1053 		sf->sf_flags |= SFLOWF_NONVIABLE;
1054 	}
1055 	if (fe->fe_flags & FLOWENTF_WITHDRAWN) {
1056 		sf->sf_flags |= SFLOWF_WITHDRAWN;
1057 	}
1058 	if (fe->fe_flags & FLOWENTF_TORN_DOWN) {
1059 		sf->sf_flags |= SFLOWF_TORN_DOWN;
1060 	}
1061 	if (fe->fe_flags & FLOWENTF_DESTROYED) {
1062 		sf->sf_flags |= SFLOWF_DESTROYED;
1063 	}
1064 	if (fe->fe_flags & FLOWENTF_LINGERING) {
1065 		sf->sf_flags |= SFLOWF_LINGERING;
1066 	}
1067 	if (fe->fe_flags & FLOWENTF_LOW_LATENCY) {
1068 		sf->sf_flags |= SFLOWF_LOW_LATENCY;
1069 	}
1070 	if (fe->fe_flags & FLOWENTF_PARENT) {
1071 		sf->sf_flags |= SFLOWF_PARENT;
1072 	}
1073 	if (fe->fe_flags & FLOWENTF_CHILD) {
1074 		sf->sf_flags |= SFLOWF_CHILD;
1075 	}
1076 	if (fe->fe_flags & FLOWENTF_NOWAKEFROMSLEEP) {
1077 		sf->sf_flags |= SFLOWF_NOWAKEFROMSLEEP;
1078 	} else {
1079 		sf->sf_flags &= ~SFLOWF_NOWAKEFROMSLEEP;
1080 	}
1081 	if (fe->fe_flags & FLOWENTF_AOP_OFFLOAD) {
1082 		sf->sf_flags |= SFLOWF_AOP_OFFLOAD;
1083 	}
1084 	if (fe->fe_flags & FLOWENTF_CONNECTION_IDLE) {
1085 		sf->sf_flags |= SFLOWF_CONNECTION_IDLE;
1086 	} else {
1087 		sf->sf_flags &= ~SFLOWF_CONNECTION_IDLE;
1088 	}
1089 
1090 	sf->sf_bucket_idx = SFLOW_BUCKET_NONE;
1091 
1092 	/* AOP offload flows are updated in NECP via shared memory with AOP */
1093 	if (!(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) {
1094 		sf->sf_ltrack.sft_state = fe->fe_ltrack.fse_state;
1095 		sf->sf_ltrack.sft_seq = fe->fe_ltrack.fse_seqlo;
1096 		sf->sf_ltrack.sft_max_win = fe->fe_ltrack.fse_max_win;
1097 		sf->sf_ltrack.sft_wscale = fe->fe_ltrack.fse_wscale;
1098 		sf->sf_rtrack.sft_state = fe->fe_rtrack.fse_state;
1099 		sf->sf_rtrack.sft_seq = fe->fe_rtrack.fse_seqlo;
1100 		sf->sf_rtrack.sft_max_win = fe->fe_rtrack.fse_max_win;
1101 	}
1102 }
1103 
1104 void
1105 flow_entry_stats_get(struct flow_entry *fe, struct sk_stats_flow *sf)
1106 {
1107 	static_assert(sizeof(fe->fe_stats->fs_stats) == sizeof(*sf));
1108 
1109 	fe_stats_update(fe);
1110 	bcopy(&fe->fe_stats->fs_stats, sf, sizeof(*sf));
1111 }
1112 
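/*
 * Allocate a flow entry from sk_fe_cache (16-byte aligned), zero it,
 * allocate its stats block, and initialize its refcount, rx packet
 * queue lock and packet queues.
 */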
1113 struct flow_entry *
1114 fe_alloc(boolean_t can_block)
1115 {
1116 	struct flow_entry *fe;
1117 
1118 	static_assert((offsetof(struct flow_entry, fe_key) % 16) == 0);
1119 
1120 	fe = skmem_cache_alloc(sk_fe_cache,
1121 	    can_block ? SKMEM_SLEEP : SKMEM_NOSLEEP);
1122 	if (fe == NULL) {
1123 		return NULL;
1124 	}
1125 
1126 	/*
1127 	 * fe_key is 16-byte aligned, which requires fe to begin on
1128 	 * a 16-byte boundary as well.  This alignment is specified
1129 	 * at sk_fe_cache creation time and we assert it here.
1130 	 */
1131 	ASSERT(IS_P2ALIGNED(fe, 16));
1132 	bzero(fe, sk_fe_size);
1133 
1134 	fe->fe_stats = flow_stats_alloc(can_block);
1135 	if (fe->fe_stats == NULL) {
1136 		skmem_cache_free(sk_fe_cache, fe);
1137 		return NULL;
1138 	}
1139 
1140 	SK_DF(SK_VERB_MEM, "fe %p ALLOC", SK_KVA(fe));
1141 
1142 	os_ref_init(&fe->fe_refcnt, &flow_entry_refgrp);
1143 
1144 	lck_mtx_init(&fe->fe_rx_pktq_lock, &nexus_lock_group, &nexus_lock_attr);
1145 	KPKTQ_INIT(&fe->fe_rx_pktq);
1146 	KPKTQ_INIT(&fe->fe_tx_pktq);
1147 
1148 	return fe;
1149 }
1150 
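/*
 * Final teardown of a flow entry: release the stats block, the
 * port/protocol/ipsec reservations and the flow ID where applicable,
 * then return the entry to sk_fe_cache.
 */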
1151 static void
1152 fe_free(struct flow_entry *fe)
1153 {
1154 	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
1155 	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
1156 	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
1157 	ASSERT(fe->fe_route == NULL);
1158 
1159 	ASSERT(fe->fe_stats != NULL);
1160 	flow_stats_release(fe->fe_stats);
1161 	fe->fe_stats = NULL;
1162 
1163 	/* only the very last reference to the flow releases the namespace reservation */
1164 	if (!(fe->fe_flags & FLOWENTF_EXTRL_PORT) &&
1165 	    NETNS_TOKEN_VALID(&fe->fe_port_reservation)) {
1166 		flow_namespace_destroy(&fe->fe_port_reservation);
1167 		ASSERT(!NETNS_TOKEN_VALID(&fe->fe_port_reservation));
1168 	}
1169 	fe->fe_port_reservation = NULL;
1170 
1171 	if (!(fe->fe_flags & FLOWENTF_EXTRL_PROTO) &&
1172 	    protons_token_is_valid(fe->fe_proto_reservation)) {
1173 		protons_release(&fe->fe_proto_reservation);
1174 	}
1175 	fe->fe_proto_reservation = NULL;
1176 
1177 	if (key_custom_ipsec_token_is_valid(fe->fe_ipsec_reservation)) {
1178 		key_release_custom_ipsec(&fe->fe_ipsec_reservation);
1179 	}
1180 	fe->fe_ipsec_reservation = NULL;
1181 
1182 	if (!(fe->fe_flags & FLOWENTF_EXTRL_FLOWID) && (fe->fe_flowid != 0)) {
1183 		flowidns_release_flowid(fe->fe_flowid);
1184 		fe->fe_flowid = 0;
1185 	}
1186 
1187 	skmem_cache_free(sk_fe_cache, fe);
1188 }
1189 
1190 static __inline__ int
1191 fe_id_cmp(const struct flow_entry *a, const struct flow_entry *b)
1192 {
1193 	return uuid_compare(a->fe_uuid, b->fe_uuid);
1194 }
1195 
1196 #if SK_LOG
1197 SK_NO_INLINE_ATTRIBUTE
1198 char *
1199 fk2str(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz)
1200 {
1201 	int af;
1202 	char src_s[MAX_IPv6_STR_LEN];
1203 	char dst_s[MAX_IPv6_STR_LEN];
1204 
1205 	af = fk->fk_ipver == 4 ? AF_INET : AF_INET6;
1206 
1207 	(void) sk_ntop(af, &fk->fk_src, src_s, sizeof(src_s));
1208 	(void) sk_ntop(af, &fk->fk_dst, dst_s, sizeof(dst_s));
1209 	(void) snprintf(dst, dsz,
1210 	    "ipver=%u,src=%s.%u,dst=%s.%u,proto=0x%02u mask=0x%08x,hash=0x%08x",
1211 	    fk->fk_ipver, src_s, ntohs(fk->fk_sport), dst_s, ntohs(fk->fk_dport),
1212 	    fk->fk_proto, fk->fk_mask, flow_key_hash(fk));
1213 
1214 	return dst;
1215 }
1216 
1217 SK_NO_INLINE_ATTRIBUTE
1218 char *
1219 fe2str(const struct flow_entry *fe, char *__counted_by(dsz)dst, size_t dsz)
1220 {
1221 	char keybuf[FLOWKEY_DBGBUF_SIZE]; /* just for debug message */
1222 	uuid_string_t uuidstr;
1223 
1224 	fk2str(&fe->fe_key, keybuf, sizeof(keybuf));
1225 
1226 	(void) sk_snprintf(dst, dsz, "%p proc %s(%d)%s nx_port %d flow_uuid %s"
1227 	    " flags 0x%b %s tp_proto=0x%02u", SK_KVA(fe), fe->fe_proc_name,
1228 	    fe->fe_pid, fe->fe_eproc_name, (int)fe->fe_nx_port,
1229 	    sk_uuid_unparse(fe->fe_uuid, uuidstr), fe->fe_flags, FLOWENTF_BITS,
1230 	    keybuf, fe->fe_transport_protocol);
1231 
1232 	return dst;
1233 }
1234 #endif /* SK_LOG */
1235