/* xnu-10063.121.3/bsd/skywalk/nexus/flowswitch/fsw.c */
/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/pktsched/pktsched_netem.h>
#include <sys/eventhandler.h>

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

/*
 * Configures the flowswitch to use a user packet pool with
 * dual-sized buffers.
 * A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

uint32_t fsw_chain_enqueue = 1;
static int __nx_fsw_inited = 0;
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);

static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);

#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))

static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);

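/*
 * Attach a virtual port adapter to the flowswitch on behalf of a channel
 * open request.  Reuses an existing adapter on the nexus port if one is
 * present; otherwise creates a new one and binds it to the port.
 */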
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
#pragma unused(ch)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	SK_LOG_VAR(char *cr_name = chr->cr_name);
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, p, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		if (err != 0) {
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}

static int
fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
#pragma unused(fsw)
	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;

	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
		return EINVAL;
	}

	/* it's a netif below */
	return 0;
}

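/*
 * Handle NXCFG_CMD_FLOW_ADD: validate the request, then add a flow entry
 * on behalf of the requesting process (user or kernel).
 */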
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special port shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	if (req->nfr_epid == -1) {
		req->nfr_epid = proc_pid(p);
	}

	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		// user space doesn't need these flow stats
		flow_stats_release(req->nfr_flow_stats);
	}
	if (p != kernproc) {
		nx_flow_req_externalize(req);
	}

	return error;
}

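/* Handle NXCFG_CMD_FLOW_DEL: tear down the flow identified by the request. */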
static int
fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	nx_flow_req_internalize(req);
	req->nfr_pid = proc_pid(p);
	err = fsw_flow_del(fsw, req, TRUE, NULL);

	nx_flow_req_externalize(req);
	return err;
}

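/* Handle NXCFG_CMD_FLOW_CONFIG: apply configuration to an existing flow. */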
static int
fsw_ctl_flow_config(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	nx_flow_req_internalize(req);
	req->nfr_pid = proc_pid(p);
	err = fsw_flow_config(fsw, req);

	nx_flow_req_externalize(req);
	return err;
}

#if (DEVELOPMENT || DEBUG)
static int
fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct nx_flowswitch *fsw = arg1;
	uint32_t nthreads;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
	if (error == 0 && changed != 0) {
		error = fsw_rps_set_nthreads(fsw, nthreads);
	}
	return error;
}
#endif /* !DEVELOPMENT && !DEBUG */

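/*
 * Report the effective TSO MTUs for the interface: the hardware values
 * in FSW_TSO_MODE_HW, the software GSO MTU in FSW_TSO_MODE_SW, and zero
 * otherwise (TSO disabled, or non-macOS targets).
 */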
void
fsw_get_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu, uint32_t *tso_v6_mtu)
{
#pragma unused(ifp)
	*tso_v4_mtu = 0;
	*tso_v6_mtu = 0;

#ifdef XNU_TARGET_OS_OSX
	struct nx_flowswitch *fsw;

	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		return;
	}
	switch (fsw->fsw_tso_mode) {
	case FSW_TSO_MODE_HW: {
		ASSERT(ifp->if_tso_v4_mtu != 0 || ifp->if_tso_v6_mtu != 0);
		*tso_v4_mtu = ifp->if_tso_v4_mtu;
		*tso_v6_mtu = ifp->if_tso_v6_mtu;
		break;
	}
	case FSW_TSO_MODE_SW: {
		ASSERT(fsw->fsw_tso_sw_mtu != 0);
		*tso_v4_mtu = fsw->fsw_tso_sw_mtu;
		*tso_v6_mtu = fsw->fsw_tso_sw_mtu;
		break;
	}
	default:
		break;
	}
#endif /* XNU_TARGET_OS_OSX */
}

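/*
 * Select the TSO mode for this flowswitch: FSW_TSO_MODE_HW when the
 * native netif advertises TSO (IFNET_TSO_IPV4/IPV6), FSW_TSO_MODE_SW
 * when the large buffer can hold sk_fsw_gso_mtu, and none otherwise.
 */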
static void
fsw_tso_setup(struct nx_flowswitch *fsw)
{
	fsw->fsw_tso_mode = FSW_TSO_MODE_NONE;
#ifdef XNU_TARGET_OS_OSX
	struct ifnet *ifp = fsw->fsw_ifp;
	if (!SKYWALK_CAPABLE(ifp) || !SKYWALK_NATIVE(ifp)) {
		DTRACE_SKYWALK2(tso__no__support, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	uint32_t large_buf_size = NX_PROV_PARAMS(fsw->fsw_nx)->nxp_large_buf_size;

	if (large_buf_size == 0) {
		DTRACE_SKYWALK2(no__large__buf, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	/*
	 * Unlike _dlil_adjust_large_buf_size_for_tso(), we check nif_hwassist
	 * here for the original capability flags, because
	 * nx_netif_host_adjust_if_capabilities() has already been called.
	 */
	if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0 && ifp->if_tso_v4_mtu != 0) ||
	    ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0 && ifp->if_tso_v6_mtu != 0)) {
		ASSERT(large_buf_size <= ifp->if_tso_v4_mtu ||
		    large_buf_size <= ifp->if_tso_v6_mtu);
		fsw->fsw_tso_mode = FSW_TSO_MODE_HW;
	} else {
		if (sk_fsw_gso_mtu != 0 && large_buf_size >= sk_fsw_gso_mtu) {
			fsw->fsw_tso_mode = FSW_TSO_MODE_SW;
			fsw->fsw_tso_sw_mtu = sk_fsw_gso_mtu;
		}
	}
	DTRACE_SKYWALK3(tso__mode, struct nx_flowswitch *, fsw,
	    fsw_tso_mode_t, fsw->fsw_tso_mode, uint32_t, large_buf_size);
#endif /* XNU_TARGET_OS_OSX */
}

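/*
 * Bind the flowswitch to the ifnet backing the netif host adapter:
 * set up the family-specific framing/resolver callbacks, the packet
 * copy routines, classq and TSO state, and register the netagent.
 */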
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non-TXSTART interfaces.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single-threaded writes to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;
	fsw_tso_setup(fsw);

	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		os_atomic_andnot(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}

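/*
 * Undo fsw_setup_ifp(): unregister the netagent, destroy the fragment
 * manager and classq state, mark the nexus NXF_REJECT, and sever the
 * fsw <-> netif linkage.
 */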
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
	    if_name(ifp), ifp->if_family);

	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	os_atomic_or(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		os_atomic_thread_fence(seq_cst);
	}

	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
}

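/*
 * Complete the host-side attach: ensure the netif below has an ifnet,
 * clear any stale detach state, wire up the interface via
 * fsw_setup_ifp(), and record the interface index in the nexus params.
 */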
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);


	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single-threaded writes to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}

static int
fsw_host_teardown(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;

	SK_LOCK_ASSERT_HELD();
	return fsw_detach(fsw, hwna, FALSE);
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
fsw_ctl_attach_log(const struct nx_spec_req *nsr,
    const struct kern_nexus *nx, int err)
{
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;

	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}

	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
}
#endif /* SK_LOG */

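/*
 * Point the netif dev adapter at the flowswitch datapath (or detach it):
 * in NETIF_MODE_FSW, inbound packets are steered to fsw_devna_rx.
 */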
SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
{
	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	if (set) {
		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
	} else {
		netif_hwna_clear_mode(hwna);
	}
}

SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
{
	fsw_netif_set_callbacks_common(fsw, TRUE);
}

SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
{
	fsw_netif_set_callbacks_common(fsw, FALSE);
}

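/*
 * Start the flowswitch datapath: install the netif callbacks and start
 * the dev and host channels.  fsw_dp_stop() below reverses this, after
 * quiescing the interface when needed.
 */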
SK_NO_INLINE_ATTRIBUTE
static void
fsw_dp_start(struct nx_flowswitch *fsw)
{
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);

	fsw_netif_set_callbacks(fsw);
	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}

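/*
 * Open a special (config-mode) channel to the netif's dev or host port
 * and cache it on the flowswitch; fsw_netif_port_teardown() undoes it.
 */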
SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
    boolean_t host)
{
	struct chreq chr;
	struct kern_channel *ch;
	int err;

	bzero(&chr, sizeof(chr));
	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);

	err = 0;
	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
	if (ch == NULL) {
		SK_ERR("ch_open_special(%s) failed: %d",
		    host ? "host" : "dev", err);
		return err;
	}
	if (host) {
		fsw->fsw_host_ch = ch;
	} else {
		fsw->fsw_dev_ch = ch;
	}
	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
{
	struct kern_channel *ch;

	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
	if (ch == NULL) {
		return EINVAL;
	}
	if (host) {
		fsw->fsw_host_ch = NULL;
	} else {
		fsw->fsw_dev_ch = NULL;
	}
	ch_close_special(ch);
	(void) ch_release_locked(ch);
	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_devna_teardown(struct nx_flowswitch *fsw)
{
	return fsw_netif_port_teardown(fsw, FALSE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_hostna_teardown(struct nx_flowswitch *fsw)
{
	return fsw_netif_port_teardown(fsw, TRUE);
}

/* Process NXCFG_CMD_ATTACH */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}

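/*
 * Detach the flowswitch from the netif: stop the datapath, then tear
 * down the host attachment and both special channels, resuming data
 * movement on the interface if it was suspended.
 */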
SK_NO_INLINE_ATTRIBUTE
static void
fsw_cleanup(struct nx_flowswitch *fsw)
{
	int err;
	struct ifnet *ifp = NULL;

	if (fsw->fsw_dev_ch == NULL) {
		ASSERT(fsw->fsw_host_ch == NULL);
		return;
	}
	err = fsw_dp_stop(fsw, &ifp);
	if (err != 0) {
		return;
	}
	err = fsw_host_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_hostna_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_devna_teardown(fsw);
	VERIFY(err == 0);

	if (ifp != NULL) {
		ifnet_datamov_resume(ifp);
	}
}

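/* Process NXCFG_CMD_DETACH */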
int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}

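/*
 * Process NXCFG_CMD_NETEM: set up or update the network emulator
 * (netem) instance on the interface's input path.
 */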
static int
fsw_netem_config(struct nx_flowswitch *fsw, void *data)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	struct if_netem_params *params = data;
	int ret;

	if (ifp == NULL) {
		return ENODEV;
	}

	SK_LOCK_ASSERT_HELD();
#define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
#define fsw_INPUT_NETEM_THREADNAME_LEN       32
	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
	(void) snprintf(netem_name, sizeof(netem_name),
	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
	ret = netem_config(&ifp->if_input_netem, netem_name, ifp, params, fsw,
	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);

	return ret;
}

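/*
 * Nexus configuration entry point for the flowswitch: validates flow
 * requests (including the delegate-privilege check for an effective
 * PID/UUID) and dispatches on the NXCFG_CMD_* code.
 */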
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct nx_spec_req *nsr = data;
	struct nx_flow_req *req = data;
	boolean_t need_check;
	int error = 0;

	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}
		break;

	default:
		break;
	}

	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_CONFIG:
		error = fsw_ctl_flow_config(fsw, p, data);
		break;

	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}

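/* Map an ifnet to the flowswitch attached to it, or NULL if none. */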
struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet *ifp)
{
	struct nx_flowswitch *fsw = NULL;

	if (ifp->if_na != NULL) {
		fsw = ifp->if_na->nifna_netif->nif_fsw;
	}
	return fsw;
}

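/*
 * ifnet eventhandler: refresh the cached source link-layer address on
 * LLADDR_UPDATE and kick the reaper thread on LOW_POWER_UPDATE.
 */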
static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			os_atomic_inc(&fsw->fsw_src_lla_gencnt, relaxed);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}

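/*
 * Protocol control eventhandler: map a TCP/UDP 5-tuple event to the
 * matching flow entry and forward it to the netagent as a flow update.
 */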
static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *fsw = NULL;
	struct flow_entry *fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now we only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		/*
		 * rdar://107435899 The scope ID for destination address needs
		 * to be cleared out before looking up the flow entry for this
		 * 5-tuple, because addresses in flow entries do not contain the
		 * scope ID.
		 */
		struct in6_addr *in6;

		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
		in6 = &fk.fk_dst6;
		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
			in6->s6_addr16[1] = 0;
		}
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure the protocol
	 * event received is for bytes in flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better-route events.
	 * Also, redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}

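/*
 * Add or remove the flowswitch netagent on the interface; bridged
 * interfaces are refused (see rdar://107076453).
 */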
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else if (fsw->fsw_ifp->if_bridge != NULL) {
			/* see rdar://107076453 */
			SK_ERR("%s is bridged, not adding netagent",
			    if_name(fsw->fsw_ifp));
			error = EBUSY;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				        |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}

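/*
 * Recompute the netagent flags (listener/provider/custom-IP) from the
 * interface's current flowswitch netagent needs and push them to the
 * agent.
 */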
void
fsw_netagent_update(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NULL;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		goto out;
	}
	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);
	if (fsw->fsw_agent_session == NULL) {
		goto out;
	}
	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
	if (ip_agent || transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
	}
	if (transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
	}
	if (ip_agent) {
		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	} else {
		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	}
	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
		SK_D("flowswitch netagent updated for interface %s",
		    if_name(fsw->fsw_ifp));
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
}

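/*
 * Constructor invoked when a nexus port is associated with an adapter:
 * rejects user channel opens while no interface is attached, and picks
 * a default channel mitigation interval based on the interface type.
 */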
static int
fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    const struct nxbind *nxb)
{
#pragma unused(nxb)
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
	    vpna->vpna_pid == nxb->nxb_pid);

	/*
	 * Reject regular channel open requests unless there is
	 * something attached to the host port of the flowswitch.
	 */
	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
		struct nexus_adapter *na = &vpna->vpna_up;
		struct ifnet *ifp = fsw->fsw_ifp;

		if (ifp == NULL) {
			err = ENXIO;
			goto done;
		}

		/* if adapter supports mitigation, set default value */
		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
			if (IFNET_IS_WIFI(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
			} else if (IFNET_IS_CELLULAR(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
			} else if (IFNET_IS_ETHERNET(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
			} else {
				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
			}
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);

	return err;
}

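/*
 * Destructor counterpart of fsw_port_ctor(): purge all flows owned via
 * this nexus port; returns true if any flow was actually purged.
 */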
static bool
fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	nexus_port_t nx_port = vpna->vpna_nx_port;
	uint32_t purge_cnt;

	ASSERT(fsw == vpna->vpna_fsw);
	ASSERT(nx_port != NEXUS_PORT_ANY);

	/*
	 * If this nexus port was bound to a PID, we just need to look at a
	 * single bucket and iterate from there.  Note that in any case, we
	 * can't just search for a single flow_owner based on the PID itself,
	 * since a given process may be opening multiple channels to the
	 * flowswitch; hence we search for the ones matching this nexus port.
	 *
	 * Close any open flows on the port and remove the flow owner and
	 * nexus port binding.
	 */
	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, nx_port, FALSE);

	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
	    "purged %u", SK_KVA(fsw), (int)nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
	    purge_cnt);

	return purge_cnt != 0;
}

/*
 * Flowswitch nexus port allocator.
 *
 * A nexus port is represented by a bit in the port bitmap; its state is
 * either free or allocated.  A free state implies that the port has no
 * nxbind AND no nexus adapter association.  An allocated state means that
 * either it has a nxbind OR a nexus adapter association.  This routine
 * manages the nexus adapter association with a nexus port; nxbind is
 * handled separately via nx_fsw_port_bind().
 *
 * The caller of this routine may optionally pass in a NULL nexus adapter.
 * In such a case (*vpna is NULL), this routine checks to see if the port
 * has already been associated with an adapter, and returns a reference to
 * that adapter.  No action is taken on a port that doesn't have an adapter
 * associated.  Otherwise (*vpna is non-NULL), this routine associates that
 * adapter with a port that's not already associated with one; the reference
 * to the adapter is untouched here, as the caller is expected to handle it.
 *
 * The flowswitch code invokes this routine each time it is requested to
 * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
 * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
 * This allows for multiple channels to be opened to a nexus port, each
 * time holding a reference to that same nexus adapter.  The releasing of
 * the nexus port only happens when the last channel closes.
 */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw 0x%llx nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}

int
fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
    boolean_t ifattach, boolean_t host)
{
	int err = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	if (ifattach) {
		/* override port to either NX_FSW_{HOST,DEV} */
		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
		/* allocate reserved port for ifattach */
		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
	} else if (host) {
		/* host is valid only for ifattach */
		err = EINVAL;
	} else {
		/* nexus port otherwise (reserve dev and host for ifattach) */
		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
	}

	return err;
}

/*
 * Remove nexus port association from a nexus adapter.  This call is
 * the opposite of fsw_port_alloc(), except that it is called only
 * at nx_fsw_vp_na_dtor() destructor time.  See above notes
 * on fsw_port_alloc().
 */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
}

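/*
 * Adapter activation callback for a flowswitch port; for user ports,
 * propagate the on/defunct/off mode to any flow-owner resources
 * (e.g. flow advisories).
 */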
int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}

int
fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
{
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);

	/*
	 * During defunct, we want to purge all flows associated with this
	 * port and the flow owner as well.  This is accomplished as part
	 * of calling the port's destructor.  However, we still want to
	 * occupy the nexus port since there's a channel open to it.
	 */
	FSW_WLOCK(fsw);
	if (!vpna->vpna_defunct) {
		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
	} else {
		err = EALREADY;
	}
	FSW_WUNLOCK(fsw);

	return err;
}

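/*
 * MIB export of flow statistics: a single flow when filtered by flow ID
 * or 5-tuple, otherwise every flow (including lingering entries on the
 * deferred free list).  Returns the space required/used, in bytes.
 */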
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sa.sa_family == AF_INET &&
		    itpl->itpl_remote_sa.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sa.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sa.sa_family,
			    itpl->itpl_remote_sa.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
		        return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

1769 	/*
1770 	 * Also return the ones in deferred free list.
1771 	 */
1772 	lck_mtx_lock(&fsw->fsw_linger_lock);
1773 	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
1774 		actual_space += sf_size;
1775 		if (out == NULL || actual_space > len) {
1776 			continue;
1777 		}
1778 
1779 		flow_entry_stats_get(fe, sf);
1780 		sf++;
1781 	}
1782 	lck_mtx_unlock(&fsw->fsw_linger_lock);
1783 
1784 	return actual_space;
1785 }
1786 
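/*
 * Illustrative sketch (not part of the build): like the other MIB
 * getters below, this routine is written for two-pass use; the names
 * "needed" and "buf" here are hypothetical.
 *
 *	size_t needed = fsw_mib_get_flow(fsw, filter, NULL, 0);
 *	... allocate "needed" bytes into buf ...
 *	size_t used = fsw_mib_get_flow(fsw, filter, buf, needed);
 *
 * A first call with out == NULL returns the space required; a second
 * call with a buffer at least that large copies the records out.
 */
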
static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		if (arn->arn_flowadv_obj == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strlcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		sfae = &sfa->sfa_entries[0];
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		sfa = (struct sk_stats_flow_adv *)
		    ((uintptr_t)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}

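/*
 * Illustrative sketch (not part of the build): unlike the fixed-stride
 * getters, the flow advisory records above are variable length.  Each
 * channel contributes one header followed by its allocated entries, so
 * the output buffer is laid out as:
 *
 *	[sk_stats_flow_adv #0][sfae 0..n-1][sk_stats_flow_adv #1][...]
 *
 * which is why "sfa" is re-derived from (out + actual_space) rather
 * than advanced by a constant stride.
 */
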
static inline void
fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
    struct sk_stats_flow_owner *sfo)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;

	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strlcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);
	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));

	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
	    fo->fo_name);
	sfo->sfo_pid = fo->fo_pid;
	sfo->sfo_nx_port = fo->fo_nx_port;
	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
}

static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_owner *sfo = out;
	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
	struct flow_owner *fo;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * Ideally we'd like to hide the bucket-level details from the flow
	 * library user, but there is no simple way to iterate flow_owner
	 * entries with buckets/RB_TREEs nested.  So keep it as is.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			actual_space += sfo_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fo2sfo(fsw, fo, sfo);
			sfo++;
		}
		FOB_UNLOCK(fob);
	}

	return actual_space;
}

static inline void
fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
{
	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);

	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;

	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
	}
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		sfr->sfr_flags |= SFLOWRTF_ONLINK;
	}
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
	}
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
	}
	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
	}
	if (fr->fr_flags & FLOWRTF_DELETED) {
		sfr->sfr_flags |= SFLOWRTF_DELETED;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
	}

	lck_spin_lock(&fr->fr_reflock);
	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
	if (fr->fr_expire != 0) {
		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
	} else {
		sfr->sfr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);

	sfr->sfr_laddr = fr->fr_laddr;
	sfr->sfr_faddr = fr->fr_faddr;
	sfr->sfr_gaddr = fr->fr_gaddr;

	if (ll_scrub) {
		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
	} else {
		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
		    ETHER_ADDR_LEN);
	}
}

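/*
 * Note on the scrub value above (descriptive, no behavior change):
 * the designated initializer {[0] = 2} yields 02:00:00:00:00:00,
 * i.e. an all-zero address with only the locally-administered bit
 * set, matching the scrubbed form documented in
 * fsw_mib_get_flow_route() below.
 */
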
#if CONFIG_MACF
extern int dlil_lladdr_ckreq;
#endif /* CONFIG_MACF */

static size_t
fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_route *sfr = out;
	size_t sfr_size = sizeof(struct sk_stats_flow_route);
	struct flow_route *fr;
	boolean_t ll_scrub;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * To get the link-layer info, the caller must have the following
	 * in their sandbox profile (or not be sandboxed at all), else we
	 * scrub it clean just like dlil_ifaddr_bytes() does:
	 *
	 * (allow system-info (info-type "net.link.addr"))
	 *
	 * If scrubbed, we return 02:00:00:00:00:00.
	 */
#if CONFIG_MACF
	ll_scrub = (dlil_lladdr_ckreq &&
	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
#else /* !CONFIG_MACF */
	ll_scrub = FALSE;
#endif /* !CONFIG_MACF */

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		FRB_RLOCK(frb);
		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
			actual_space += sfr_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
			sfr++;
		}
		FRB_UNLOCK(frb);
	}

	return actual_space;
}

static inline void
fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
{
	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);
	sus->sus_owner_pid = pid;

	if (filter->nmf_type & NXMIB_IP_STATS) {
		sus->sus_ip  = nxs->nxs_ipstat;
	}

	if (filter->nmf_type & NXMIB_IP6_STATS) {
		sus->sus_ip6 = nxs->nxs_ip6stat;
	}

	if (filter->nmf_type & NXMIB_TCP_STATS) {
		sus->sus_tcp = nxs->nxs_tcpstat;
	}

	if (filter->nmf_type & NXMIB_UDP_STATS) {
		sus->sus_udp = nxs->nxs_udpstat;
	}

	if (filter->nmf_type & NXMIB_QUIC_STATS) {
		sus->sus_quic = nxs->nxs_quicstat;
	}
}

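/*
 * Illustrative sketch (not part of the build): nmf_type is a bitmask,
 * so a single filter can select several protocol groups at once.  A
 * hypothetical caller asking for TCP and IPv4 stats only ("buf" and
 * "buflen" are assumed names):
 *
 *	struct nexus_mib_filter mf;
 *	bzero(&mf, sizeof(mf));
 *	mf.nmf_type = NXMIB_TCP_STATS | NXMIB_IP_STATS;
 *	(void) fsw_mib_get_userstack_stats(fsw, &mf, buf, buflen);
 *
 * Only sus_tcp and sus_ip would be filled in each record.
 */
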
static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/* copyout saved stats from closed ports */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a process only opens one channel to a nexus, so
	 * we don't do per-process aggregation of inet stats for now, as
	 * that would need a fair amount of code.
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}

static size_t
fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
{
	struct sk_stats_flow_switch *sfs = out;
	size_t actual_space = sizeof(struct sk_stats_flow_switch);

	if (out != NULL && actual_space <= len) {
		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
		(void) strlcpy(sfs->sfs_if_name,
		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
		sfs->sfs_fsws = fsw->fsw_stats;
	}

	return actual_space;
}

size_t
fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
	size_t ret;

	switch (filter->nmf_type) {
	case NXMIB_FSW_STATS:
		ret = fsw_mib_get_stats(fsw, out, len);
		break;
	case NXMIB_FLOW:
		ret = fsw_mib_get_flow(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_OWNER:
		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ROUTE:
		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
		break;
	case NXMIB_TCP_STATS:
	case NXMIB_UDP_STATS:
	case NXMIB_IP_STATS:
	case NXMIB_IP6_STATS:
	case NXMIB_USERSTACK_STATS:
		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ADV:
		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
		break;
	default:
		ret = 0;
		break;
	}

	return ret;
}

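/*
 * Illustrative sketch (not part of the build): a hypothetical caller
 * fetching the stats of one flow by UUID through the dispatcher above;
 * "flow_uuid" is an assumed name.
 *
 *	struct nexus_mib_filter mf;
 *	struct sk_stats_flow sf;
 *
 *	bzero(&mf, sizeof(mf));
 *	mf.nmf_type = NXMIB_FLOW;
 *	mf.nmf_bitmap = NXMIB_FILTER_FLOW_ID;
 *	uuid_copy(mf.nmf_flow_id, flow_uuid);
 *	if (fsw_mib_get(fsw, &mf, &sf, sizeof(sf), p) == sizeof(sf)) {
 *		... sf now holds the flow's stats ...
 *	}
 */
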
void
fsw_fold_stats(struct nx_flowswitch *fsw,
    void *data, nexus_stats_type_t type)
{
	ASSERT(data != NULL);
	FSW_LOCK_ASSERT_HELD(fsw);

	switch (type) {
	case NEXUS_STATS_TYPE_FSW:
	{
		struct __nx_stats_fsw *d, *s;
		d = fsw->fsw_closed_na_stats;
		s = data;
		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
		break;
	}
	case NEXUS_STATS_TYPE_CHAN_ERRORS:
	{
		struct __nx_stats_channel_errors *s = data;
		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

boolean_t
fsw_detach_barrier_add(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	if (__improbable(fsw->fsw_detach_flags != 0 ||
	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return FALSE;
	}
	fsw->fsw_detach_barriers++;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	return TRUE;
}

void
fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
	ASSERT(fsw->fsw_detach_barriers != 0);
	fsw->fsw_detach_barriers--;
	/* if there's a thread waiting to detach the interface, let it know */
	if (__improbable((fsw->fsw_detach_waiters > 0) &&
	    (fsw->fsw_detach_barriers == 0))) {
		fsw->fsw_detach_waiters = 0;
		wakeup(&fsw->fsw_detach_waiters);
	}
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
}

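/*
 * Illustrative sketch (not part of the build): the intended pairing
 * of the two barrier routines above.  A thread that must dereference
 * fsw_ifp first takes a barrier, which fails once a detach has begun;
 * the error value chosen by the caller below is hypothetical.
 *
 *	if (!fsw_detach_barrier_add(fsw)) {
 *		return ENXIO;	// interface detaching or gone
 *	}
 *	... safe to use fsw->fsw_ifp here ...
 *	fsw_detach_barrier_remove(fsw);
 *
 * fsw_detach() blocks until all outstanding barriers are removed.
 */
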
/*
 * Generic resolver for non-Ethernet interfaces.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as the target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or gateway
	 * route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}

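/*
 * Target-selection summary for fsw_generic_resolve() (descriptive,
 * no behavior change):
 *
 *	FLOWRTF_ONLINK set	-> tgt_rt = fr_rt_dst (final destination)
 *	FLOWRTF_GATEWAY set	-> tgt_rt = fr_rt_gw (next-hop gateway)
 *	neither set		-> EHOSTUNREACH after a reconfigure attempt
 */
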
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}

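/*
 * Illustrative sketch (not part of the build): on development systems
 * the boot-arg parsed above would typically be supplied through the
 * nvram boot-args string, e.g.:
 *
 *	nvram boot-args="fsw_use_dual_sized_pool=0"
 *
 * PE_parse_boot_argn() leaves fsw_use_dual_sized_pool at its default
 * when the boot-arg is absent.
 */
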
void
fsw_init(void)
{
	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);

	if (!__nx_fsw_inited) {
		fsw_read_boot_args();
		/*
		 * Register callbacks for interface & protocol events.
		 * Use a dummy arg for the callback cookie.
		 */
		__nx_fsw_ifnet_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
		    ifnet_event, fsw_ifnet_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);

		__nx_fsw_protoctl_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
		    protoctl_event, fsw_protoctl_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
		__nx_fsw_inited = 1;
	}
}

void
fsw_uninit(void)
{
	if (__nx_fsw_inited) {
		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
		    __nx_fsw_ifnet_eventhandler_tag);
		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
		    __nx_fsw_protoctl_eventhandler_tag);

		__nx_fsw_inited = 0;
	}
}

struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)
{
	struct nx_flowswitch *fsw;
	struct __nx_stats_fsw *nsfw;

	SK_LOCK_ASSERT_HELD();

	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
	if (nsfw == NULL) {
		return NULL;
	}

	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
	if (fsw == NULL) {
		zfree(nx_fsw_stats_zone, nsfw);
		return NULL;
	}

	FSW_RWINIT(fsw);
	fsw->fsw_dev_ch = NULL;
	fsw->fsw_host_ch = NULL;
	fsw->fsw_closed_na_stats = nsfw;

	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));

	return fsw;
}

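/*
 * Illustrative sketch (not part of the build): fsw_alloc() pairs with
 * fsw_free() below; both run under SK_LOCK.  A hypothetical creation
 * path would look like:
 *
 *	SK_LOCK();
 *	fsw = fsw_alloc(Z_WAITOK);
 *	if (fsw == NULL) {
 *		... handle allocation failure ...
 *	}
 *	...
 *	fsw_free(fsw);		// detaches (purge) and frees
 *	SK_UNLOCK();
 */
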
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * Return an error if the host port detach is in progress or has
	 * already completed.  For the case of flowswitch free (i.e. purge
	 * is TRUE) we have to clean up everything, so we will block if
	 * needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * Mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit.  Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * Wait until all threads needing access to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * If both the NA detach thread and the flowswitch free thread
	 * were waiting, the thread that wins the race is responsible
	 * for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release the lock to let all
	 * outstanding agent threads enter, then wait until all of them
	 * have exited the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}

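/*
 * Detach-flag lifecycle for fsw_detach() (descriptive, no behavior
 * change):
 *
 *	0 ---------------------> FSW_DETACHF_DETACHING
 *	  (detach begins; new fsw_detach_barrier_add() calls fail)
 *	FSW_DETACHF_DETACHING -> FSW_DETACHF_DETACHED
 *	  (all barriers drained; winner of the race runs the dtor)
 */
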
void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}