xref: /xnu-10002.1.13/bsd/skywalk/nexus/flowswitch/fsw.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
56 #include <skywalk/nexus/flowswitch/fsw_var.h>
57 #include <skywalk/nexus/netif/nx_netif.h>
58 #include <skywalk/nexus/netif/nx_netif_compat.h>
59 
60 #include <net/bpf.h>
61 #include <net/if.h>
62 #include <net/pktsched/pktsched_netem.h>
63 #include <sys/eventhandler.h>
64 
#if (DEVELOPMENT || DEBUG)
/* Debug-only knob: expose fsw_chain_enqueue as a runtime sysctl. */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

/*
 * Configures the flowswitch to utilize user packet pool with
 * dual sized buffers.
 * A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

/* Non-zero: enqueue packets as chains (tunable on DEV/DEBUG kernels). */
uint32_t fsw_chain_enqueue = 1;
/* Module-init state; presumably set by the fsw init path (not in view). */
static int __nx_fsw_inited = 0;
/* Event handler registrations; NULL until registered elsewhere in file. */
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

/* Zone for struct nx_flowswitch allocations. */
static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);

/* Zone for per-flowswitch statistics blocks. */
static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);

#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))

static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);
111 
/*
 * Attach a virtual-port (user channel) adapter to this flowswitch nexus.
 *
 * Reuses an existing adapter on the requested nexus port when one is
 * present; otherwise creates a new vp adapter, attaches it to the
 * flowswitch, and binds it to a port.  On success *vpna holds a retained
 * adapter reference; on failure *vpna is NULL and an errno is returned.
 */
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
#pragma unused(ch)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	char *cr_name = chr->cr_name;
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	/* config-mode channels attach through fsw_ctl_attach(), not here */
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	/* attach vp to fsw */
	err = fsw_vp_na_attach(nx, cr_name, &(*vpna)->vpna_up);
	if (err != 0) {
		SK_ERR("vpna \"%s\" fsw attach failed (err %d)",
		    (*vpna)->vpna_up.na_name, err);
		goto out;
	}

	/* bind the newly created adapter to its assigned nexus port */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		/* on any failure, drop the ref held on our behalf */
		if (err != 0) {
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}
178 
179 static int
fsw_nx_check(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)180 fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
181 {
182 #pragma unused(fsw)
183 	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
184 
185 	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
186 		return EINVAL;
187 	}
188 
189 	/* it's a netif below */
190 	return 0;
191 }
192 
/*
 * Process a flow-add request on behalf of process p.  User-space
 * requests are validated, internalized, forced to carry flow tracking
 * and flow advisory, and externalized again before returning; kernel
 * (bsd) flows must not request those features.  Returns 0 or an errno.
 */
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special port shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		/* user flows always get tracking and flow advisory */
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	if (req->nfr_epid == -1) {
		/* default the effective pid to the caller's pid */
		req->nfr_epid = proc_pid(p);
	}

	/* bound the number of demux patterns a request may carry */
	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		// user space don't need this flow stats
		flow_stats_release(req->nfr_flow_stats);
	}
	/* hand user-visible fields back in externalized form */
	if (p != kernproc) {
		nx_flow_req_externalize(req);
	}

	return error;
}
242 
243 static int
fsw_ctl_flow_del(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)244 fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
245     struct nx_flow_req *req)
246 {
247 	int err;
248 
249 	nx_flow_req_internalize(req);
250 	req->nfr_pid = proc_pid(p);
251 	err = fsw_flow_del(fsw, req, TRUE, NULL);
252 
253 	nx_flow_req_externalize(req);
254 	return err;
255 }
256 
257 static int
fsw_ctl_flow_config(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)258 fsw_ctl_flow_config(struct nx_flowswitch *fsw, struct proc *p,
259     struct nx_flow_req *req)
260 {
261 	int err;
262 
263 	nx_flow_req_internalize(req);
264 	req->nfr_pid = proc_pid(p);
265 	err = fsw_flow_config(fsw, req);
266 
267 	nx_flow_req_externalize(req);
268 	return err;
269 }
270 
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for the per-flowswitch "rps_nthreads" knob: reads the
 * current RPS thread count, and applies a new one when written.
 * (RPS here presumably means receive packet steering — confirm against
 * fsw_rps_set_nthreads(), which is not visible in this chunk.)
 */
static int
fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct nx_flowswitch *fsw = arg1;       /* set via skoid_add_handler */
	uint32_t nthreads;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
	if (error == 0 && changed != 0) {
		/* a new value was written; apply it */
		error = fsw_rps_set_nthreads(fsw, nthreads);
	}
	return error;
}
#endif /* !DEVELOPMENT && !DEBUG */
289 
290 void
fsw_get_tso_capabilities(struct ifnet * ifp,uint32_t * tso_v4_mtu,uint32_t * tso_v6_mtu)291 fsw_get_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu, uint32_t *tso_v6_mtu)
292 {
293 #pragma unused(ifp)
294 	*tso_v4_mtu = 0;
295 	*tso_v6_mtu = 0;
296 
297 #ifdef XNU_TARGET_OS_OSX
298 	struct nx_flowswitch *fsw;
299 
300 	fsw = fsw_ifp_to_fsw(ifp);
301 	if (fsw == NULL) {
302 		return;
303 	}
304 	switch (fsw->fsw_tso_mode) {
305 	case FSW_TSO_MODE_HW: {
306 		ASSERT(ifp->if_tso_v4_mtu != 0 || ifp->if_tso_v6_mtu != 0);
307 		*tso_v4_mtu = ifp->if_tso_v4_mtu;
308 		*tso_v6_mtu = ifp->if_tso_v6_mtu;
309 		break;
310 	}
311 	case FSW_TSO_MODE_SW: {
312 		ASSERT(fsw->fsw_tso_sw_mtu != 0);
313 		*tso_v4_mtu = fsw->fsw_tso_sw_mtu;
314 		*tso_v6_mtu = fsw->fsw_tso_sw_mtu;
315 		break;
316 	}
317 	default:
318 		break;
319 	}
320 #endif /* XNU_TARGET_OS_OSX */
321 }
322 
/*
 * Pick the TSO mode for this flowswitch: hardware when the driver
 * advertises TSO (and the arena has a large buffer to match), software
 * GSO when a large buffer can cover sk_fsw_gso_mtu, otherwise none.
 * TSO is only ever enabled on macOS targets with native skywalk drivers.
 */
static void
fsw_tso_setup(struct nx_flowswitch *fsw)
{
	fsw->fsw_tso_mode = FSW_TSO_MODE_NONE;
#ifdef XNU_TARGET_OS_OSX
	struct ifnet *ifp = fsw->fsw_ifp;
	/* compat (non-native) interfaces never get TSO from the fsw */
	if (!SKYWALK_CAPABLE(ifp) || !SKYWALK_NATIVE(ifp)) {
		DTRACE_SKYWALK2(tso__no__support, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	uint32_t large_buf_size = NX_PROV_PARAMS(fsw->fsw_nx)->nxp_large_buf_size;

	/* without a large buffer there is nothing to segment from */
	if (large_buf_size == 0) {
		DTRACE_SKYWALK2(no__large__buf, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	/*
	 * Unlike _dlil_adjust_large_buf_size_for_tso(), we check the nif_hwassist
	 * flags here for the original flags because nx_netif_host_adjust_if_capabilities()
	 * has already been called.
	 */
	if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0 && ifp->if_tso_v4_mtu != 0) ||
	    ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0 && ifp->if_tso_v6_mtu != 0)) {
		/* driver does TSO; large buffer must fit within its MTU */
		ASSERT(large_buf_size <= ifp->if_tso_v4_mtu ||
		    large_buf_size <= ifp->if_tso_v6_mtu);
		fsw->fsw_tso_mode = FSW_TSO_MODE_HW;
	} else {
		/* fall back to software GSO when the buffer is big enough */
		if (sk_fsw_gso_mtu != 0 && large_buf_size >= sk_fsw_gso_mtu) {
			fsw->fsw_tso_mode = FSW_TSO_MODE_SW;
			fsw->fsw_tso_sw_mtu = sk_fsw_gso_mtu;
		}
	}
	DTRACE_SKYWALK3(tso__mode, struct nx_flowswitch *, fsw,
	    fsw_tso_mode_t, fsw->fsw_tso_mode, uint32_t, large_buf_size);
#endif /* XNU_TARGET_OS_OSX */
}
362 
/*
 * Bind the flowswitch to the ifnet behind the netif host adapter hwna:
 * create the IP fragment manager, install the per-family framing and
 * copy routines, cache the ifp/netif adapter pointers, set up classq
 * and TSO state, register the netagent, and publish the sysctl skoid.
 * Called with SK_LOCK held; takes FSW_WLOCK for the duration.
 */
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	/* cap the fragment manager at half the RX metadata objects */
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non TXSTART interface.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	/* a flowswitch is bound to at most one interface at a time */
	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	/* choose framing/resolve callbacks and BPF DLT per family */
	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	/* multi-buflet packets need the multi-buflet copy routines */
	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;
	fsw_tso_setup(fsw);

	/* rename the (already running) reaper thread after the ifp */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		os_atomic_andnot(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	/* rps_nthreads is only meaningful on native skywalk interfaces */
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}
517 
/*
 * Undo fsw_setup_ifp(): unregister the netagent, destroy the fragment
 * manager and skoid, tear down classq state, mark the nexus rejecting,
 * and sever every cached pointer between the flowswitch and the ifp.
 * Called with SK_LOCK and FSW_WLOCK held; hwna may be NULL if the
 * adapters are already gone.
 */
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
	    if_name(ifp), ifp->if_family);

	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	os_atomic_or(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		/* make the cleared pointers visible to other CPUs */
		os_atomic_thread_fence(seq_cst);
	}

	/* drop every cached callback/pointer installed at setup time */
	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	/* netem instances are per-attachment; destroy with the binding */
	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	/* rename the reaper thread to reflect the detached state */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
}
581 
/*
 * Finish attaching the flowswitch to the host-side netif adapter:
 * validate the underlying ifnet, guard against a concurrent detach,
 * run fsw_setup_ifp(), and publish the interface index in the nexus
 * provider parameters.  Called with SK_LOCK held.
 */
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);


	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	/* refuse to proceed if a detach is already in flight */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}
629 
630 static int
fsw_host_teardown(struct nx_flowswitch * fsw)631 fsw_host_teardown(struct nx_flowswitch *fsw)
632 {
633 	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
634 
635 	SK_LOCK_ASSERT_HELD();
636 	return fsw_detach(fsw, hwna, FALSE);
637 }
638 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
fsw_ctl_attach_log(const struct nx_spec_req *nsr,
    const struct kern_nexus *nx, int err)
{
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;

	/* pick the most specific identifier the request carries */
	if ((nsr->nsr_flags & NXSPECREQ_UUID) != 0) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if ((nsr->nsr_flags & NXSPECREQ_IFP) != 0) {
		/* no UUID; log the kernel VA of the ifnet instead */
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}

	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
}
#endif /* SK_LOG */
665 
666 SK_NO_INLINE_ATTRIBUTE
667 static void
fsw_netif_set_callbacks_common(struct nx_flowswitch * fsw,boolean_t set)668 fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
669 {
670 	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
671 
672 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
673 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
674 
675 	if (set) {
676 		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
677 	} else {
678 		netif_hwna_clear_mode(hwna);
679 	}
680 }
681 
682 SK_NO_INLINE_ATTRIBUTE
683 static void
fsw_netif_set_callbacks(struct nx_flowswitch * fsw)684 fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
685 {
686 	fsw_netif_set_callbacks_common(fsw, TRUE);
687 }
688 
689 SK_NO_INLINE_ATTRIBUTE
690 static void
fsw_netif_clear_callbacks(struct nx_flowswitch * fsw)691 fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
692 {
693 	fsw_netif_set_callbacks_common(fsw, FALSE);
694 }
695 
/*
 * Start the datapath: install the netif callbacks, then start the
 * dev- and host-side special channels (in that order).  Both channels
 * must already have been opened by fsw_ctl_attach().
 */
SK_NO_INLINE_ATTRIBUTE
static void
fsw_dp_start(struct nx_flowswitch *fsw)
{
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);

	fsw_netif_set_callbacks(fsw);
	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
}
707 
/*
 * Quiesce and stop the datapath (reverse of fsw_dp_start()).  Returns
 * EALREADY if the flowswitch is already quiesced.  If the ifp needed a
 * datamov suspension, *ifpp is set so the caller can resume and release
 * it after teardown completes.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		/* drop SK_LOCK while draining; drain may block */
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	/* stop in reverse order of fsw_dp_start() */
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}
745 
746 SK_NO_INLINE_ATTRIBUTE
747 static int
fsw_netif_port_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx,boolean_t host)748 fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
749     boolean_t host)
750 {
751 	struct chreq chr;
752 	struct kern_channel *ch;
753 	int err;
754 
755 	bzero(&chr, sizeof(chr));
756 	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
757 	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
758 	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
759 	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);
760 
761 	err = 0;
762 	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
763 	if (ch == NULL) {
764 		SK_ERR("ch_open_special(%s) failed: %d",
765 		    host ? "host" : "dev", err);
766 		return err;
767 	}
768 	if (host) {
769 		fsw->fsw_host_ch = ch;
770 	} else {
771 		fsw->fsw_dev_ch = ch;
772 	}
773 	return 0;
774 }
775 
776 SK_NO_INLINE_ATTRIBUTE
777 static int
fsw_netif_port_teardown(struct nx_flowswitch * fsw,boolean_t host)778 fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
779 {
780 	struct kern_channel *ch;
781 
782 	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
783 	if (ch == NULL) {
784 		return EINVAL;
785 	}
786 	if (host) {
787 		fsw->fsw_host_ch = NULL;
788 	} else {
789 		fsw->fsw_dev_ch = NULL;
790 	}
791 	ch_close_special(ch);
792 	(void) ch_release_locked(ch);
793 	return 0;
794 }
795 
796 SK_NO_INLINE_ATTRIBUTE
797 static int
fsw_devna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)798 fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
799 {
800 	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
801 }
802 
803 SK_NO_INLINE_ATTRIBUTE
804 static int
fsw_hostna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)805 fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
806 {
807 	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
808 }
809 
810 SK_NO_INLINE_ATTRIBUTE
811 static int
fsw_devna_teardown(struct nx_flowswitch * fsw)812 fsw_devna_teardown(struct nx_flowswitch *fsw)
813 {
814 	return fsw_netif_port_teardown(fsw, FALSE);
815 }
816 
817 SK_NO_INLINE_ATTRIBUTE
818 static int
fsw_hostna_teardown(struct nx_flowswitch * fsw)819 fsw_hostna_teardown(struct nx_flowswitch *fsw)
820 {
821 	return fsw_netif_port_teardown(fsw, TRUE);
822 }
823 
/*
 * Process NXCFG_CMD_ATTACH: attach the netif nexus identified by
 * nsr->nsr_uuid below this flowswitch.  On success the devna UUID is
 * returned in nsr->nsr_if_uuid.  Each setup step is unwound in reverse
 * order on failure.  Called with SK_LOCK held.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	/* only one netif may be attached below a flowswitch */
	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	/* nx_find() retains hw_nx; released at done */
	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		/* can't attach a flowswitch below itself */
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}
904 
/*
 * Tear down everything attached below the flowswitch: stop the
 * datapath, detach from the host adapter, and close both special
 * channels.  A no-op when nothing is attached; returns silently if the
 * datapath was already quiesced (fsw_dp_stop() != 0).
 */
SK_NO_INLINE_ATTRIBUTE
static void
fsw_cleanup(struct nx_flowswitch *fsw)
{
	int err;
	struct ifnet *ifp = NULL;

	if (fsw->fsw_dev_ch == NULL) {
		/* dev and host channels are attached/detached as a pair */
		ASSERT(fsw->fsw_host_ch == NULL);
		return;
	}
	err = fsw_dp_stop(fsw, &ifp);
	if (err != 0) {
		return;
	}
	err = fsw_host_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_hostna_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_devna_teardown(fsw);
	VERIFY(err == 0);

	/* resume datamov if fsw_dp_stop() suspended it */
	if (ifp != NULL) {
		ifnet_datamov_resume(ifp);
	}
}
933 
/*
 * Process NXCFG_CMD_DETACH: detach the netif identified by
 * nsr->nsr_if_uuid from this flowswitch, or everything when nsr is
 * NULL (destructor path).  Called with SK_LOCK held.
 */
int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		/* nothing is attached below us */
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
989 
/*
 * Configure (or reconfigure) the network emulator on the interface
 * attached to @fsw, using the struct if_netem_params passed in @data.
 *
 * Returns ENODEV when no interface is attached; otherwise returns the
 * result of netem_config().  Called with the SK lock held.
 */
static int
fsw_netem_config(struct nx_flowswitch *fsw, void *data)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	struct if_netem_params *params = data;
	int ret;

	if (ifp == NULL) {
		return ENODEV;
	}

	SK_LOCK_ASSERT_HELD();
	/* name template for the per-interface input netem thread */
#define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
#define fsw_INPUT_NETEM_THREADNAME_LEN       32
	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
	(void) snprintf(netem_name, sizeof(netem_name),
	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
	/* install fsw_dev_input_netem_dequeue as the dequeue callback */
	ret = netem_config(&ifp->if_input_netem, netem_name, ifp, params, fsw,
	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);

	return ret;
}
1012 
/*
 * Top-level configuration entry point for the flowswitch nexus.
 *
 * Dispatches @nc_cmd on @nx with a command-specific payload in @data:
 * NXCFG_CMD_ATTACH/DETACH take a struct nx_spec_req, the FLOW_*
 * commands take a struct nx_flow_req, and NXCFG_CMD_NETEM takes a
 * struct if_netem_params.  Returns 0 or an errno.
 */
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct nx_spec_req *nsr = data;
	struct nx_flow_req *req = data;
	boolean_t need_check;
	int error = 0;

	/* first pass: validate flow add/delete requests */
	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		/* user requests may only carry the public flag bits */
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		/*
		 * Flow add on behalf of another process — identified
		 * either by effective pid or by executable UUID —
		 * requires the socket delegate privilege.
		 */
		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}
		break;

	default:
		break;
	}

	/* second pass: dispatch the command proper */
	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_CONFIG:
		error = fsw_ctl_flow_config(fsw, p, data);
		break;

	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}
1107 
1108 struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet * ifp)1109 fsw_ifp_to_fsw(struct ifnet *ifp)
1110 {
1111 	struct nx_flowswitch *fsw = NULL;
1112 
1113 	if (ifp->if_na != NULL) {
1114 		fsw = ifp->if_na->nifna_netif->nif_fsw;
1115 	}
1116 	return fsw;
1117 }
1118 
/*
 * Eventhandler callback for interface events of interest to the
 * flowswitch.  Under the SK lock it resolves @ifp to its flowswitch
 * (if any) and reacts to:
 *
 *  - INTF_EVENT_CODE_LLADDR_UPDATE: for Ethernet (DLT_EN10MB)
 *    interfaces, refresh the cached source MAC and bump its
 *    generation count so cached link-layer headers are rebuilt.
 *  - INTF_EVENT_CODE_LOW_POWER_UPDATE: when the interface enters
 *    low-power mode, schedule the flow reaper.
 */
static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	/* no netif adapter means no flowswitch can be on this ifp */
	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			/* only relevant for attached Ethernet interfaces */
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			/* invalidate consumers of the cached source MAC */
			os_atomic_inc(&fsw->fsw_src_lla_gencnt, relaxed);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}
1170 
/*
 * Eventhandler callback for protocol-control events (e.g. ICMP
 * unreachable / source quench style notifications) on @ifp.
 *
 * Builds a 5-tuple flow key from the event's addresses/ports, looks
 * up the matching flow entry on the interface's flowswitch, and —
 * after dropping the SK lock but while still holding the flowswitch
 * detach barrier — forwards the event to the netagent for that flow.
 * TCP and UDP only; events without a full 5-tuple are ignored.
 */
static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *fsw = NULL;
	struct flow_entry *fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	/*
	 * Take the detach barrier so the flowswitch (and its agent
	 * session) stays alive after SK_UNLOCK below; fsw is reset to
	 * NULL so the cleanup path doesn't drop a barrier we never got.
	 */
	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	/* assemble the lookup key from the event's 5-tuple */
	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		/*
		 * rdar://107435899 The scope ID for destination address needs
		 * to be cleared out before looking up the flow entry for this
		 * 5-tuple, because addresses in flow entries do not contain the
		 * scope ID.
		 */
		struct in6_addr *in6;

		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
		in6 = &fk.fk_dst6;
		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
			in6->s6_addr16[1] = 0;
		}
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure
	 * protocol event received is for bytes in the flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better route events.
	 * Also redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	/* fsw is kept alive here by the detach barrier taken above */
	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}
1287 
/*
 * Add or remove the flowswitch netagent on the attached interface.
 *
 * With @add TRUE the agent is registered with the interface (and
 * enabled if the global policy allows it); with @add FALSE it is
 * removed.  Returns 0 on success; EINVAL for a non-flowswitch nexus,
 * ENXIO when there is no agent session, EEXIST/ENOENT when the agent
 * is already/not yet added, and EBUSY when the interface is bridged.
 * Called with the SK lock held; takes the flowswitch write lock.
 */
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else if (fsw->fsw_ifp->if_bridge != NULL) {
			/* see rdar://107076453 */
			SK_ERR("%s is bridged, not adding netagent",
			    if_name(fsw->fsw_ifp));
			error = EBUSY;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			/* enable only when the system-wide policy says so */
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				        |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}
1352 
1353 void
fsw_netagent_update(struct kern_nexus * nx)1354 fsw_netagent_update(struct kern_nexus *nx)
1355 {
1356 	struct nx_flowswitch *fsw = NULL;
1357 
1358 	SK_LOCK_ASSERT_HELD();
1359 	VERIFY(nx != NULL);
1360 	VERIFY(NX_PROV(nx) != NULL);
1361 	VERIFY(NX_DOM_PROV(nx) != NULL);
1362 
1363 	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
1364 		goto out;
1365 	}
1366 	fsw = NX_FSW_PRIVATE(nx);
1367 	VERIFY(fsw != NULL);
1368 	FSW_WLOCK(fsw);
1369 	if (fsw->fsw_agent_session == NULL) {
1370 		goto out;
1371 	}
1372 	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
1373 	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
1374 	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
1375 	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
1376 	if (ip_agent || transport_agent) {
1377 		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
1378 	} else {
1379 		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
1380 	}
1381 	if (transport_agent) {
1382 		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
1383 	} else {
1384 		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
1385 	}
1386 	if (ip_agent) {
1387 		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1388 	} else {
1389 		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1390 	}
1391 	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
1392 		SK_D("flowswitch netagent updated for interface %s",
1393 		    if_name(fsw->fsw_ifp));
1394 	}
1395 out:
1396 	if (fsw != NULL) {
1397 		FSW_UNLOCK(fsw);
1398 	}
1399 }
1400 
/*
 * Per-port constructor, invoked when a nexus port is associated with
 * a vp adapter (see fsw_port_alloc__()).
 *
 * For user-visible ports (>= FSW_VP_USER_MIN) this rejects the open
 * with ENXIO unless an interface is attached to the flowswitch, and
 * initializes the adapter's default channel mitigation interval
 * according to the interface type (WiFi/cellular/Ethernet).
 * Returns 0 on success.  Called with the SK lock held.
 */
static int
fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    const struct nxbind *nxb)
{
#pragma unused(nxb)
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	/* a uniqueid-bound port must belong to the binding pid */
	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
	    vpna->vpna_pid == nxb->nxb_pid);

	/*
	 * Reject regular channel open requests unless there is
	 * something attached to the host port of the flowswitch.
	 */
	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
		struct nexus_adapter *na = &vpna->vpna_up;
		struct ifnet *ifp = fsw->fsw_ifp;

		if (ifp == NULL) {
			err = ENXIO;
			goto done;
		}

		/* if adapter supports mitigation, set default value */
		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
			if (IFNET_IS_WIFI(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
			} else if (IFNET_IS_CELLULAR(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
			} else if (IFNET_IS_ETHERNET(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
			} else {
				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
			}
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);

	return err;
}
1447 
1448 static bool
fsw_port_dtor(struct nx_flowswitch * fsw,const struct nexus_vp_adapter * vpna)1449 fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1450 {
1451 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1452 	nexus_port_t nx_port = vpna->vpna_nx_port;
1453 	uint32_t purge_cnt;
1454 
1455 	ASSERT(fsw == vpna->vpna_fsw);
1456 	ASSERT(nx_port != NEXUS_PORT_ANY);
1457 
1458 	/*
1459 	 * If this nexus port was bound to a PID, we just need to look at a
1460 	 * single bucket and iterate from there.  Note that in any case, we
1461 	 * can't just search for a single flow_owner based on the PID itself,
1462 	 * since a given process may be opening multiple channels to the
1463 	 * flowswitch; hence we search for the ones matching this nexus port.
1464 	 *
1465 	 * Close any open flows on the port and remove the flow owner and
1466 	 * nexus port binding.
1467 	 */
1468 	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1469 	    vpna->vpna_pid, nx_port, FALSE);
1470 
1471 	SK_DF(SK_VERB_FSW,
1472 	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
1473 	    "purged %u", SK_KVA(fsw), (int)nx_port,
1474 	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1475 	    purge_cnt);
1476 
1477 	return purge_cnt != 0;
1478 }
1479 
1480 /*
1481  * Flowswitch nexus port allocator.
1482  *
1483  * A nexus port is represented by a bit in the port bitmap; its state is
1484  * either free or allocated.  A free state implies that the port has no
1485  * nxbind AND no nexus adapter association.  An allocated state means that
1486  * either it has a nxbind OR a nexus adapter assocation.  This routine
1487  * manages the nexus adapter association with a nexus port; nxbind is
1488  * handled separately via nx_fsw_port_bind().
1489  *
1490  * The caller of this routine may optionally pass in a NULL nexus adapter.
1491  * In such a case (*vpna is NULL), this routine checks to see if the port
1492  * has already been associated with an adapter, and returns a reference to
1493  * that adapter.  No action is taken on a port that doesn't have an adapter
1494  * associated.  Otherwise (*vpna is non-NULL), this routine associates that
1495  * adapter with a port that's not already associated with one; the reference
1496  * to the adapter is untouched here, as the caller is expected to handle it.
1497  *
1498  * The flowswitch code invokes this routine each time it is requested to
1499  * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
1500  * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1501  * This allows for multiple channels to be opened to a nexus port, each
1502  * time holding a reference to that same nexus adapter.  The releasing of
1503  * the nexus port only happens when the last channel closes.
1504  */
/*
 * Associate a nexus port with a vp adapter (see the block comment
 * above for the full allocation contract).  On successful allocation
 * of a fresh adapter, initializes the adapter's port bookkeeping and
 * runs fsw_port_ctor(); a constructor failure releases the port
 * again.  Returns 0 or an errno from nx_port_alloc()/fsw_port_ctor().
 */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	/*
	 * NOTE(review): refonly is never set to TRUE in this function;
	 * the !refonly test below is always true and the variable only
	 * feeds the debug logs.  Looks vestigial — confirm upstream.
	 */
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			/* constructor failed: undo the port association */
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw 0x%llx nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}
1552 
1553 int
fsw_port_alloc(struct nx_flowswitch * fsw,struct nxbind * nxb,struct nexus_vp_adapter ** vpna,nexus_port_t nx_port,struct proc * p,boolean_t ifattach,boolean_t host)1554 fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1555     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1556     boolean_t ifattach, boolean_t host)
1557 {
1558 	int err = 0;
1559 
1560 	FSW_WLOCK_ASSERT_HELD(fsw);
1561 
1562 	if (ifattach) {
1563 		/* override port to either NX_FSW_{HOST,DEV} */
1564 		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1565 		/* allocate reserved port for ifattach */
1566 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1567 	} else if (host) {
1568 		/* host is valid only for ifattach */
1569 		err = EINVAL;
1570 	} else {
1571 		/* nexus port otherwise (reserve dev and host for ifattach) */
1572 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1573 	}
1574 
1575 	return err;
1576 }
1577 
1578 /*
1579  * Remove nexus port association from a nexus adapter.  This call is
1580  * the opposite of fsw_port_alloc(), except that it is called only
1581  * at nx_fsw_vp_na_dtor() destructor time.  See above notes
1582  * on fsw_port_alloc().
1583  */
/*
 * Remove nexus port association from a nexus adapter.  This call is
 * the opposite of fsw_port_alloc(), except that it is called only
 * at nx_fsw_vp_na_dtor() destructor time.  See above notes
 * on fsw_port_alloc().
 *
 * With @defunct set, the port's flows/owner are purged but the port
 * itself stays occupied (the channel is still open); the final
 * release happens on the later non-defunct call.
 */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	/* purge flows/owner; true means the owner state was destroyed */
	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	/* fully release the port and reset the adapter's bookkeeping */
	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
}
1627 
/*
 * Adapter activation callback for a flowswitch vp port.
 *
 * Validates @mode, then — for user ports only — propagates the
 * activation state change to the port's flow owners (e.g. flow
 * advisory resources) via flow_owner_activate_nexus_port().
 * Special ports (dev/host) need no further work.  Always returns 0.
 * Called with the SK lock held.
 */
int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	/* all three modes are accepted; anything else is a bug */
	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}
1677 
1678 int
fsw_port_na_defunct(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna)1679 fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1680 {
1681 	int err = 0;
1682 
1683 	SK_LOCK_ASSERT_HELD();
1684 	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1685 
1686 	/*
1687 	 * During defunct, we want to purge all flows associated to this
1688 	 * port and the flow owner as well.  This is accomplished as part
1689 	 * of calling the port's destructor.  However, we still want to
1690 	 * occupy the nexus port since there's a channel open to it.
1691 	 */
1692 	FSW_WLOCK(fsw);
1693 	if (!vpna->vpna_defunct) {
1694 		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
1695 	} else {
1696 		err = EALREADY;
1697 	}
1698 	FSW_WUNLOCK(fsw);
1699 
1700 	return err;
1701 }
1702 
/*
 * MIB export of flow statistics (struct sk_stats_flow records).
 *
 * Three lookup modes, selected by @filter:
 *  - NXMIB_FILTER_FLOW_ID: single flow by UUID;
 *  - NXMIB_FILTER_INFO_TUPLE: single flow by 5-tuple;
 *  - otherwise: dump all flows, including lingering (deferred-free)
 *    entries.
 *
 * Follows the usual sysctl sizing convention: records are copied to
 * @out only while they fit within @len, but the return value is the
 * total space required, so callers can size a buffer by passing
 * out == NULL.  Called with the flowswitch lock held.
 */
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		/* single flow addressed by UUID */
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		/* single flow addressed by (proto, laddr, raddr, ports) */
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sa.sa_family == AF_INET &&
		    itpl->itpl_remote_sa.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sa.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sa.sa_family,
			    itpl->itpl_remote_sa.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	/* no filter: walk every active flow entry */
	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
		        return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

	/*
	 * Also return the ones in deferred free list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
		actual_space += sf_size;
		if (out == NULL || actual_space > len) {
			continue;
		}

		flow_entry_stats_get(fe, sf);
		sf++;
	}
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return actual_space;
}
1794 
/*
 * MIB export of flow advisory tables: one struct sk_stats_flow_adv
 * per open channel on the nexus, each followed inline by the
 * channel's allocated struct sk_stats_flow_adv_ent entries.
 *
 * Follows the sysctl sizing convention (copy while it fits in @len,
 * always return the total space required).  Called with the SK lock
 * held; each channel's arena is locked while its table is walked.
 */
static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		if (arn->arn_flowadv_obj == NULL) {
			/* no flowadv table; only legal when defunct */
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strlcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		sfae = &sfa->sfa_entries[0];
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			/* a null UUID marks an unallocated slot */
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		/* advance past this channel's header + entries */
		sfa = (struct sk_stats_flow_adv *)
		    ((uintptr_t)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}
1870 
1871 static inline void
fsw_fo2sfo(struct nx_flowswitch * fsw,struct flow_owner * fo,struct sk_stats_flow_owner * sfo)1872 fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1873     struct sk_stats_flow_owner *sfo)
1874 {
1875 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1876 
1877 	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
1878 	(void) strlcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name,
1879 	    IFNAMSIZ);
1880 	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1881 
1882 	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
1883 	    fo->fo_name);
1884 	sfo->sfo_pid = fo->fo_pid;
1885 	sfo->sfo_nx_port = fo->fo_nx_port;
1886 	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1887 	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1888 }
1889 
1890 static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len)1891 fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
1892     struct nexus_mib_filter *filter, void *out, size_t len)
1893 {
1894 #pragma unused(filter)
1895 	uint32_t i;
1896 	size_t actual_space = 0;
1897 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1898 	struct sk_stats_flow_owner *sfo = out;
1899 	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
1900 	struct flow_owner *fo;
1901 
1902 	FSW_LOCK_ASSERT_HELD(fsw);
1903 
1904 	/*
1905 	 * Ideally we'd like to hide the bucket level details from flow library
1906 	 * user, but there is no simple way to iterate flow_owner with
1907 	 * buckets/RB_TREE nested. So keep it as is.
1908 	 */
1909 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
1910 		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
1911 		FOB_LOCK(fob);
1912 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1913 			actual_space += sfo_size;
1914 			if (out == NULL || actual_space > len) {
1915 				continue;
1916 			}
1917 
1918 			fsw_fo2sfo(fsw, fo, sfo);
1919 			sfo++;
1920 		}
1921 		FOB_UNLOCK(fob);
1922 	}
1923 
1924 	return actual_space;
1925 }
1926 
/*
 * Fill one struct sk_stats_flow_route record (@sfr) from a flow
 * route (@fr) on flowswitch @fsw.  When @ll_scrub is set, the
 * destination MAC is replaced with a fixed placeholder so link-layer
 * addresses are not exposed to unprivileged readers.
 */
static inline void
fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
{
	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);

	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;

	/* translate internal FLOWRTF_* flags to exported SFLOWRTF_* */
	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
	}
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		sfr->sfr_flags |= SFLOWRTF_ONLINK;
	}
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
	}
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
	}
	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
	}
	if (fr->fr_flags & FLOWRTF_DELETED) {
		sfr->sfr_flags |= SFLOWRTF_DELETED;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
	}

	/* snapshot refcount and expiry under the route's spinlock */
	lck_spin_lock(&fr->fr_reflock);
	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
	/* report only references beyond the baseline minimum */
	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
	if (fr->fr_expire != 0) {
		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
	} else {
		sfr->sfr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);

	sfr->sfr_laddr = fr->fr_laddr;
	sfr->sfr_faddr = fr->fr_faddr;
	sfr->sfr_gaddr = fr->fr_gaddr;

	if (ll_scrub) {
		/* placeholder MAC: first octet 2 (locally administered) */
		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
	} else {
		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
		    ETHER_ADDR_LEN);
	}
}
1986 
1987 #if CONFIG_MACF
1988 extern int dlil_lladdr_ckreq;
1989 #endif /* CONFIG_MACF */
1990 
1991 static size_t
fsw_mib_get_flow_route(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)1992 fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
1993     struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
1994 {
1995 #pragma unused(filter)
1996 	uint32_t i;
1997 	size_t actual_space = 0;
1998 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1999 	struct sk_stats_flow_route *sfr = out;
2000 	size_t sfo_size = sizeof(struct sk_stats_flow_route);
2001 	struct flow_route *fr;
2002 	boolean_t ll_scrub;
2003 
2004 	FSW_LOCK_ASSERT_HELD(fsw);
2005 
2006 	/*
2007 	 * To get the link-layer info, the caller must have the following
2008 	 * in their sandbox profile (or not be sandboxed at all), else we
2009 	 * scrub it clean just like dlil_ifaddr_bytes() does:
2010 	 *
2011 	 * (allow system-info (info-type "net.link.addr"))
2012 	 *
2013 	 * If scrubbed, we return 02:00:00:00:00:00.
2014 	 */
2015 #if CONFIG_MACF
2016 	ll_scrub = (dlil_lladdr_ckreq &&
2017 	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
2018 #else /* !CONFIG_MACF */
2019 	ll_scrub = FALSE;
2020 #endif /* !CONFIG_MACF */
2021 
2022 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
2023 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
2024 		FRB_RLOCK(frb);
2025 		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
2026 			actual_space += sfo_size;
2027 			if (out == NULL || actual_space > len) {
2028 				continue;
2029 			}
2030 
2031 			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
2032 			sfr++;
2033 		}
2034 		FRB_UNLOCK(frb);
2035 	}
2036 
2037 	return actual_space;
2038 }
2039 
2040 static inline void
fsw_nxs2nus(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,pid_t pid,struct __nx_stats_fsw * nxs,struct sk_stats_userstack * sus)2041 fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2042     pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
2043 {
2044 	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
2045 	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
2046 	    IFNAMSIZ);
2047 	sus->sus_owner_pid = pid;
2048 
2049 	if (filter->nmf_type & NXMIB_IP_STATS) {
2050 		sus->sus_ip  = nxs->nxs_ipstat;
2051 	}
2052 
2053 	if (filter->nmf_type & NXMIB_IP6_STATS) {
2054 		sus->sus_ip6 = nxs->nxs_ip6stat;
2055 	}
2056 
2057 	if (filter->nmf_type & NXMIB_TCP_STATS) {
2058 		sus->sus_tcp = nxs->nxs_tcpstat;
2059 	}
2060 
2061 	if (filter->nmf_type & NXMIB_UDP_STATS) {
2062 		sus->sus_udp = nxs->nxs_udpstat;
2063 	}
2064 
2065 	if (filter->nmf_type & NXMIB_QUIC_STATS) {
2066 		sus->sus_quic = nxs->nxs_quicstat;
2067 	}
2068 }
2069 
/*
 * Export per-process user networking stack statistics
 * (struct sk_stats_userstack) for this flowswitch: one record for the
 * accumulated stats of closed ports (reported under pid 0), plus one
 * record per open channel.  Returns the total number of bytes needed;
 * "out" (when non-NULL) is filled only up to "len".
 */
static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/*
	 * copyout saved stats from closed ports; include them when no
	 * PID filter is set, or when the filter explicitly asks for pid 0.
	 */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a proc only opens one channel to nexus so we don't do
	 * per proc aggregation of inet stats now as this needs lots of code
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		/* honor the PID filter, if one was supplied */
		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			/* no stats object; only expected for defunct arenas */
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		/* keep accumulating required size even if the buffer is full */
		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}
2138 
2139 static size_t
fsw_mib_get_stats(struct nx_flowswitch * fsw,void * out,size_t len)2140 fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
2141 {
2142 	struct sk_stats_flow_switch *sfs = out;
2143 	size_t actual_space = sizeof(struct sk_stats_flow_switch);
2144 
2145 	if (out != NULL && actual_space <= len) {
2146 		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
2147 		(void) strlcpy(sfs->sfs_if_name,
2148 		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
2149 		sfs->sfs_fsws = fsw->fsw_stats;
2150 	}
2151 
2152 	return actual_space;
2153 }
2154 
2155 size_t
fsw_mib_get(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)2156 fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2157     void *out, size_t len, struct proc *p)
2158 {
2159 	size_t ret;
2160 
2161 	switch (filter->nmf_type) {
2162 	case NXMIB_FSW_STATS:
2163 		ret = fsw_mib_get_stats(fsw, out, len);
2164 		break;
2165 	case NXMIB_FLOW:
2166 		ret = fsw_mib_get_flow(fsw, filter, out, len);
2167 		break;
2168 	case NXMIB_FLOW_OWNER:
2169 		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
2170 		break;
2171 	case NXMIB_FLOW_ROUTE:
2172 		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
2173 		break;
2174 	case NXMIB_TCP_STATS:
2175 	case NXMIB_UDP_STATS:
2176 	case NXMIB_IP_STATS:
2177 	case NXMIB_IP6_STATS:
2178 	case NXMIB_USERSTACK_STATS:
2179 		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
2180 		break;
2181 	case NXMIB_FLOW_ADV:
2182 		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
2183 		break;
2184 	default:
2185 		ret = 0;
2186 		break;
2187 	}
2188 
2189 	return ret;
2190 }
2191 
2192 void
fsw_fold_stats(struct nx_flowswitch * fsw,void * data,nexus_stats_type_t type)2193 fsw_fold_stats(struct nx_flowswitch *fsw,
2194     void *data, nexus_stats_type_t type)
2195 {
2196 	ASSERT(data != NULL);
2197 	FSW_LOCK_ASSERT_HELD(fsw);
2198 
2199 	switch (type) {
2200 	case NEXUS_STATS_TYPE_FSW:
2201 	{
2202 		struct __nx_stats_fsw *d, *s;
2203 		d = fsw->fsw_closed_na_stats;
2204 		s = data;
2205 		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
2206 		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
2207 		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
2208 		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
2209 		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
2210 		break;
2211 	}
2212 	case NEXUS_STATS_TYPE_CHAN_ERRORS:
2213 	{
2214 		struct __nx_stats_channel_errors *s = data;
2215 		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
2216 		break;
2217 	}
2218 	default:
2219 		VERIFY(0);
2220 		/* NOTREACHED */
2221 		__builtin_unreachable();
2222 	}
2223 }
2224 
2225 boolean_t
fsw_detach_barrier_add(struct nx_flowswitch * fsw)2226 fsw_detach_barrier_add(struct nx_flowswitch *fsw)
2227 {
2228 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2229 	if (__improbable(fsw->fsw_detach_flags != 0 ||
2230 	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
2231 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2232 		return FALSE;
2233 	}
2234 	fsw->fsw_detach_barriers++;
2235 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2236 
2237 	return TRUE;
2238 }
2239 
2240 void
fsw_detach_barrier_remove(struct nx_flowswitch * fsw)2241 fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
2242 {
2243 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2244 	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
2245 	ASSERT(fsw->fsw_detach_barriers != 0);
2246 	fsw->fsw_detach_barriers--;
2247 	/* if there's a thread waiting to detach the interface, let it know */
2248 	if (__improbable((fsw->fsw_detach_waiters > 0) &&
2249 	    (fsw->fsw_detach_barriers == 0))) {
2250 		fsw->fsw_detach_waiters = 0;
2251 		wakeup(&fsw->fsw_detach_waiters);
2252 	}
2253 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2254 }
2255 
/*
 * Generic resolver for non-Ethernet interfaces.  There is no
 * link-layer address resolution to perform; this only validates (and,
 * if needed, reconfigures) the flow route, then marks it resolved.
 * Returns 0 on success or an errno on failure.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flow_route_configure() may have replaced the routes */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* a route that is neither on-link nor via a gateway is unusable */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		/* clear the resolved bit and tear down route state */
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}
2337 
/*
 * Populate fsw_use_dual_sized_pool from the "fsw_use_dual_sized_pool"
 * boot-arg, if one was supplied; otherwise the variable keeps its
 * compiled-in default.
 */
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}
2344 
2345 void
fsw_init(void)2346 fsw_init(void)
2347 {
2348 	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
2349 	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);
2350 
2351 	if (!__nx_fsw_inited) {
2352 		fsw_read_boot_args();
2353 		/*
2354 		 * Register callbacks for interface & protocol events
2355 		 * Use dummy arg for callback cookie.
2356 		 */
2357 		__nx_fsw_ifnet_eventhandler_tag =
2358 		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
2359 		    ifnet_event, fsw_ifnet_event_callback,
2360 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2361 		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);
2362 
2363 		__nx_fsw_protoctl_eventhandler_tag =
2364 		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
2365 		    protoctl_event, fsw_protoctl_event_callback,
2366 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2367 		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
2368 		__nx_fsw_inited = 1;
2369 	}
2370 }
2371 
2372 void
fsw_uninit(void)2373 fsw_uninit(void)
2374 {
2375 	if (__nx_fsw_inited) {
2376 		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
2377 		    __nx_fsw_ifnet_eventhandler_tag);
2378 		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
2379 		    __nx_fsw_protoctl_eventhandler_tag);
2380 
2381 		__nx_fsw_inited = 0;
2382 	}
2383 }
2384 
2385 struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)2386 fsw_alloc(zalloc_flags_t how)
2387 {
2388 	struct nx_flowswitch *fsw;
2389 	struct __nx_stats_fsw *nsfw;
2390 
2391 	SK_LOCK_ASSERT_HELD();
2392 
2393 	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
2394 	if (nsfw == NULL) {
2395 		return NULL;
2396 	}
2397 
2398 	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
2399 	if (fsw == NULL) {
2400 		zfree(nx_fsw_stats_zone, nsfw);
2401 		return NULL;
2402 	}
2403 
2404 	FSW_RWINIT(fsw);
2405 	fsw->fsw_dev_ch = NULL;
2406 	fsw->fsw_host_ch = NULL;
2407 	fsw->fsw_closed_na_stats = nsfw;
2408 
2409 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
2410 
2411 	return fsw;
2412 }
2413 
/*
 * Detach the flowswitch from its interface/hwna.  When "purge" is TRUE
 * (flowswitch free path) this blocks until any in-progress detach
 * drains and additionally terminates the flow manager; when FALSE it
 * returns EBUSY if a detach is already in progress.  Called with
 * sk_lock held; the lock is temporarily dropped while waiting for
 * barrier holders to exit.
 */
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * return error if the host port detach is in progress
	 * or already detached.
	 * For the case of flowswitch free (i.e. purge is TRUE) we have to
	 * cleanup everything, so we will block if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit. Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * wait until all threads needing accesses to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * if the NA detach thread as well as the flowswitch free thread were
	 * both waiting, then the thread which wins the race is responsible
	 * for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		/* winner of the race: tear down the interface binding */
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release lock to let all
	 * outstanding agent threads to enter, followed by waiting until
	 * all of them exit the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2498 
/*
 * Free a flowswitch instance allocated by fsw_alloc().  Performs a
 * purging detach (blocking until concurrent detach activity drains),
 * destroys the data path, then releases the stats object and the
 * flowswitch itself back to their zones.  Called with sk_lock held.
 */
void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	/* purge == TRUE: must not fail */
	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}
2522