xref: /xnu-11215.41.3/bsd/skywalk/nexus/flowswitch/fsw.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
56 #include <skywalk/nexus/flowswitch/fsw_var.h>
57 #include <skywalk/nexus/netif/nx_netif.h>
58 #include <skywalk/nexus/netif/nx_netif_compat.h>
59 
60 #include <net/bpf.h>
61 #include <net/if.h>
62 #include <net/pktsched/pktsched_netem.h>
63 #include <sys/eventhandler.h>
64 
#if (DEVELOPMENT || DEBUG)
/* Runtime knob (internal builds only) for toggling chained classq enqueue */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/*
 * Configures the flowswitch to utilize user packet pool with
 * dual sized buffers.
 * A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX) || defined(XNU_TARGET_OS_XR)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

/* non-zero: enqueue packet chains to the classq in a single call */
uint32_t fsw_chain_enqueue = 1;
/* one-time module initialization flag (see domain attach) */
static int __nx_fsw_inited = 0;
/* registered event handlers; non-NULL once the module is initialized */
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

/* zone for struct nx_flowswitch allocations */
static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);

/* zone for per-flowswitch statistics blocks */
static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);

/* allocation tags for the various flowswitch sub-allocations */
#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))

static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);
111 
/*
 * Attach a channel to a flowswitch nexus port.  If an adapter already
 * exists on the requested port it is reused; otherwise a new virtual-port
 * adapter is created and bound to the port it was assigned.
 *
 * On success (*vpna) holds a referenced adapter whose na_private points
 * back to the owning channel.  On failure returns non-zero errno with
 * (*vpna) reset to NULL.  Caller must hold SK_LOCK.
 */
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	/* -fbounds-safety: cr_name should be null terminated (via snprintf) */
	SK_LOG_VAR(const char *__null_terminated cr_name =
	    __unsafe_forge_null_terminated(const char *, chr->cr_name));
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, p, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	/* bind the newly created adapter to the port it was assigned */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		/* remember the owning channel for this adapter */
		(*vpna)->vpna_up.na_private = ch;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		if (err != 0) {
			/* undo the reference held on our behalf */
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}
172 
173 static int
fsw_nx_check(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)174 fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
175 {
176 #pragma unused(fsw)
177 	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
178 
179 	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
180 		return EINVAL;
181 	}
182 
183 	/* it's a netif below */
184 	return 0;
185 }
186 
/*
 * Process NXCFG_CMD_FLOW_ADD: validate the request, stamp it with the
 * caller's pid (and effective pid if unset), then hand it to
 * fsw_flow_add() to instantiate the flow.
 *
 * User requests are internalized before use and externalized before
 * returning; kernel (kernproc) requests are used as-is.  Returns 0 on
 * success, EINVAL for malformed requests, else fsw_flow_add()'s error.
 */
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special port shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		/* user flows always get tracking and flow-advisory */
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	if (req->nfr_epid == -1) {
		/* default the effective pid to the caller's pid */
		req->nfr_epid = proc_pid(p);
	}

	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		// user space don't need this flow stats
		flow_stats_release(req->nfr_flow_stats);
	}
	if (p != kernproc) {
		/* convert back to the user-visible representation */
		nx_flow_req_externalize(req);
	}

	return error;
}
236 
237 static int
fsw_ctl_flow_del(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)238 fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
239     struct nx_flow_req *req)
240 {
241 	int err;
242 
243 	nx_flow_req_internalize(req);
244 	req->nfr_pid = proc_pid(p);
245 	err = fsw_flow_del(fsw, req, TRUE, NULL);
246 
247 	nx_flow_req_externalize(req);
248 	return err;
249 }
250 
251 static int
fsw_ctl_flow_config(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)252 fsw_ctl_flow_config(struct nx_flowswitch *fsw, struct proc *p,
253     struct nx_flow_req *req)
254 {
255 	int err;
256 
257 	nx_flow_req_internalize(req);
258 	req->nfr_pid = proc_pid(p);
259 	err = fsw_flow_config(fsw, req);
260 
261 	nx_flow_req_externalize(req);
262 	return err;
263 }
264 
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for the per-interface "rps_nthreads" knob: reads the
 * current receive-side-scaling thread count and, on write, applies the
 * new value via fsw_rps_set_nthreads().
 */
static int
fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct nx_flowswitch *__single fsw = arg1;
	uint32_t nthreads;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
	if (error == 0 && changed != 0) {
		/* a new value was written; resize the RPS thread pool */
		error = fsw_rps_set_nthreads(fsw, nthreads);
	}
	return error;
}
#endif /* DEVELOPMENT || DEBUG */
283 
284 void
fsw_get_tso_capabilities(struct ifnet * ifp,uint32_t * tso_v4_mtu,uint32_t * tso_v6_mtu)285 fsw_get_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu, uint32_t *tso_v6_mtu)
286 {
287 #pragma unused(ifp)
288 	*tso_v4_mtu = 0;
289 	*tso_v6_mtu = 0;
290 
291 	struct nx_flowswitch *fsw;
292 
293 	if (!kernel_is_macos_or_server()) {
294 		return;
295 	}
296 
297 	fsw = fsw_ifp_to_fsw(ifp);
298 	if (fsw == NULL) {
299 		return;
300 	}
301 	switch (fsw->fsw_tso_mode) {
302 	case FSW_TSO_MODE_HW: {
303 		ASSERT(ifp->if_tso_v4_mtu != 0 || ifp->if_tso_v6_mtu != 0);
304 		*tso_v4_mtu = ifp->if_tso_v4_mtu;
305 		*tso_v6_mtu = ifp->if_tso_v6_mtu;
306 		break;
307 	}
308 	case FSW_TSO_MODE_SW: {
309 		ASSERT(fsw->fsw_tso_sw_mtu != 0);
310 		*tso_v4_mtu = fsw->fsw_tso_sw_mtu;
311 		*tso_v6_mtu = fsw->fsw_tso_sw_mtu;
312 		break;
313 	}
314 	default:
315 		break;
316 	}
317 }
318 
/*
 * Determine the TSO mode for this flowswitch at interface attach time:
 * FSW_TSO_MODE_HW when the driver advertises hardware TSO, FSW_TSO_MODE_SW
 * when software GSO is enabled and the large buffer can hold a GSO burst,
 * FSW_TSO_MODE_NONE otherwise.
 */
static void
fsw_tso_setup(struct nx_flowswitch *fsw)
{
	if (!kernel_is_macos_or_server()) {
		return;
	}

	fsw->fsw_tso_mode = FSW_TSO_MODE_NONE;
	struct ifnet *ifp = fsw->fsw_ifp;
	/* TSO requires a skywalk-native driver */
	if (!SKYWALK_CAPABLE(ifp) || !SKYWALK_NATIVE(ifp)) {
		DTRACE_SKYWALK2(tso__no__support, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	uint32_t large_buf_size = NX_PROV_PARAMS(fsw->fsw_nx)->nxp_large_buf_size;

	/* without a large buffer there is nowhere to stage a TSO chain */
	if (large_buf_size == 0) {
		DTRACE_SKYWALK2(no__large__buf, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	/*
	 * Unlike _dlil_adjust_large_buf_size_for_tso(), we check the nif_hwassist
	 * flags here for the original flags because nx_netif_host_adjust_if_capabilities()
	 * has already been called.
	 */
	if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0 && ifp->if_tso_v4_mtu != 0) ||
	    ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0 && ifp->if_tso_v6_mtu != 0)) {
		ASSERT(large_buf_size <= ifp->if_tso_v4_mtu ||
		    large_buf_size <= ifp->if_tso_v6_mtu);
		fsw->fsw_tso_mode = FSW_TSO_MODE_HW;
	} else {
		/* fall back to software GSO when the tunable permits it */
		if (sk_fsw_gso_mtu != 0 && large_buf_size >= sk_fsw_gso_mtu) {
			fsw->fsw_tso_mode = FSW_TSO_MODE_SW;
			fsw->fsw_tso_sw_mtu = sk_fsw_gso_mtu;
		}
	}
	DTRACE_SKYWALK3(tso__mode, struct nx_flowswitch *, fsw,
	    fsw_tso_mode_t, fsw->fsw_tso_mode, uint32_t, large_buf_size);
}
360 
/*
 * Bind the flowswitch to the ifnet underneath the netif host adapter:
 * set up the fragment manager, family-specific framing/demux callbacks,
 * packet-copy routines, classq, TSO mode, reaper thread name, and the
 * netagent.  Called with SK_LOCK held from fsw_host_setup().
 *
 * Returns 0 on success; ENOTSUP for non-TXSTART interfaces or unknown
 * families, ENOMEM if the fragment manager cannot be created.
 */
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	/* cap IP fragments at half of the RX packet-metadata objects */
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non TXSTART interface.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	/* a flowswitch can only ever be bound to one interface */
	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	/* install family-specific framing/resolve callbacks and BPF DLT */
	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	/* pick multi-buflet copy routines when packets may span buflets */
	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;
	fsw_tso_setup(fsw);

	/* rename the (already running) reaper thread after the interface */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread,
	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		os_atomic_andnot(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		/* expose the RPS thread-count knob on native interfaces */
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* DEVELOPMENT || DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}
516 
/*
 * Undo fsw_setup_ifp(): unregister the netagent, destroy the fragment
 * manager and skoid, tear down the classq, mark the nexus rejecting,
 * clear all cached interface state, and rename the reaper thread.
 * Called with SK_LOCK and the flowswitch write lock held.
 */
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;
	const char *__null_terminated reap_name = NULL;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
	    if_name(ifp), ifp->if_family);

	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	os_atomic_or(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		/* make the cleared pointers globally visible */
		os_atomic_thread_fence(seq_cst);
	}

	/* clear everything fsw_setup_ifp() installed */
	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	/* drop any netem instance hooked to the input path */
	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	/* rename the reaper thread to reflect the detached state */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	reap_name = tsnprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, reap_name);
}
581 
/*
 * Complete flowswitch attach on the host side: validate the underlying
 * netif/ifnet, clear any pending detach state, bind the interface via
 * fsw_setup_ifp(), and publish the interface index in the provider
 * parameters.  Called with SK_LOCK held from fsw_ctl_attach().
 */
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);


	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	/* refuse to attach while a detach is still in flight */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}
629 
630 static int
fsw_host_teardown(struct nx_flowswitch * fsw)631 fsw_host_teardown(struct nx_flowswitch *fsw)
632 {
633 	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
634 
635 	SK_LOCK_ASSERT_HELD();
636 	return fsw_detach(fsw, hwna, FALSE);
637 }
638 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
fsw_ctl_attach_log(const struct nx_spec_req *nsr,
    const struct kern_nexus *nx, int err)
{
	uuid_string_t uuidstr, ifuuidstr;
	const char *__null_terminated nustr = NULL;

	/* describe the attach target: UUID, ifnet pointer, or name */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		/*
		 * -fbounds-safety: We know the output of sk_uuid_unparse is
		 * null-terminated.
		 */
		nustr = __unsafe_forge_null_terminated(const char *,
		    sk_uuid_unparse(nsr->nsr_uuid, uuidstr));
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		nustr = tsnprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
	} else {
		nustr = __unsafe_null_terminated_from_indexable(nsr->nsr_name);
	}

	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
}
#endif /* SK_LOG */
669 
670 SK_NO_INLINE_ATTRIBUTE
671 static void
fsw_netif_set_callbacks_common(struct nx_flowswitch * fsw,boolean_t set)672 fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
673 {
674 	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
675 
676 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
677 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
678 
679 	if (set) {
680 		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
681 	} else {
682 		netif_hwna_clear_mode(hwna);
683 	}
684 }
685 
686 SK_NO_INLINE_ATTRIBUTE
687 static void
fsw_netif_set_callbacks(struct nx_flowswitch * fsw)688 fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
689 {
690 	fsw_netif_set_callbacks_common(fsw, TRUE);
691 }
692 
693 SK_NO_INLINE_ATTRIBUTE
694 static void
fsw_netif_clear_callbacks(struct nx_flowswitch * fsw)695 fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
696 {
697 	fsw_netif_set_callbacks_common(fsw, FALSE);
698 }
699 
700 SK_NO_INLINE_ATTRIBUTE
701 static void
fsw_dp_start(struct nx_flowswitch * fsw)702 fsw_dp_start(struct nx_flowswitch *fsw)
703 {
704 	ASSERT(fsw->fsw_dev_ch != NULL);
705 	ASSERT(fsw->fsw_host_ch != NULL);
706 
707 	fsw_netif_set_callbacks(fsw);
708 	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
709 	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
710 }
711 
/*
 * Quiesce the datapath: mark the flowswitch quiesced, drain in-flight
 * data movement if needed (dropping SK_LOCK while draining), stop both
 * special channels and unhook the RX callbacks.
 *
 * Returns 0 on success with *ifpp set to the interface whose datamov
 * was suspended (caller must resume/release it), or EALREADY if the
 * datapath was already quiesced.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		/* drop SK_LOCK while waiting for the drain to complete */
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	/* stop in reverse order of fsw_dp_start() */
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}
749 
750 SK_NO_INLINE_ATTRIBUTE
751 static int
fsw_netif_port_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx,boolean_t host)752 fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
753     boolean_t host)
754 {
755 	struct chreq chr;
756 	struct kern_channel *ch;
757 	int err;
758 
759 	bzero(&chr, sizeof(chr));
760 	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
761 	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
762 	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
763 	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);
764 
765 	err = 0;
766 	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
767 	if (ch == NULL) {
768 		SK_ERR("ch_open_special(%s) failed: %d",
769 		    host ? "host" : "dev", err);
770 		return err;
771 	}
772 	if (host) {
773 		fsw->fsw_host_ch = ch;
774 	} else {
775 		fsw->fsw_dev_ch = ch;
776 	}
777 	return 0;
778 }
779 
780 SK_NO_INLINE_ATTRIBUTE
781 static int
fsw_netif_port_teardown(struct nx_flowswitch * fsw,boolean_t host)782 fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
783 {
784 	struct kern_channel *ch;
785 
786 	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
787 	if (ch == NULL) {
788 		return EINVAL;
789 	}
790 	if (host) {
791 		fsw->fsw_host_ch = NULL;
792 	} else {
793 		fsw->fsw_dev_ch = NULL;
794 	}
795 	ch_close_special(ch);
796 	(void) ch_release_locked(ch);
797 	return 0;
798 }
799 
800 SK_NO_INLINE_ATTRIBUTE
801 static int
fsw_devna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)802 fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
803 {
804 	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
805 }
806 
807 SK_NO_INLINE_ATTRIBUTE
808 static int
fsw_hostna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)809 fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
810 {
811 	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
812 }
813 
814 SK_NO_INLINE_ATTRIBUTE
815 static int
fsw_devna_teardown(struct nx_flowswitch * fsw)816 fsw_devna_teardown(struct nx_flowswitch *fsw)
817 {
818 	return fsw_netif_port_teardown(fsw, FALSE);
819 }
820 
821 SK_NO_INLINE_ATTRIBUTE
822 static int
fsw_hostna_teardown(struct nx_flowswitch * fsw)823 fsw_hostna_teardown(struct nx_flowswitch *fsw)
824 {
825 	return fsw_netif_port_teardown(fsw, TRUE);
826 }
827 
/*
 * Process NXCFG_CMD_ATTACH: attach a netif nexus (identified by UUID in
 * nsr->nsr_uuid) underneath this flowswitch.  Sets up the dev and host
 * channels, binds the interface, starts the datapath, and returns the
 * dev adapter's UUID in nsr->nsr_if_uuid.  Each setup stage is rolled
 * back if a later stage fails.  Called with SK_LOCK held.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	/* already attached to a netif? */
	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	/* nx_find() retains hw_nx; released at done */
	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		/* can't attach a flowswitch to itself */
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}
908 
909 SK_NO_INLINE_ATTRIBUTE
910 static void
fsw_cleanup(struct nx_flowswitch * fsw)911 fsw_cleanup(struct nx_flowswitch *fsw)
912 {
913 	int err;
914 	struct ifnet *__single ifp = NULL;
915 
916 	if (fsw->fsw_dev_ch == NULL) {
917 		ASSERT(fsw->fsw_host_ch == NULL);
918 		return;
919 	}
920 	err = fsw_dp_stop(fsw, &ifp);
921 	if (err != 0) {
922 		return;
923 	}
924 	err = fsw_host_teardown(fsw);
925 	VERIFY(err == 0);
926 
927 	err = fsw_hostna_teardown(fsw);
928 	VERIFY(err == 0);
929 
930 	err = fsw_devna_teardown(fsw);
931 	VERIFY(err == 0);
932 
933 	if (ifp != NULL) {
934 		ifnet_datamov_resume(ifp);
935 	}
936 }
937 
/*
 * Process NXCFG_CMD_DETACH: detach the netif identified by
 * nsr->nsr_if_uuid from this flowswitch; a NULL nsr (destructor path)
 * detaches unconditionally.  Returns 0 on success, EINVAL for a null
 * UUID, ENXIO when nothing is attached, ESRCH on UUID mismatch.
 * Called with SK_LOCK held.
 */
int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
993 
994 static int
fsw_netem_config(struct nx_flowswitch * fsw,void * data)995 fsw_netem_config(struct nx_flowswitch *fsw, void *data)
996 {
997 	struct ifnet *ifp = fsw->fsw_ifp;
998 	struct if_netem_params *__single params = data;
999 	int ret;
1000 	const char *__null_terminated name = NULL;
1001 
1002 	if (ifp == NULL) {
1003 		return ENODEV;
1004 	}
1005 
1006 	SK_LOCK_ASSERT_HELD();
1007 #define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
1008 #define fsw_INPUT_NETEM_THREADNAME_LEN       32
1009 	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
1010 	name = tsnprintf(netem_name, sizeof(netem_name),
1011 	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
1012 	ret = netem_config(&ifp->if_input_netem, name, ifp, params, fsw,
1013 	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);
1014 
1015 	return ret;
1016 }
1017 
/*
 * Top-level configuration dispatcher for the flowswitch nexus.
 * Performs up-front validation and privilege checks for flow
 * add/delete requests, then dispatches nc_cmd to the appropriate
 * handler.  Returns 0 on success or an errno value.
 */
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	/* data is interpreted per-command; both views alias it */
	struct nx_spec_req *__single nsr = data;
	struct nx_flow_req *__single req = data;
	boolean_t need_check;
	int error = 0;

	/*
	 * First pass: sanitize flow add/delete requests before the
	 * dispatch switch below.
	 */
	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		/* non-kernel callers may only set user-settable flags */
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		/* the delegation check below applies to FLOW_ADD only */
		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		/*
		 * A request on behalf of another effective PID or
		 * executable UUID requires the socket-delegate
		 * privilege.
		 */
		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t __single cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}
		break;

	default:
		break;
	}

	/* second pass: dispatch to the per-command handler */
	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_CONFIG:
		error = fsw_ctl_flow_config(fsw, p, data);
		break;

	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}
1112 
1113 struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet * ifp)1114 fsw_ifp_to_fsw(struct ifnet *ifp)
1115 {
1116 	struct nx_flowswitch *fsw = NULL;
1117 
1118 	if (ifp->if_na != NULL) {
1119 		fsw = ifp->if_na->nifna_netif->nif_fsw;
1120 	}
1121 	return fsw;
1122 }
1123 
/*
 * Eventhandler callback for interface events.  Handles link-layer
 * address changes (refreshing the cached Ethernet source address)
 * and low-power mode transitions (scheduling a reap pass).
 */
static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	evhlog(debug, "%s: eventhandler saw event type=intf_event event_code=%s",
	    __func__, intf_event2str(intf_ev_code));

	/* no netif adapter means no flowswitch; nothing to do */
	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			/* only Ethernet framing caches the source lladdr */
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			/* refresh the cached source MAC and bump its gencnt */
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			os_atomic_inc(&fsw->fsw_src_lla_gencnt, relaxed);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			/* entering low power: kick the reaper thread */
			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}
1178 
/*
 * Eventhandler callback for protocol control events.  Looks up the
 * flow entry matching the event's 5-tuple on the interface's
 * flowswitch and forwards the event to the netagent session managing
 * that flow.  Only TCP and UDP events with a full 5-tuple are handled.
 */
static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *__single fsw = NULL;
	struct flow_entry *__single fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	evhlog(debug, "%s: eventhandler saw event type=protoctl_event event_code=%d",
	    __func__, proto);

	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	/*
	 * Take the detach barrier so fsw stays valid after SK_UNLOCK;
	 * released at the bottom.  fsw is reset to NULL on failure so
	 * the cleanup path skips the barrier removal.
	 */
	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	/* build a 5-tuple flow key from the event parameters */
	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		/*
		 * rdar://107435899 The scope ID for destination address needs
		 * to be cleared out before looking up the flow entry for this
		 * 5-tuple, because addresses in flow entries do not contain the
		 * scope ID.
		 */
		struct in6_addr *in6;

		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
		in6 = &fk.fk_dst6;
		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
			in6->s6_addr16[1] = 0;
		}
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	/* snapshot the flow UUID while under SK_LOCK */
	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure
	 * protocol event received is for bytes in the flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better route events.
	 * Also redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	/*
	 * The netagent call is made after dropping SK_LOCK; the detach
	 * barrier taken above keeps fsw->fsw_agent_session valid here.
	 */
	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}
1298 
/*
 * Add or remove the flowswitch netagent on the interface attached to
 * the flowswitch nexus.  Returns 0 on success; EINVAL for a
 * non-flowswitch nexus, ENXIO when no agent session exists, EEXIST /
 * ENOENT for redundant add/remove, and EBUSY for bridged interfaces.
 */
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	/* held across the state-flag update; released at out: */
	FSW_WLOCK(fsw);

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else if (fsw->fsw_ifp->if_bridge != NULL) {
			/* see rdar://107076453 */
			SK_ERR("%s is bridged, not adding netagent",
			    if_name(fsw->fsw_ifp));
			error = EBUSY;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			/* enablement is governed by a global policy knob */
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				        |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	/* fsw is non-NULL only after FSW_WLOCK was taken above */
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}
1363 
/*
 * Recompute and publish the netagent flags for the flowswitch's
 * interface, based on whether the interface currently needs the IP
 * and/or transport flavors of the flowswitch netagent.
 */
void
fsw_netagent_update(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NULL;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	/* silently ignore non-flowswitch nexus instances */
	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		goto out;
	}
	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	/* held while reading/writing the agent flags; released at out: */
	FSW_WLOCK(fsw);
	if (fsw->fsw_agent_session == NULL) {
		goto out;
	}
	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
	/* start from the currently-published flags and adjust */
	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
	if (ip_agent || transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
	}
	if (transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
	}
	if (ip_agent) {
		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	} else {
		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	}
	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
		SK_D("flowswitch netagent updated for interface %s",
		    if_name(fsw->fsw_ifp));
	}
out:
	/* fsw is non-NULL only after FSW_WLOCK was taken above */
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
}
1411 
1412 static int
fsw_port_ctor(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna,const struct nxbind * nxb)1413 fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1414     const struct nxbind *nxb)
1415 {
1416 #pragma unused(nxb)
1417 	int err = 0;
1418 
1419 	SK_LOCK_ASSERT_HELD();
1420 	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
1421 	    vpna->vpna_pid == nxb->nxb_pid);
1422 
1423 	/*
1424 	 * Reject regular channel open requests unless there is
1425 	 * something attached to the host port of the flowswitch.
1426 	 */
1427 	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
1428 		struct nexus_adapter *na = &vpna->vpna_up;
1429 		struct ifnet *ifp = fsw->fsw_ifp;
1430 
1431 		if (ifp == NULL) {
1432 			err = ENXIO;
1433 			goto done;
1434 		}
1435 
1436 		/* if adapter supports mitigation, set default value */
1437 		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
1438 			if (IFNET_IS_WIFI(ifp)) {
1439 				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
1440 			} else if (IFNET_IS_CELLULAR(ifp)) {
1441 				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
1442 			} else if (IFNET_IS_ETHERNET(ifp)) {
1443 				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
1444 			} else {
1445 				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
1446 			}
1447 		}
1448 	}
1449 
1450 done:
1451 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
1452 	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
1453 	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
1454 	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);
1455 
1456 	return err;
1457 }
1458 
1459 static bool
fsw_port_dtor(struct nx_flowswitch * fsw,const struct nexus_vp_adapter * vpna)1460 fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1461 {
1462 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1463 	nexus_port_t nx_port = vpna->vpna_nx_port;
1464 	uint32_t purge_cnt;
1465 
1466 	ASSERT(fsw == vpna->vpna_fsw);
1467 	ASSERT(nx_port != NEXUS_PORT_ANY);
1468 
1469 	/*
1470 	 * If this nexus port was bound to a PID, we just need to look at a
1471 	 * single bucket and iterate from there.  Note that in any case, we
1472 	 * can't just search for a single flow_owner based on the PID itself,
1473 	 * since a given process may be opening multiple channels to the
1474 	 * flowswitch; hence we search for the ones matching this nexus port.
1475 	 *
1476 	 * Close any open flows on the port and remove the flow owner and
1477 	 * nexus port binding.
1478 	 */
1479 	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1480 	    vpna->vpna_pid, nx_port, FALSE);
1481 
1482 	SK_DF(SK_VERB_FSW,
1483 	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
1484 	    "purged %u", SK_KVA(fsw), (int)nx_port,
1485 	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1486 	    purge_cnt);
1487 
1488 	return purge_cnt != 0;
1489 }
1490 
1491 /*
1492  * Flowswitch nexus port allocator.
1493  *
1494  * A nexus port is represented by a bit in the port bitmap; its state is
1495  * either free or allocated.  A free state implies that the port has no
1496  * nxbind AND no nexus adapter association.  An allocated state means that
1497  * either it has a nxbind OR a nexus adapter assocation.  This routine
1498  * manages the nexus adapter association with a nexus port; nxbind is
1499  * handled separately via nx_fsw_port_bind().
1500  *
1501  * The caller of this routine may optionally pass in a NULL nexus adapter.
1502  * In such a case (*vpna is NULL), this routine checks to see if the port
1503  * has already been associated with an adapter, and returns a reference to
1504  * that adapter.  No action is taken on a port that doesn't have an adapter
1505  * associated.  Otherwise (*vpna is non-NULL), this routine associates that
1506  * adapter with a port that's not already associated with one; the reference
1507  * to the adapter is untouched here, as the caller is expected to handle it.
1508  *
1509  * The flowswitch code invokes this routine each time it is requested to
1510  * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
1511  * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1512  * This allows for multiple channels to be opened to a nexus port, each
1513  * time holding a reference to that same nexus adapter.  The releasing of
1514  * the nexus port only happens when the last channel closes.
1515  */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter *vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	/* NOTE(review): refonly is never set; the !refonly test below
	 * is always true and the variable only feeds the debug log. */
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		/* undo the allocation if the port constructor fails */
		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw 0x%llx nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}
1563 
1564 int
fsw_port_alloc(struct nx_flowswitch * fsw,struct nxbind * nxb,struct nexus_vp_adapter ** vpna,nexus_port_t nx_port,struct proc * p,boolean_t ifattach,boolean_t host)1565 fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1566     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1567     boolean_t ifattach, boolean_t host)
1568 {
1569 	int err = 0;
1570 
1571 	FSW_WLOCK_ASSERT_HELD(fsw);
1572 
1573 	if (ifattach) {
1574 		/* override port to either NX_FSW_{HOST,DEV} */
1575 		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1576 		/* allocate reserved port for ifattach */
1577 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1578 	} else if (host) {
1579 		/* host is valid only for ifattach */
1580 		err = EINVAL;
1581 	} else {
1582 		/* nexus port otherwise (reserve dev and host for ifattach) */
1583 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1584 	}
1585 
1586 	return err;
1587 }
1588 
1589 /*
1590  * Remove nexus port association from a nexus adapter.  This call is
1591  * the opposite of fsw_port_alloc(), except that it is called only
1592  * at nx_fsw_vp_na_dtor() destructor time.  See above notes
1593  * on fsw_port_alloc().
1594  */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	/* mark the port defunct before purging its flows below */
	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	/* purge flows and detach the flow owner for this port */
	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	/* final close: release the port and reset the association */
	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
	vpna->vpna_up.na_private = NULL;
}
1639 
/*
 * Activation callback for a flowswitch port's nexus adapter.  For
 * user ports, propagates the activation mode to any flow owner
 * resources (e.g. flow advisory) bound to the port.  Always returns 0.
 */
int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	/* all three modes share the common path below; reject others */
	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}
1689 
1690 int
fsw_port_na_defunct(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna)1691 fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1692 {
1693 	int err = 0;
1694 
1695 	SK_LOCK_ASSERT_HELD();
1696 	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1697 
1698 	/*
1699 	 * During defunct, we want to purge all flows associated to this
1700 	 * port and the flow owner as well.  This is accomplished as part
1701 	 * of calling the port's destructor.  However, we still want to
1702 	 * occupy the nexus port since there's a channel open to it.
1703 	 */
1704 	FSW_WLOCK(fsw);
1705 	if (!vpna->vpna_defunct) {
1706 		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
1707 	} else {
1708 		err = EALREADY;
1709 	}
1710 	FSW_WUNLOCK(fsw);
1711 
1712 	return err;
1713 }
1714 
/*
 * MIB export of flow statistics.  Three modes, selected by filter:
 * lookup by flow UUID, lookup by 5-tuple, or a full walk of all
 * flows (including lingering ones awaiting deferred free).  Returns
 * the number of bytes required/written; callers may pass out == NULL
 * (or a short len) to size the buffer first.
 */
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *__single fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		/* single flow, by UUID */
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			/* copy out only when the buffer can hold one record */
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		/* single flow, by 5-tuple; both endpoints must agree on AF */
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sah.sa_family == AF_INET &&
		    itpl->itpl_remote_sah.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sah.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sah.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sah.sa_family,
			    itpl->itpl_remote_sah.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	/* no filter: walk every flow; keep counting even past len */
	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
		        return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

	/*
	 * Also return the ones in deferred free list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
		actual_space += sf_size;
		if (out == NULL || actual_space > len) {
			continue;
		}

		flow_entry_stats_get(fe, sf);
		sf++;
	}
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return actual_space;
}
1806 
/*
 * MIB export of the flow advisory tables.  Walks every channel open
 * on the nexus and copies out one sk_stats_flow_adv header per
 * channel followed by its allocated flow advisory entries.  Returns
 * the number of bytes required/written; callers may pass out == NULL
 * (or a short len) to size the buffer first.
 */
static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *__single ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *__single fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		/* a defunct arena has no flowadv table to report */
		if (arn->arn_flowadv_obj == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strbufcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			/* a null UUID marks an unallocated slot; skip it */
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}
				sfae = &sfa->sfa_entries[0];

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		/* advance to where the next channel's record begins */
		sfa = (struct sk_stats_flow_adv *)
		    (void *)((int8_t *)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}
1882 
1883 static inline void
fsw_fo2sfo(struct nx_flowswitch * fsw,struct flow_owner * fo,struct sk_stats_flow_owner * sfo)1884 fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1885     struct sk_stats_flow_owner *sfo)
1886 {
1887 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1888 
1889 	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
1890 	(void) strbufcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name);
1891 	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1892 
1893 	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
1894 	    fo->fo_name);
1895 	sfo->sfo_pid = fo->fo_pid;
1896 	sfo->sfo_nx_port = fo->fo_nx_port;
1897 	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1898 	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1899 }
1900 
/*
 * MIB export of flow owners.  Walks every owner bucket and copies out
 * one sk_stats_flow_owner record per owner.  Returns the number of
 * bytes required/written; callers may pass out == NULL (or a short
 * len) to size the buffer first.
 */
static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_owner *sfo = out;
	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
	struct flow_owner *fo;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * Ideally we'd like to hide the bucket level details from flow library
	 * user, but there is no simple way to iterate flow_owner with
	 * buckets/RB_TREE nested. So keep it as is.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			/* keep counting even once the buffer is full */
			actual_space += sfo_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fo2sfo(fsw, fo, sfo);
			sfo++;
		}
		FOB_UNLOCK(fob);
	}

	return actual_space;
}
1937 
/*
 * Populate an exported sk_stats_flow_route record from the in-kernel
 * flow_route.  If ll_scrub is set, the link-layer destination address
 * is replaced with 02:00:00:00:00:00 (caller lacks the "net.link.addr"
 * entitlement; see fsw_mib_get_flow_route()).
 */
static inline void
fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
{
	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
	(void) strbufcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name);

	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;

	/* translate internal FLOWRTF_* flags to exported SFLOWRTF_* bits */
	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
	}
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		sfr->sfr_flags |= SFLOWRTF_ONLINK;
	}
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
	}
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
	}
	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
	}
	if (fr->fr_flags & FLOWRTF_DELETED) {
		sfr->sfr_flags |= SFLOWRTF_DELETED;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
	}

	/*
	 * fr_usecnt and fr_expire are read under the route's reflock;
	 * report the use count above the FLOW_ROUTE_MINREF baseline,
	 * and expiry as a delta (seconds) from the current uptime.
	 */
	lck_spin_lock(&fr->fr_reflock);
	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
	if (fr->fr_expire != 0) {
		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
	} else {
		sfr->sfr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);

	sfr->sfr_laddr = fr->fr_laddr;
	sfr->sfr_faddr = fr->fr_faddr;
	sfr->sfr_gaddr = fr->fr_gaddr;

	if (ll_scrub) {
		/* scrubbed: locally-administered 02:00:00:00:00:00 */
		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
	} else {
		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
		    ETHER_ADDR_LEN);
	}
}
1996 
1997 #if CONFIG_MACF
1998 extern int dlil_lladdr_ckreq;
1999 #endif /* CONFIG_MACF */
2000 
2001 static size_t
fsw_mib_get_flow_route(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)2002 fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
2003     struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len, struct proc *p)
2004 {
2005 #pragma unused(filter)
2006 	uint32_t i;
2007 	size_t actual_space = 0;
2008 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
2009 	struct sk_stats_flow_route *sfr = out;
2010 	size_t sfo_size = sizeof(struct sk_stats_flow_route);
2011 	struct flow_route *fr;
2012 	boolean_t ll_scrub;
2013 
2014 	FSW_LOCK_ASSERT_HELD(fsw);
2015 
2016 	/*
2017 	 * To get the link-layer info, the caller must have the following
2018 	 * in their sandbox profile (or not be sandboxed at all), else we
2019 	 * scrub it clean just like dlil_ifaddr_bytes() does:
2020 	 *
2021 	 * (allow system-info (info-type "net.link.addr"))
2022 	 *
2023 	 * If scrubbed, we return 02:00:00:00:00:00.
2024 	 */
2025 #if CONFIG_MACF
2026 	ll_scrub = (dlil_lladdr_ckreq &&
2027 	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
2028 #else /* !CONFIG_MACF */
2029 	ll_scrub = FALSE;
2030 #endif /* !CONFIG_MACF */
2031 
2032 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
2033 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
2034 		FRB_RLOCK(frb);
2035 		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
2036 			actual_space += sfo_size;
2037 			if (out == NULL || actual_space > len) {
2038 				continue;
2039 			}
2040 
2041 			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
2042 			sfr++;
2043 		}
2044 		FRB_UNLOCK(frb);
2045 	}
2046 
2047 	return actual_space;
2048 }
2049 
2050 static inline void
fsw_nxs2nus(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,pid_t pid,struct __nx_stats_fsw * nxs,struct sk_stats_userstack * sus)2051 fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2052     pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
2053 {
2054 	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
2055 	(void) strbufcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name);
2056 	sus->sus_owner_pid = pid;
2057 
2058 	if (filter->nmf_type & NXMIB_IP_STATS) {
2059 		sus->sus_ip  = nxs->nxs_ipstat;
2060 	}
2061 
2062 	if (filter->nmf_type & NXMIB_IP6_STATS) {
2063 		sus->sus_ip6 = nxs->nxs_ip6stat;
2064 	}
2065 
2066 	if (filter->nmf_type & NXMIB_TCP_STATS) {
2067 		sus->sus_tcp = nxs->nxs_tcpstat;
2068 	}
2069 
2070 	if (filter->nmf_type & NXMIB_UDP_STATS) {
2071 		sus->sus_udp = nxs->nxs_udpstat;
2072 	}
2073 
2074 	if (filter->nmf_type & NXMIB_QUIC_STATS) {
2075 		sus->sus_quic = nxs->nxs_quicstat;
2076 	}
2077 }
2078 
/*
 * Copy out one sk_stats_userstack record per open channel on this
 * flowswitch, plus one aggregate record (pid 0) for ports that have
 * already closed, subject to the optional pid filter.  Returns the
 * total space required for all matching records; records are written
 * only while the caller-supplied buffer has room, so out == NULL acts
 * as a sizing probe.
 */
static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/*
	 * copyout saved stats from closed ports; reported with pid 0 and
	 * included unless the caller filtered on a specific nonzero pid
	 */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a proc only opens one channel to nexus so we don't do
	 * per proc aggregation of inet stats now as this needs lots of code
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			/* stats object gone; arena must be defunct */
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			/* sizing probe or buffer full; keep counting */
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}
2147 
/*
 * Copy out the flowswitch-wide statistics record.  Follows the MIB
 * sizing convention: the full record size is always returned, and the
 * record is written only when the caller supplied a sufficiently large
 * buffer.
 */
static size_t
fsw_mib_get_stats(struct nx_flowswitch *fsw, void *__sized_by(len)out, size_t len)
{
	struct sk_stats_flow_switch *sfs = out;
	size_t actual_space = sizeof(struct sk_stats_flow_switch);

	if (out != NULL && actual_space <= len) {
		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
		(void) strbufcpy(sfs->sfs_if_name, fsw->fsw_flow_mgr->fm_name);
		sfs->sfs_fsws = fsw->fsw_stats;
	}

	return actual_space;
}
2163 
2164 size_t
fsw_mib_get(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)2165 fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2166     void *__sized_by(len)out, size_t len, struct proc *p)
2167 {
2168 	size_t ret;
2169 
2170 	switch (filter->nmf_type) {
2171 	case NXMIB_FSW_STATS:
2172 		ret = fsw_mib_get_stats(fsw, out, len);
2173 		break;
2174 	case NXMIB_FLOW:
2175 		ret = fsw_mib_get_flow(fsw, filter, out, len);
2176 		break;
2177 	case NXMIB_FLOW_OWNER:
2178 		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
2179 		break;
2180 	case NXMIB_FLOW_ROUTE:
2181 		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
2182 		break;
2183 	case NXMIB_TCP_STATS:
2184 	case NXMIB_UDP_STATS:
2185 	case NXMIB_IP_STATS:
2186 	case NXMIB_IP6_STATS:
2187 	case NXMIB_USERSTACK_STATS:
2188 		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
2189 		break;
2190 	case NXMIB_FLOW_ADV:
2191 		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
2192 		break;
2193 	default:
2194 		ret = 0;
2195 		break;
2196 	}
2197 
2198 	return ret;
2199 }
2200 
/*
 * Fold (accumulate) a departing adapter's or channel's statistics into
 * the flowswitch-level aggregates.  The concrete type behind "data"
 * depends on "type":
 *
 *   NEXUS_STATS_TYPE_FSW:          struct __nx_stats_fsw, folded into
 *                                  the closed-ports aggregate.
 *   NEXUS_STATS_TYPE_CHAN_ERRORS:  struct __nx_stats_channel_errors,
 *                                  folded into fsw_stats.
 *
 * Any other type is a fatal programming error.
 */
void
fsw_fold_stats(struct nx_flowswitch *fsw,
    void *data, nexus_stats_type_t type)
{
	ASSERT(data != NULL);
	FSW_LOCK_ASSERT_HELD(fsw);

	switch (type) {
	case NEXUS_STATS_TYPE_FSW:
	{
		struct __nx_stats_fsw *d, *__single s;
		d = fsw->fsw_closed_na_stats;
		s = data;
		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
		break;
	}
	case NEXUS_STATS_TYPE_CHAN_ERRORS:
	{
		struct __nx_stats_channel_errors *__single s = data;
		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
2233 
2234 boolean_t
fsw_detach_barrier_add(struct nx_flowswitch * fsw)2235 fsw_detach_barrier_add(struct nx_flowswitch *fsw)
2236 {
2237 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2238 	if (__improbable(fsw->fsw_detach_flags != 0 ||
2239 	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
2240 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2241 		return FALSE;
2242 	}
2243 	fsw->fsw_detach_barriers++;
2244 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2245 
2246 	return TRUE;
2247 }
2248 
/*
 * Drop a barrier reference taken via fsw_detach_barrier_add().  If this
 * was the last barrier and a detach thread is blocked in fsw_detach(),
 * wake it up.
 */
void
fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
	ASSERT(fsw->fsw_detach_barriers != 0);
	fsw->fsw_detach_barriers--;
	/* if there's a thread waiting to detach the interface, let it know */
	if (__improbable((fsw->fsw_detach_waiters > 0) &&
	    (fsw->fsw_detach_barriers == 0))) {
		fsw->fsw_detach_waiters = 0;
		wakeup(&fsw->fsw_detach_waiters);
	}
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
}
2264 
/*
 * Generic resolver for non-Ethernet interfaces.
 *
 * There is no link-layer address to resolve here; this routine only
 * (re)validates the flow route's target route entry, reconfiguring it
 * through flow_route_configure() if needed, and then marks the route
 * FLOWRTF_RESOLVED for consistency with resolvers that do real work.
 * Returns 0 on success or an errno (e.g. EHOSTUNREACH) on failure, in
 * which case the route state is cleaned up.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; configure may have replaced the routes */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor gateway after (re)configure: unreachable */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}
2346 
/*
 * Read flowswitch-related boot-args; invoked once from fsw_init().
 */
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}
2353 
/*
 * One-time global flowswitch initialization: compile-time sanity
 * checks, boot-args, and registration of the interface and protocol
 * control event handlers.  Idempotent; guarded by __nx_fsw_inited.
 */
void
fsw_init(void)
{
	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);

	if (!__nx_fsw_inited) {
		fsw_read_boot_args();
		/*
		 * Register callbacks for interface & protocol events
		 * Use dummy arg for callback cookie.
		 */
		__nx_fsw_ifnet_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
		    ifnet_event, &fsw_ifnet_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);

		__nx_fsw_protoctl_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
		    protoctl_event, &fsw_protoctl_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
		__nx_fsw_inited = 1;
	}
}
2380 
2381 void
fsw_uninit(void)2382 fsw_uninit(void)
2383 {
2384 	if (__nx_fsw_inited) {
2385 		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
2386 		    __nx_fsw_ifnet_eventhandler_tag);
2387 		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
2388 		    __nx_fsw_protoctl_eventhandler_tag);
2389 
2390 		__nx_fsw_inited = 0;
2391 	}
2392 }
2393 
/*
 * Allocate a zeroed flowswitch instance together with its closed-ports
 * stats block.  Returns NULL if either zone allocation fails; on
 * success the instance is released with fsw_free().
 */
struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)
{
	struct nx_flowswitch *fsw;
	struct __nx_stats_fsw *__single nsfw;

	SK_LOCK_ASSERT_HELD();

	/* stats block first; a failure here leaves nothing to undo */
	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
	if (nsfw == NULL) {
		return NULL;
	}

	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
	if (fsw == NULL) {
		/* unwind the stats allocation */
		zfree(nx_fsw_stats_zone, nsfw);
		return NULL;
	}

	FSW_RWINIT(fsw);
	fsw->fsw_dev_ch = NULL;
	fsw->fsw_host_ch = NULL;
	fsw->fsw_closed_na_stats = nsfw;

	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));

	return fsw;
}
2422 
/*
 * Detach the flowswitch from its interface.  When purge is TRUE (called
 * from fsw_free()) the instance is being destroyed and this routine
 * blocks until it can complete; when FALSE it fails with EBUSY if a
 * detach is already in flight.  Note the deliberate lock ordering:
 * sk_lock is dropped while waiting on the detach barrier, then
 * reacquired before the teardown proper.
 */
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * return error if the host port detach is in progress
	 * or already detached.
	 * For the case of flowswitch free (i.e. purge is TRUE) we have to
	 * cleanup everything, so we will block if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit. Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * wait until all threads needing accesses to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * if the NA detach thread as well as the flowswitch free thread were
	 * both waiting, then the thread which wins the race is responsible
	 * for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release lock to let all
	 * outstanding agent threads to enter, followed by waiting until
	 * all of them exit the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2507 
/*
 * Destroy a flowswitch instance allocated via fsw_alloc(): force a
 * full detach (blocking if one is already in progress), tear down the
 * datapath, then release the stats block and the instance itself.
 */
void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	/* purge == TRUE: blocks until detach completes, never EBUSY */
	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}
2531