xref: /xnu-12377.41.6/bsd/skywalk/nexus/flowswitch/fsw.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
56 #include <skywalk/nexus/flowswitch/fsw_var.h>
57 #include <skywalk/nexus/netif/nx_netif.h>
58 #include <skywalk/nexus/netif/nx_netif_compat.h>
59 
60 #include <net/bpf.h>
61 #include <net/if.h>
62 #include <net/pktsched/pktsched_netem.h>
63 #include <sys/eventhandler.h>
64 #include <IOKit/IOBSD.h>
65 
66 #include <kern/uipc_domain.h>
67 
#if (DEVELOPMENT || DEBUG)
/* debug knob to toggle chained enqueue into the interface classq */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* DEVELOPMENT || DEBUG */

/*
 * Configures the flowswitch to utilize user packet pool with
 * dual sized buffers.
 * A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX) || defined(XNU_TARGET_OS_XR)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

uint32_t fsw_chain_enqueue = 1;
/* module init state and event handler registrations (see fsw init/fini) */
static int __nx_fsw_inited = 0;
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

/* zone for nexus flowswitch instances */
static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);

/* zone for flowswitch statistics blocks */
static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);

#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))

static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);
114 
/*
 * Attach a virtual-port adapter (vpna) for a channel being opened on
 * this flowswitch nexus.  Reuses an existing adapter already bound to
 * the requested nexus port when there is one; otherwise creates a new
 * vpna and binds it to a port.  On success *vpna holds a retained
 * adapter; on failure *vpna is NULL.  Called with SK_LOCK held.
 */
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	/* -fbounds-safety: cr_name should be null terminated (via snprintf) */
	SK_LOG_VAR(const char *__null_terminated cr_name =
	    __unsafe_forge_null_terminated(const char *, chr->cr_name));
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, p, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	/* bind the freshly created adapter to the port it was assigned */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		/* remember the owning channel; drop the ref on failure */
		(*vpna)->vpna_up.na_private = ch;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (%p) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		if (err != 0) {
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}
175 
176 static int
fsw_nx_check(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)177 fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
178 {
179 #pragma unused(fsw)
180 	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
181 
182 	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
183 		return EINVAL;
184 	}
185 
186 	/* it's a netif below */
187 	return 0;
188 }
189 
/*
 * Handle NXCFG_CMD_FLOW_ADD: validate the request, normalize it for
 * kernel use, and register the flow with the flowswitch.  User-space
 * requests always get flow tracking and flow advisories; kernel (BSD)
 * flows must not request them.  Returns 0 on success.
 */
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special port shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		/* user flows are always tracked and advisory-capable */
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	if (req->nfr_epid == -1) {
		/* default effective pid to the requesting process */
		req->nfr_epid = proc_pid(p);
	}

	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		// user space don't need this flow stats
		flow_stats_release(req->nfr_flow_stats);
	}
	if (p != kernproc) {
		/* convert back to the user-visible representation */
		nx_flow_req_externalize(req);
	}

	return error;
}
239 
240 static int
fsw_ctl_flow_del(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)241 fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
242     struct nx_flow_req *req)
243 {
244 	int err;
245 
246 	nx_flow_req_internalize(req);
247 	req->nfr_pid = proc_pid(p);
248 	err = fsw_flow_del(fsw, req, TRUE, NULL);
249 
250 	nx_flow_req_externalize(req);
251 	return err;
252 }
253 
254 static int
fsw_ctl_flow_config(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)255 fsw_ctl_flow_config(struct nx_flowswitch *fsw, struct proc *p,
256     struct nx_flow_req *req)
257 {
258 	int err;
259 
260 	nx_flow_req_internalize(req);
261 	req->nfr_pid = proc_pid(p);
262 	err = fsw_flow_config(fsw, req);
263 
264 	nx_flow_req_externalize(req);
265 	return err;
266 }
267 
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for the per-flowswitch "rps_nthreads" knob.  Reads
 * the current RPS thread count, and resizes the thread pool via
 * fsw_rps_set_nthreads() when a new value is written.
 */
static int
fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct nx_flowswitch *__single fsw = arg1;
	uint32_t nthreads;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
	if (error == 0 && changed != 0) {
		/* only reconfigure when a new value was actually written */
		error = fsw_rps_set_nthreads(fsw, nthreads);
	}
	return error;
}
#endif /* DEVELOPMENT || DEBUG */
286 
287 void
fsw_get_tso_capabilities(struct ifnet * ifp,uint32_t * tso_v4_mtu,uint32_t * tso_v6_mtu)288 fsw_get_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu, uint32_t *tso_v6_mtu)
289 {
290 #pragma unused(ifp)
291 	*tso_v4_mtu = 0;
292 	*tso_v6_mtu = 0;
293 
294 	struct nx_flowswitch *fsw;
295 
296 	if (!kernel_is_macos_or_server()) {
297 		return;
298 	}
299 
300 	fsw = fsw_ifp_to_fsw(ifp);
301 	if (fsw == NULL) {
302 		return;
303 	}
304 	switch (fsw->fsw_tso_mode) {
305 	case FSW_TSO_MODE_HW: {
306 		*tso_v4_mtu = fsw->fsw_tso_hw_v4_mtu;
307 		*tso_v6_mtu = fsw->fsw_tso_hw_v6_mtu;
308 		break;
309 	}
310 	case FSW_TSO_MODE_SW: {
311 		ASSERT(fsw->fsw_tso_sw_mtu != 0);
312 		*tso_v4_mtu = fsw->fsw_tso_sw_mtu;
313 		*tso_v6_mtu = fsw->fsw_tso_sw_mtu;
314 		break;
315 	}
316 	default:
317 		break;
318 	}
319 }
320 
/*
 * Pick the TSO mode for this flowswitch's interface:
 *   - FSW_TSO_MODE_HW when the driver advertises hardware TSO,
 *   - FSW_TSO_MODE_SW when software GSO is enabled, the large buffer
 *     can hold the GSO MTU, and the interface is skywalk-native,
 *   - FSW_TSO_MODE_NONE otherwise.
 * Only applies on macOS/server builds; requires a SKYWALK_CAPABLE
 * interface and a non-zero large buffer size.
 */
static void
fsw_tso_setup(struct nx_flowswitch *fsw)
{
	if (!kernel_is_macos_or_server()) {
		return;
	}

	fsw->fsw_tso_mode = FSW_TSO_MODE_NONE;
	struct ifnet *ifp = fsw->fsw_ifp;
	if (!SKYWALK_CAPABLE(ifp)) {
		DTRACE_SKYWALK2(tso__no__support, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	struct nx_netif *nif = NA(ifp)->nifna_netif;
	uint32_t large_buf_size = NX_PROV_PARAMS(fsw->fsw_nx)->nxp_large_buf_size;

	if (large_buf_size == 0) {
		DTRACE_SKYWALK2(no__large__buf, struct nx_flowswitch *, fsw,
		    ifnet_t, ifp);
		return;
	}
	/*
	 * Unlike _dlil_adjust_large_buf_size_for_tso(), we check the nif_hwassist
	 * flags here for the original flags because nx_netif_host_adjust_if_capabilities()
	 * has already been called.
	 */
	if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0) ||
	    ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0)) {
		fsw->fsw_tso_mode = FSW_TSO_MODE_HW;
		fsw->fsw_tso_hw_v4_mtu = large_buf_size;
		fsw->fsw_tso_hw_v6_mtu = large_buf_size;
	} else {
		/* software GSO requires a native interface and room for the GSO MTU */
		if (sk_fsw_gso_mtu != 0 && large_buf_size >= sk_fsw_gso_mtu &&
		    SKYWALK_NATIVE(ifp)) {
			fsw->fsw_tso_mode = FSW_TSO_MODE_SW;
			fsw->fsw_tso_sw_mtu = sk_fsw_gso_mtu;
		}
	}
	DTRACE_SKYWALK3(tso__mode, struct nx_flowswitch *, fsw,
	    fsw_tso_mode_t, fsw->fsw_tso_mode, uint32_t, large_buf_size);
}
363 
/*
 * Bind the flowswitch to the ifnet behind the netif host adapter:
 * create the IP fragment manager, install per-family framing/resolve
 * callbacks, select packet-copy routines, cache the ifnet/netif
 * adapter pointers, set up classq and TSO, and register the netagent.
 * Called with SK_LOCK held; takes the flowswitch write-lock.
 */
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	/* cap fragment-manager usage at half the RX metadata objects */
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non TXSTART interface.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(%p)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	/* a previous attach must have been fully torn down */
	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	/* per-family framing/resolve setup and BPF data-link type */
	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	/* pick multi-buflet copy routines when packets may span buflets */
	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;
	fsw_tso_setup(fsw);

	/* rename the reaper thread after the interface it now serves */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread,
	    __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name));

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_DEFAULT,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		os_atomic_andnot(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* DEVELOPMENT || DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}
519 
/*
 * Undo fsw_setup_ifp(): unregister the netagent, destroy the fragment
 * manager and skoid, tear down classq state, mark the nexus NXF_REJECT
 * so channels on existing adapters stop working, and clear all cached
 * ifnet/callback state.  Called with SK_LOCK and the flowswitch
 * write-lock held.
 */
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;
	const char *__null_terminated reap_name = NULL;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_D("%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent " : ""),
	    if_name(ifp), ifp->if_family);

	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	os_atomic_or(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		/* make the cleared pointers visible to other CPUs */
		os_atomic_thread_fence(seq_cst);
	}

	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	/* rename the reaper thread to reflect the detached state */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	reap_name = tsnprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, reap_name);
}
584 
/*
 * Complete the host-side attachment: validate the netif host adapter's
 * ifnet, guard against a concurrent detach, bind the flowswitch to the
 * ifnet via fsw_setup_ifp(), and record the interface index in the
 * nexus provider parameters.  Called with SK_LOCK held.
 */
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);


	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(%p): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	/* bail out if a detach is already in progress; else reset state */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}
632 
633 static int
fsw_host_teardown(struct nx_flowswitch * fsw)634 fsw_host_teardown(struct nx_flowswitch *fsw)
635 {
636 	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
637 
638 	SK_LOCK_ASSERT_HELD();
639 	return fsw_detach(fsw, hwna, FALSE);
640 }
641 
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
fsw_ctl_attach_log(const struct nx_spec_req *nsr,
    const struct kern_nexus *nx, int err)
{
	uuid_string_t uuidstr, ifuuidstr;
	const char *__null_terminated nustr = NULL;

	/* pick the best identifier the request carried: UUID, ifp, or name */
	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		/*
		 * -fbounds-safety: We know the output of sk_uuid_unparse is
		 * null-terminated.
		 */
		nustr = __unsafe_forge_null_terminated(const char *,
		    sk_uuid_unparse(nsr->nsr_uuid, uuidstr));
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		nustr = tsnprintf((char *)uuidstr, sizeof(uuidstr), "%p",
		    SK_KVA(nsr->nsr_ifp));
	} else {
		nustr = __unsafe_null_terminated_from_indexable(nsr->nsr_name);
	}

	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus %p (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
}
#endif /* SK_LOG */
672 
673 SK_NO_INLINE_ATTRIBUTE
674 static void
fsw_netif_set_callbacks_common(struct nx_flowswitch * fsw,boolean_t set)675 fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
676 {
677 	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
678 
679 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
680 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
681 
682 	if (set) {
683 		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
684 	} else {
685 		netif_hwna_clear_mode(hwna);
686 	}
687 }
688 
689 SK_NO_INLINE_ATTRIBUTE
690 static void
fsw_netif_set_callbacks(struct nx_flowswitch * fsw)691 fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
692 {
693 	fsw_netif_set_callbacks_common(fsw, TRUE);
694 }
695 
696 SK_NO_INLINE_ATTRIBUTE
697 static void
fsw_netif_clear_callbacks(struct nx_flowswitch * fsw)698 fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
699 {
700 	fsw_netif_set_callbacks_common(fsw, FALSE);
701 }
702 
703 SK_NO_INLINE_ATTRIBUTE
704 static void
fsw_dp_start(struct nx_flowswitch * fsw)705 fsw_dp_start(struct nx_flowswitch *fsw)
706 {
707 	ASSERT(fsw->fsw_dev_ch != NULL);
708 	ASSERT(fsw->fsw_host_ch != NULL);
709 
710 	fsw_netif_set_callbacks(fsw);
711 	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
712 	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
713 }
714 
/*
 * Quiesce the flowswitch datapath: mark it quiesced, drain in-flight
 * data movement (dropping SK_LOCK around the drain), stop both special
 * channels and unhook the netif callbacks.  Returns EALREADY if the
 * datapath was already quiesced.  On return *ifpp may hold a suspended
 * ifnet whose datamov the caller must resume.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		/* drop SK_LOCK while draining; drain may block */
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}
752 
753 SK_NO_INLINE_ATTRIBUTE
754 static int
fsw_netif_port_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx,boolean_t host)755 fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
756     boolean_t host)
757 {
758 	struct chreq chr;
759 	struct kern_channel *ch;
760 	int err;
761 
762 	bzero(&chr, sizeof(chr));
763 	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
764 	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
765 	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
766 	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);
767 
768 	err = 0;
769 	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
770 	if (ch == NULL) {
771 		SK_ERR("ch_open_special(%s) failed: %d",
772 		    host ? "host" : "dev", err);
773 		return err;
774 	}
775 	if (host) {
776 		fsw->fsw_host_ch = ch;
777 	} else {
778 		fsw->fsw_dev_ch = ch;
779 	}
780 	return 0;
781 }
782 
783 SK_NO_INLINE_ATTRIBUTE
784 static int
fsw_netif_port_teardown(struct nx_flowswitch * fsw,boolean_t host)785 fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
786 {
787 	struct kern_channel *ch;
788 
789 	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
790 	if (ch == NULL) {
791 		return EINVAL;
792 	}
793 	if (host) {
794 		fsw->fsw_host_ch = NULL;
795 	} else {
796 		fsw->fsw_dev_ch = NULL;
797 	}
798 	ch_close_special(ch);
799 	(void) ch_release_locked(ch);
800 	return 0;
801 }
802 
803 SK_NO_INLINE_ATTRIBUTE
804 static int
fsw_devna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)805 fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
806 {
807 	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
808 }
809 
810 SK_NO_INLINE_ATTRIBUTE
811 static int
fsw_hostna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)812 fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
813 {
814 	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
815 }
816 
817 SK_NO_INLINE_ATTRIBUTE
818 static int
fsw_devna_teardown(struct nx_flowswitch * fsw)819 fsw_devna_teardown(struct nx_flowswitch *fsw)
820 {
821 	return fsw_netif_port_teardown(fsw, FALSE);
822 }
823 
824 SK_NO_INLINE_ATTRIBUTE
825 static int
fsw_hostna_teardown(struct nx_flowswitch * fsw)826 fsw_hostna_teardown(struct nx_flowswitch *fsw)
827 {
828 	return fsw_netif_port_teardown(fsw, TRUE);
829 }
830 
/*
 * Process NXCFG_CMD_ATTACH: attach this flowswitch on top of the netif
 * nexus identified by nsr->nsr_uuid.  Opens dev and host config
 * channels, completes host-side setup, starts the datapath, and
 * returns the dev adapter's UUID in nsr->nsr_if_uuid.  Any partial
 * setup is unwound on failure.  Called with SK_LOCK held.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	/* only one netif may be attached at a time */
	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	/* nx_find() retains hw_nx; released at done */
	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		/* refuse attaching the flowswitch to itself */
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}
911 
/*
 * Detach everything attached to this flowswitch: quiesce the datapath,
 * tear down the host-side ifnet binding, and close both config
 * channels.  A no-op when nothing is attached; silently returns if the
 * datapath was already quiesced by another caller.
 */
SK_NO_INLINE_ATTRIBUTE
static void
fsw_cleanup(struct nx_flowswitch *fsw)
{
	int err;
	struct ifnet *__single ifp = NULL;

	if (fsw->fsw_dev_ch == NULL) {
		ASSERT(fsw->fsw_host_ch == NULL);
		return;
	}
	err = fsw_dp_stop(fsw, &ifp);
	if (err != 0) {
		/* already quiesced; another cleanup is/was in progress */
		return;
	}
	err = fsw_host_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_hostna_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_devna_teardown(fsw);
	VERIFY(err == 0);

	/* release the datamov suspension taken by fsw_dp_stop() */
	if (ifp != NULL) {
		ifnet_datamov_resume(ifp);
	}
}
940 
/*
 * Process NXCFG_CMD_DETACH: detach the netif identified by
 * nsr->nsr_if_uuid from this flowswitch.  A NULL nsr (destructor path)
 * detaches whatever is attached.  Called with SK_LOCK held.
 */
int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		/* nothing attached to detach */
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus %p (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus %p (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
996 
/*
 * Apply network-emulation (netem) parameters to the interface backing
 * this flowswitch; data points to a struct if_netem_params.
 *
 * Returns ENODEV when no interface is attached yet, otherwise the
 * result of netem_config().
 */
static int
fsw_netem_config(struct nx_flowswitch *fsw, void *data)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	struct if_netem_params *__single params = data;
	int ret;
	const char *__null_terminated name = NULL;

	if (ifp == NULL) {
		return ENODEV;
	}

	SK_LOCK_ASSERT_HELD();
	/* name for the per-interface input netem thread */
#define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
#define fsw_INPUT_NETEM_THREADNAME_LEN       32
	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
	name = tsnprintf(netem_name, sizeof(netem_name),
	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
	/* dequeue callback runs batches of up to FSW_VP_DEV_BATCH_MAX */
	ret = netem_config(&ifp->if_input_netem, name, ifp, params, fsw,
	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);

	return ret;
}
1020 
/*
 * Flowswitch nexus control entry point.  Dispatches nxcfg commands:
 * attach/detach an interface, add/delete/configure flows, and netem.
 *
 * For FLOW_ADD, the caller must either be acting on its own pid and
 * executable UUID, or hold PRIV_NET_PRIVILEGED_SOCKET_DELEGATE; the
 * AOP offload flag additionally requires a task entitlement.
 * data is interpreted per-command (nx_spec_req, nx_flow_req, or
 * if_netem_params).  Returns 0 or an errno.
 */
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct nx_spec_req *__single nsr = data;
	struct nx_flow_req *__single req = data;
	const task_t __single task = proc_task(p);
	boolean_t need_check;
	int error = 0;

	/* first pass: validate and sanitize flow add/delete requests */
	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		/* user requests may only set user-settable flags */
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		/* deletes need no delegation/entitlement checks */
		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t __single cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}

		/* AOP offload flows require a private entitlement */
		if (req->nfr_flags & NXFLOWREQF_AOP_OFFLOAD) {
			if (!IOTaskHasEntitlement(task, "com.apple.private.network.aop2_offload")) {
				error = EPERM;
				goto done;
			}
		}
		break;

	default:
		break;
	}

	/* second pass: dispatch the command */
	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_CONFIG:
		error = fsw_ctl_flow_config(fsw, p, data);
		break;

	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}
1123 
1124 struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet * ifp)1125 fsw_ifp_to_fsw(struct ifnet *ifp)
1126 {
1127 	struct nx_flowswitch *fsw = NULL;
1128 
1129 	if (ifp->if_na != NULL) {
1130 		fsw = ifp->if_na->nifna_netif->nif_fsw;
1131 	}
1132 	return fsw;
1133 }
1134 
/*
 * Eventhandler callback for interface events.  Under SK_LOCK, reacts to:
 *  - LLADDR_UPDATE: refresh the cached Ethernet source MAC and bump its
 *    generation count (Ethernet-framed interfaces only);
 *  - LOW_POWER_UPDATE: schedule a reap pass when low-power mode is set.
 */
static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	evhlog(debug, "%s: eventhandler saw event type=intf_event event_code=%s",
	    __func__, intf_event2str(intf_ev_code));

	/* cheap unlocked pre-check; no adapter means no flowswitch */
	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			/* only meaningful for Ethernet-framed interfaces */
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			/* gencnt signals readers that the cached MAC changed */
			os_atomic_inc(&fsw->fsw_src_lla_gencnt, relaxed);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}
1189 
/*
 * Eventhandler callback for protocol-control events (e.g. ICMP errors).
 * Looks up the flow entry matching the full 5-tuple and, if found,
 * forwards the event to the netagent session so the flow's client can
 * react.  TCP/UDP only; requires a complete 5-tuple.
 *
 * Locking: flow lookup runs under SK_LOCK with a detach barrier held on
 * the flowswitch; the netagent update is issued after SK_UNLOCK, with
 * the barrier keeping fsw (and its agent session) from detaching.
 */
static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *__single fsw = NULL;
	struct flow_entry *__single fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	evhlog(debug, "%s: eventhandler saw event type=protoctl_event event_code=%d",
	    __func__, proto);

	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	/* barrier keeps fsw alive past SK_UNLOCK; removed at the end */
	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	/* build the 5-tuple key for the flow lookup */
	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		/*
		 * rdar://107435899 The scope ID for destination address needs
		 * to be cleared out before looking up the flow entry for this
		 * 5-tuple, because addresses in flow entries do not contain the
		 * scope ID.
		 */
		struct in6_addr *in6;

		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
		in6 = &fk.fk_dst6;
		if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
			in6->s6_addr16[1] = 0;
		}
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	/* snapshot the UUID; fe stays referenced until released below */
	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure
	 * protocol event received is for bytes in the flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better route events.
	 * Also redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	/* only TRUE when fsw != NULL and the barrier is held */
	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe2str(fe, dbgbuf, sizeof(dbgbuf)), protoctl_event_code,
		    p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}
1309 
/*
 * Add or remove the flowswitch netagent on the underlying interface.
 * Called with SK_LOCK held; takes the flowswitch write lock internally.
 *
 * Returns EINVAL for a non-flowswitch nexus, ENXIO without an agent
 * session, EEXIST/ENOENT for redundant add/remove, and EBUSY when the
 * interface is part of a bridge (rdar://107076453).
 */
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	/* fsw stays NULL here so "out:" skips the unlock */
	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else if (fsw->fsw_ifp->if_bridge != NULL) {
			/* see rdar://107076453 */
			SK_ERR("%s is bridged, not adding netagent",
			    if_name(fsw->fsw_ifp));
			error = EBUSY;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			/* enablement is a separate, policy-driven flag */
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				        |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	/* unlock only when the wlock above was actually taken */
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}
1374 
/*
 * Recompute and publish the flowswitch netagent's flags based on
 * whether the interface currently needs IP and/or transport netagents.
 * Listener flag: either agent; provider flag: transport only; custom-IP
 * flag: IP only.  Silently returns for non-flowswitch nexuses or when
 * no agent session exists.
 */
void
fsw_netagent_update(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NULL;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	/* fsw stays NULL here so "out:" skips the unlock */
	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		goto out;
	}
	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);
	if (fsw->fsw_agent_session == NULL) {
		goto out;
	}
	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
	/* start from current flags and toggle the nexus-related bits */
	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
	if (ip_agent || transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
	}
	if (transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
	}
	if (ip_agent) {
		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	} else {
		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	}
	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
		SK_D("flowswitch netagent updated for interface %s",
		    if_name(fsw->fsw_ifp));
	}
out:
	/* unlock only when the wlock above was actually taken */
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
}
1422 
/*
 * Port constructor, invoked when a nexus port is associated with a
 * vp adapter.  For user ports, requires an attached interface and
 * seeds the channel mitigation interval from the interface type.
 *
 * Returns 0 on success, ENXIO when no interface is attached yet.
 */
static int
fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    const struct nxbind *nxb)
{
#pragma unused(nxb)
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
	    vpna->vpna_pid == nxb->nxb_pid);

	/*
	 * Reject regular channel open requests unless there is
	 * something attached to the host port of the flowswitch.
	 */
	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
		struct nexus_adapter *na = &vpna->vpna_up;
		struct ifnet *ifp = fsw->fsw_ifp;

		if (ifp == NULL) {
			err = ENXIO;
			goto done;
		}

		/* if adapter supports mitigation, set default value */
		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
			/* checked in priority order: WiFi, cellular, Ethernet */
			if (IFNET_IS_WIFI(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
			} else if (IFNET_IS_CELLULAR(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
			} else if (IFNET_IS_ETHERNET(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
			} else {
				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
			}
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw %p nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);

	return err;
}
1469 
1470 static bool
fsw_port_dtor(struct nx_flowswitch * fsw,const struct nexus_vp_adapter * vpna)1471 fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1472 {
1473 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1474 	nexus_port_t nx_port = vpna->vpna_nx_port;
1475 	uint32_t purge_cnt;
1476 
1477 	ASSERT(fsw == vpna->vpna_fsw);
1478 	ASSERT(nx_port != NEXUS_PORT_ANY);
1479 
1480 	/*
1481 	 * If this nexus port was bound to a PID, we just need to look at a
1482 	 * single bucket and iterate from there.  Note that in any case, we
1483 	 * can't just search for a single flow_owner based on the PID itself,
1484 	 * since a given process may be opening multiple channels to the
1485 	 * flowswitch; hence we search for the ones matching this nexus port.
1486 	 *
1487 	 * Close any open flows on the port and remove the flow owner and
1488 	 * nexus port binding.
1489 	 */
1490 	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1491 	    vpna->vpna_pid, nx_port, FALSE);
1492 
1493 	SK_DF(SK_VERB_FSW,
1494 	    "fsw %p nx_port %d pid %d pid_bound %u defunct %u "
1495 	    "purged %u", SK_KVA(fsw), (int)nx_port,
1496 	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1497 	    purge_cnt);
1498 
1499 	return purge_cnt != 0;
1500 }
1501 
1502 /*
1503  * Flowswitch nexus port allocator.
1504  *
1505  * A nexus port is represented by a bit in the port bitmap; its state is
1506  * either free or allocated.  A free state implies that the port has no
1507  * nxbind AND no nexus adapter association.  An allocated state means that
1508  * either it has a nxbind OR a nexus adapter assocation.  This routine
1509  * manages the nexus adapter association with a nexus port; nxbind is
1510  * handled separately via nx_fsw_port_bind().
1511  *
1512  * The caller of this routine may optionally pass in a NULL nexus adapter.
1513  * In such a case (*vpna is NULL), this routine checks to see if the port
1514  * has already been associated with an adapter, and returns a reference to
1515  * that adapter.  No action is taken on a port that doesn't have an adapter
1516  * associated.  Otherwise (*vpna is non-NULL), this routine associates that
1517  * adapter with a port that's not already associated with one; the reference
1518  * to the adapter is untouched here, as the caller is expected to handle it.
1519  *
1520  * The flowswitch code invokes this routine each time it is requested to
1521  * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
1522  * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1523  * This allows for multiple channels to be opened to a nexus port, each
1524  * time holding a reference to that same nexus adapter.  The releasing of
1525  * the nexus port only happens when the last channel closes.
1526  */
/* see the block comment above for the full port-allocation contract */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	/* always FALSE today; kept for the SK_LOG format below */
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		/* undo the allocation if the constructor rejects the port */
		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (%p) <-> fsw %p "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw %p nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}
1574 
1575 int
fsw_port_alloc(struct nx_flowswitch * fsw,struct nxbind * nxb,struct nexus_vp_adapter ** vpna,nexus_port_t nx_port,struct proc * p,boolean_t ifattach,boolean_t host)1576 fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1577     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1578     boolean_t ifattach, boolean_t host)
1579 {
1580 	int err = 0;
1581 
1582 	FSW_WLOCK_ASSERT_HELD(fsw);
1583 
1584 	if (ifattach) {
1585 		/* override port to either NX_FSW_{HOST,DEV} */
1586 		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1587 		/* allocate reserved port for ifattach */
1588 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1589 	} else if (host) {
1590 		/* host is valid only for ifattach */
1591 		err = EINVAL;
1592 	} else {
1593 		/* nexus port otherwise (reserve dev and host for ifattach) */
1594 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1595 	}
1596 
1597 	return err;
1598 }
1599 
1600 /*
1601  * Remove nexus port association from a nexus adapter.  This call is
1602  * the opposite of fsw_port_alloc(), except that it is called only
1603  * at nx_fsw_vp_na_dtor() destructor time.  See above notes
1604  * on fsw_port_alloc().
1605  */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	/* mark the port defunct before purging its flows */
	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	/* purge flows/owner; true when something was actually removed */
	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (%p) -!- fsw %p "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	/* final close: release the port and reset adapter state */
	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
	vpna->vpna_up.na_private = NULL;
}
1650 
/*
 * Adapter activation hook for a flowswitch port.  For user ports,
 * propagates the activation mode (on/defunct/off) to any flow-owner
 * resources such as flow advisories.  Special (dev/host) ports need
 * no further work.  Always returns 0.
 */
int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	static_assert(FSW_VP_DEV == 0);
	static_assert(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	/* validate the mode; all recognized modes fall through */
	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw %p %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}
1700 
1701 int
fsw_port_na_defunct(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna)1702 fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1703 {
1704 	int err = 0;
1705 
1706 	SK_LOCK_ASSERT_HELD();
1707 	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1708 
1709 	/*
1710 	 * During defunct, we want to purge all flows associated to this
1711 	 * port and the flow owner as well.  This is accomplished as part
1712 	 * of calling the port's destructor.  However, we still want to
1713 	 * occupy the nexus port since there's a channel open to it.
1714 	 */
1715 	FSW_WLOCK(fsw);
1716 	if (!vpna->vpna_defunct) {
1717 		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
1718 	} else {
1719 		err = EALREADY;
1720 	}
1721 	FSW_WUNLOCK(fsw);
1722 
1723 	return err;
1724 }
1725 
/*
 * MIB accessor: copy sk_stats_flow records into out (up to len bytes).
 * Supports filtering by flow UUID or by a full 5-tuple; otherwise dumps
 * all flows, including those on the deferred-free (linger) list.
 *
 * Returns the space required for the full result, which may exceed len;
 * callers use an out==NULL pass to size the buffer.
 */
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *__single fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	/* single-flow lookup by UUID */
	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		/* single-flow lookup by 5-tuple (v4 or v6, both ends) */
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sah.sa_family == AF_INET &&
		    itpl->itpl_remote_sah.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sah.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sah.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sah.sa_family,
			    itpl->itpl_remote_sah.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	/* no filter: dump every active flow that fits in the buffer */
	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
		        return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

	/*
	 * Also return the ones in deferred free list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
		actual_space += sf_size;
		if (out == NULL || actual_space > len) {
			continue;
		}

		flow_entry_stats_get(fe, sf);
		sf++;
	}
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return actual_space;
}
1817 
/*
 * MIB accessor: copy the flow advisory tables of all open channels on
 * this flowswitch into out (up to len bytes).  One variable-length
 * sk_stats_flow_adv header per channel, followed by one entry per
 * allocated flowadv slot.
 *
 * Returns the space required for the full result, which may exceed len;
 * callers use an out==NULL pass to size the buffer.
 */
static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *__single ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *__single fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		/* defunct arenas have no flowadv objects; skip them */
		if (arn->arn_flowadv_obj == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strbufcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			/* a null fae_id marks an unallocated slot */
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}
				sfae = &sfa->sfa_entries[0];

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		/* advance to the start of the next channel's record */
		sfa = (struct sk_stats_flow_adv *)
		    (void *)((int8_t *)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}
1893 
1894 static inline void
fsw_fo2sfo(struct nx_flowswitch * fsw,struct flow_owner * fo,struct sk_stats_flow_owner * sfo)1895 fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1896     struct sk_stats_flow_owner *sfo)
1897 {
1898 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1899 
1900 	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
1901 	(void) strbufcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name);
1902 	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1903 
1904 	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
1905 	    fo->fo_name);
1906 	sfo->sfo_pid = fo->fo_pid;
1907 	sfo->sfo_nx_port = fo->fo_nx_port;
1908 	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1909 	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1910 }
1911 
/*
 * MIB accessor: copy one sk_stats_flow_owner record per flow owner into
 * out (up to len bytes), walking every owner bucket's RB tree.
 *
 * Returns the space required for the full result, which may exceed len;
 * callers use an out==NULL pass to size the buffer.
 */
static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_owner *sfo = out;
	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
	struct flow_owner *fo;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * Ideally we'd like to hide the bucket level details from flow library
	 * user, but there is no simple way to iterate flow_owner with
	 * buckets/RB_TREE nested. So keep it as is.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			/* keep counting required space even when out is full */
			actual_space += sfo_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fo2sfo(fsw, fo, sfo);
			sfo++;
		}
		FOB_UNLOCK(fob);
	}

	return actual_space;
}
1948 
1949 static inline void
fsw_fr2sfr(struct nx_flowswitch * fsw,struct flow_route * fr,struct sk_stats_flow_route * sfr,boolean_t ll_scrub)1950 fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
1951     struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
1952 {
1953 	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
1954 	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
1955 	(void) strbufcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name);
1956 
1957 	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
1958 	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;
1959 
1960 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
1961 		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
1962 	}
1963 	if (fr->fr_flags & FLOWRTF_ONLINK) {
1964 		sfr->sfr_flags |= SFLOWRTF_ONLINK;
1965 	}
1966 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
1967 		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
1968 	}
1969 	if (fr->fr_flags & FLOWRTF_RESOLVED) {
1970 		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
1971 	}
1972 	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
1973 		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
1974 	}
1975 	if (fr->fr_flags & FLOWRTF_DELETED) {
1976 		sfr->sfr_flags |= SFLOWRTF_DELETED;
1977 	}
1978 	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
1979 		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
1980 	}
1981 	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
1982 		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
1983 	}
1984 
1985 	lck_spin_lock(&fr->fr_reflock);
1986 	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
1987 	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
1988 	if (fr->fr_expire != 0) {
1989 		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
1990 	} else {
1991 		sfr->sfr_expire = 0;
1992 	}
1993 	lck_spin_unlock(&fr->fr_reflock);
1994 
1995 	sfr->sfr_laddr = fr->fr_laddr;
1996 	sfr->sfr_faddr = fr->fr_faddr;
1997 	sfr->sfr_gaddr = fr->fr_gaddr;
1998 
1999 	if (ll_scrub) {
2000 		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
2001 		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
2002 	} else {
2003 		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
2004 		    ETHER_ADDR_LEN);
2005 	}
2006 }
2007 
2008 #if CONFIG_MACF
2009 extern int dlil_lladdr_ckreq;
2010 #endif /* CONFIG_MACF */
2011 
2012 static size_t
fsw_mib_get_flow_route(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)2013 fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
2014     struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len, struct proc *p)
2015 {
2016 #pragma unused(filter)
2017 	uint32_t i;
2018 	size_t actual_space = 0;
2019 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
2020 	struct sk_stats_flow_route *sfr = out;
2021 	size_t sfo_size = sizeof(struct sk_stats_flow_route);
2022 	struct flow_route *fr;
2023 	boolean_t ll_scrub;
2024 
2025 	FSW_LOCK_ASSERT_HELD(fsw);
2026 
2027 	/*
2028 	 * To get the link-layer info, the caller must have the following
2029 	 * in their sandbox profile (or not be sandboxed at all), else we
2030 	 * scrub it clean just like dlil_ifaddr_bytes() does:
2031 	 *
2032 	 * (allow system-info (info-type "net.link.addr"))
2033 	 *
2034 	 * If scrubbed, we return 02:00:00:00:00:00.
2035 	 */
2036 #if CONFIG_MACF
2037 	ll_scrub = (dlil_lladdr_ckreq &&
2038 	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
2039 #else /* !CONFIG_MACF */
2040 	ll_scrub = FALSE;
2041 #endif /* !CONFIG_MACF */
2042 
2043 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
2044 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
2045 		FRB_RLOCK(frb);
2046 		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
2047 			actual_space += sfo_size;
2048 			if (out == NULL || actual_space > len) {
2049 				continue;
2050 			}
2051 
2052 			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
2053 			sfr++;
2054 		}
2055 		FRB_UNLOCK(frb);
2056 	}
2057 
2058 	return actual_space;
2059 }
2060 
/*
 * Fill in a single sk_stats_userstack record for the given pid from
 * a nexus stats object, copying only the protocol stat groups
 * selected by the filter's type bits.
 */
static inline void
fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
{
	/* identify the nexus, interface and owning process */
	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strbufcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name);
	sus->sus_owner_pid = pid;

	/* copy only the protocol groups the caller asked for */
	if (filter->nmf_type & NXMIB_IP_STATS) {
		sus->sus_ip  = nxs->nxs_ipstat;
	}

	if (filter->nmf_type & NXMIB_IP6_STATS) {
		sus->sus_ip6 = nxs->nxs_ip6stat;
	}

	if (filter->nmf_type & NXMIB_TCP_STATS) {
		sus->sus_tcp = nxs->nxs_tcpstat;
	}

	if (filter->nmf_type & NXMIB_UDP_STATS) {
		sus->sus_udp = nxs->nxs_udpstat;
	}

	if (filter->nmf_type & NXMIB_QUIC_STATS) {
		sus->sus_quic = nxs->nxs_quicstat;
	}
}
2089 
/*
 * Export sk_stats_userstack records: one carrying the saved stats of
 * already-closed ports (reported as pid 0), then one per open channel
 * on this nexus, subject to the optional pid filter.  Returns the
 * total space required; records are written only while "out" has
 * room, so callers may pass out == NULL to size a buffer.
 */
static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *__sized_by(len)out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/* copyout saved stats from closed ports */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		/* pid filter absent, or caller explicitly asked for pid 0 */
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a proc only opens one channel to nexus so we don't do
	 * per proc aggregation of inet stats now as this needs lots of code
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		/* honor the pid filter if one was supplied */
		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			/* no stats object; the arena must be defunct */
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		/* account for the record; emit only if there is room */
		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}
2158 
2159 static size_t
fsw_mib_get_stats(struct nx_flowswitch * fsw,void * __sized_by (len)out,size_t len)2160 fsw_mib_get_stats(struct nx_flowswitch *fsw, void *__sized_by(len)out, size_t len)
2161 {
2162 	struct sk_stats_flow_switch *sfs = out;
2163 	size_t actual_space = sizeof(struct sk_stats_flow_switch);
2164 
2165 	/* XXX -fbounds-safety: Come back and fix strlcpy */
2166 	if (out != NULL && actual_space <= len) {
2167 		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
2168 		(void) strbufcpy(sfs->sfs_if_name, fsw->fsw_flow_mgr->fm_name);
2169 		sfs->sfs_fsws = fsw->fsw_stats;
2170 	}
2171 
2172 	return actual_space;
2173 }
2174 
2175 size_t
fsw_mib_get(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)2176 fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2177     void *__sized_by(len)out, size_t len, struct proc *p)
2178 {
2179 	size_t ret;
2180 
2181 	switch (filter->nmf_type) {
2182 	case NXMIB_FSW_STATS:
2183 		ret = fsw_mib_get_stats(fsw, out, len);
2184 		break;
2185 	case NXMIB_FLOW:
2186 		ret = fsw_mib_get_flow(fsw, filter, out, len);
2187 		break;
2188 	case NXMIB_FLOW_OWNER:
2189 		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
2190 		break;
2191 	case NXMIB_FLOW_ROUTE:
2192 		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
2193 		break;
2194 	case NXMIB_TCP_STATS:
2195 	case NXMIB_UDP_STATS:
2196 	case NXMIB_IP_STATS:
2197 	case NXMIB_IP6_STATS:
2198 	case NXMIB_USERSTACK_STATS:
2199 		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
2200 		break;
2201 	case NXMIB_FLOW_ADV:
2202 		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
2203 		break;
2204 	default:
2205 		ret = 0;
2206 		break;
2207 	}
2208 
2209 	return ret;
2210 }
2211 
/*
 * Fold (accumulate) statistics from a departing adapter/channel into
 * the flowswitch's retained counters, keyed by the stats type.
 */
void
fsw_fold_stats(struct nx_flowswitch *fsw,
    void *data, nexus_stats_type_t type)
{
	ASSERT(data != NULL);
	FSW_LOCK_ASSERT_HELD(fsw);

	switch (type) {
	case NEXUS_STATS_TYPE_FSW:
	{
		/* accumulate inet protocol stats into closed-port totals */
		struct __nx_stats_fsw *d, *__single s;
		d = fsw->fsw_closed_na_stats;
		s = data;
		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
		break;
	}
	case NEXUS_STATS_TYPE_CHAN_ERRORS:
	{
		/* channel error counters fold into the fsw stats block */
		struct __nx_stats_channel_errors *__single s = data;
		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}
2244 
2245 boolean_t
fsw_detach_barrier_add(struct nx_flowswitch * fsw)2246 fsw_detach_barrier_add(struct nx_flowswitch *fsw)
2247 {
2248 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2249 	if (__improbable(fsw->fsw_detach_flags != 0 ||
2250 	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
2251 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2252 		return FALSE;
2253 	}
2254 	fsw->fsw_detach_barriers++;
2255 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2256 
2257 	return TRUE;
2258 }
2259 
/*
 * Drop a detach barrier reference taken via fsw_detach_barrier_add().
 * When the last barrier is released and a detach thread is waiting,
 * wake it up.
 */
void
fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
	ASSERT(fsw->fsw_detach_barriers != 0);
	fsw->fsw_detach_barriers--;
	/* if there's a thread waiting to detach the interface, let it know */
	if (__improbable((fsw->fsw_detach_waiters > 0) &&
	    (fsw->fsw_detach_barriers == 0))) {
		fsw->fsw_detach_waiters = 0;
		wakeup(&fsw->fsw_detach_waiters);
	}
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
}
2275 
2276 /*
2277  * Generic resolver for non-Ethernet interfaces.
2278  */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; configure may have replaced the routes */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor gateway after (re)configure: unreachable */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}
2357 
/*
 * Read flowswitch-related boot-args; invoked once from fsw_init().
 */
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}
2364 
/*
 * One-time global flowswitch initialization: read boot-args and
 * register the interface and protocol-control event callbacks.
 * Safe to call more than once; only the first call does the work.
 */
void
fsw_init(void)
{
	static_assert(NX_FSW_CHUNK_FREE == (uint64_t) -1);
	static_assert(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);

	if (!__nx_fsw_inited) {
		fsw_read_boot_args();
		/*
		 * Register callbacks for interface & protocol events
		 * Use dummy arg for callback cookie.
		 */
		__nx_fsw_ifnet_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
		    ifnet_event, &fsw_ifnet_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);

		__nx_fsw_protoctl_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
		    protoctl_event, &fsw_protoctl_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
		__nx_fsw_inited = 1;
	}
}
2391 
2392 void
fsw_uninit(void)2393 fsw_uninit(void)
2394 {
2395 	if (__nx_fsw_inited) {
2396 		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
2397 		    __nx_fsw_ifnet_eventhandler_tag);
2398 		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
2399 		    __nx_fsw_protoctl_eventhandler_tag);
2400 
2401 		__nx_fsw_inited = 0;
2402 	}
2403 }
2404 
2405 struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)2406 fsw_alloc(zalloc_flags_t how)
2407 {
2408 	struct nx_flowswitch *fsw;
2409 	struct __nx_stats_fsw *__single nsfw;
2410 
2411 	SK_LOCK_ASSERT_HELD();
2412 
2413 	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
2414 	if (nsfw == NULL) {
2415 		return NULL;
2416 	}
2417 
2418 	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
2419 	if (fsw == NULL) {
2420 		zfree(nx_fsw_stats_zone, nsfw);
2421 		return NULL;
2422 	}
2423 
2424 	FSW_RWINIT(fsw);
2425 	fsw->fsw_dev_ch = NULL;
2426 	fsw->fsw_host_ch = NULL;
2427 	fsw->fsw_closed_na_stats = nsfw;
2428 
2429 	SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw));
2430 
2431 	return fsw;
2432 }
2433 
/*
 * Detach the flowswitch from its host port / interface.  When "purge"
 * is TRUE (flowswitch free path) this blocks until all barrier
 * holders drain and also terminates the flow manager; otherwise it
 * returns EBUSY if a detach is already in progress.
 */
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * return error if the host port detach is in progress
	 * or already detached.
	 * For the case of flowswitch free (i.e. purge is TRUE) we have to
	 * cleanup everything, so we will block if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit. Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * wait until all threads needing accesses to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * if the NA detach thread as well as the flowswitch free thread were
	 * both waiting, then the thread which wins the race is responsible
	 * for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		/* winner of the race tears down the interface binding */
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
		fsw_rxstrc_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release lock to let all
	 * outstanding agent threads to enter, followed by waiting until
	 * all of them exit the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2519 
/*
 * Free a flowswitch instance: force a full detach (purge), tear down
 * the datapath, release the closed-port stats object, and return the
 * memory to its zone.
 */
void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	/* purge == TRUE: blocks until the detach fully completes */
	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw %p FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}
2543