/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/netif/nx_netif_compat.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/pktsched/pktsched_netem.h>
#include <sys/eventhandler.h>

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

/*
 * Configures the flowswitch to use the user packet pool with
 * dual-sized buffers.  A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

uint32_t fsw_chain_enqueue = 0;
static int __nx_fsw_inited = 0;
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);

static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);

#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
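/*
 * Worked example (assuming NX_FSW_CHUNK_FREE is the all-ones 64-bit
 * pattern, ~0ULL): BMASK64(2, 5) first keeps bits 0..5 of the mask
 * (~0ULL >> 58 == 0x3F), then clears bits below bit 2 (& ~0x3),
 * yielding 0x3C, i.e. bits 2..5 set.
 */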

static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);

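/*
 * Attach a virtual-port (vp) adapter to the flowswitch for a channel
 * open request.  An adapter already occupying the requested nexus port
 * is reused (fsw_port_alloc() returns it with a reference held);
 * otherwise a new vp adapter is created, attached to the flowswitch,
 * and bound to the port it picked up.  On failure, the adapter
 * reference taken here is dropped and *vpna is reset to NULL.
 */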
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
#pragma unused(ch)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	char *cr_name = chr->cr_name;
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	/* attach vp to fsw */
	err = fsw_vp_na_attach(nx, cr_name, &(*vpna)->vpna_up);
	if (err != 0) {
		SK_ERR("vpna \"%s\" fsw attach failed (err %d)",
		    (*vpna)->vpna_up.na_name, err);
		goto out;
	}

	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		if (err != 0) {
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}

static int
fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
#pragma unused(fsw)
	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;

	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
		return EINVAL;
	}

	/* it's a netif below */
	return 0;
}

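/*
 * Process NXCFG_CMD_FLOW_ADD.  Requests from user space are limited
 * to non-special ports (>= FSW_VP_USER_MIN) and always get flow
 * tracking and flow advisory enabled, while kernel (bsd) flows must
 * not request either.  The request is internalized as needed and
 * then handed to fsw_flow_add().
 */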
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special ports shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	if (req->nfr_epid == -1) {
		req->nfr_epid = proc_pid(p);
	}

	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		/* user space doesn't need these flow stats */
		flow_stats_release(req->nfr_flow_stats);
	}
	if (p != kernproc) {
		nx_flow_req_externalize(req);
	}

	return error;
}

static int
fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	int err;

	nx_flow_req_internalize(req);
	req->nfr_pid = proc_pid(p);
	err = fsw_flow_del(fsw, req, TRUE, NULL);

	nx_flow_req_externalize(req);
	return err;
}

#if (DEVELOPMENT || DEBUG)
static int
fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct nx_flowswitch *fsw = arg1;
	uint32_t nthreads;
	int changed;
	int error;

	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
	if (error == 0 && changed != 0) {
		error = fsw_rps_set_nthreads(fsw, nthreads);
	}
	return error;
}
#endif /* !DEVELOPMENT && !DEBUG */

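/*
 * Bind the flowswitch to the interface backing the netif below it:
 * create the IP fragment manager, install per-family framing and
 * resolver callbacks, pick single- or multi-buflet packet copy
 * routines, set up the classq and netagent, and kick the reaper
 * thread once everything is in place.
 */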
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non-TXSTART interfaces.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;

	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		atomic_bitclear_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}

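/*
 * Undo fsw_setup_ifp(): unregister the netagent, destroy the frag
 * manager and skoid, tear down the classq, and set NXF_REJECT on the
 * nexus so channels over existing adapters cease to function.
 * Called with the SK lock and the flowswitch write lock held.
 */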
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent " : ""),
	    if_name(ifp), ifp->if_family);

	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	atomic_bitset_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		membar_sync();
	}

	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
}

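/*
 * Final setup step once the host channel to the netif is open: make
 * sure an ifnet is attached (and, for native adapters, that it uses a
 * single RX ring), clear stale detach state, bind the interface via
 * fsw_setup_ifp(), and publish the interface index in the nexus
 * provider parameters.
 */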
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);

	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}

static int
fsw_host_teardown(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;

	SK_LOCK_ASSERT_HELD();
	return fsw_detach(fsw, hwna, FALSE);
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
fsw_ctl_attach_log(const struct nx_spec_req *nsr,
    const struct kern_nexus *nx, int err)
{
	uuid_string_t uuidstr, ifuuidstr;
	const char *nustr;

	if (nsr->nsr_flags & NXSPECREQ_UUID) {
		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
		    SK_KVA(nsr->nsr_ifp));
		nustr = uuidstr;
	} else {
		nustr = nsr->nsr_name;
	}

	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
}
#endif /* SK_LOG */

SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
{
	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	if (set) {
		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
	} else {
		netif_hwna_clear_mode(hwna);
	}
}

SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
{
	fsw_netif_set_callbacks_common(fsw, TRUE);
}

SK_NO_INLINE_ATTRIBUTE
static void
fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
{
	fsw_netif_set_callbacks_common(fsw, FALSE);
}

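/*
 * Datapath bring-up and teardown.  fsw_dp_start() points the netif
 * dev adapter at fsw_devna_rx and starts both special channels;
 * fsw_dp_stop() marks the switch quiesced (returning EALREADY if it
 * already is), drains in-flight data when needed, then stops the
 * channels and restores the netif callbacks.
 */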
SK_NO_INLINE_ATTRIBUTE
static void
fsw_dp_start(struct nx_flowswitch *fsw)
{
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);

	fsw_netif_set_callbacks(fsw);
	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}

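/*
 * Open/close the special channels to the netif's dev and host nexus
 * ports via ch_open_special()/ch_close_special(); these back the thin
 * fsw_{devna,hostna}_{setup,teardown}() wrappers further below.
 */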
SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
    boolean_t host)
{
	struct chreq chr;
	struct kern_channel *ch;
	int err;

	bzero(&chr, sizeof(chr));
	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);

	err = 0;
	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
	if (ch == NULL) {
		SK_ERR("ch_open_special(%s) failed: %d",
		    host ? "host" : "dev", err);
		return err;
	}
	if (host) {
		fsw->fsw_host_ch = ch;
	} else {
		fsw->fsw_dev_ch = ch;
	}
	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
{
	struct kern_channel *ch;

	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
	if (ch == NULL) {
		return EINVAL;
	}
	if (host) {
		fsw->fsw_host_ch = NULL;
	} else {
		fsw->fsw_dev_ch = NULL;
	}
	ch_close_special(ch);
	(void) ch_release_locked(ch);
	return 0;
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
{
	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_devna_teardown(struct nx_flowswitch *fsw)
{
	return fsw_netif_port_teardown(fsw, FALSE);
}

SK_NO_INLINE_ATTRIBUTE
static int
fsw_hostna_teardown(struct nx_flowswitch *fsw)
{
	return fsw_netif_port_teardown(fsw, TRUE);
}

/* Process NXCFG_CMD_ATTACH */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}

SK_NO_INLINE_ATTRIBUTE
static void
fsw_cleanup(struct nx_flowswitch *fsw)
{
	int err;
	struct ifnet *ifp = NULL;

	if (fsw->fsw_dev_ch == NULL) {
		ASSERT(fsw->fsw_host_ch == NULL);
		return;
	}
	err = fsw_dp_stop(fsw, &ifp);
	if (err != 0) {
		return;
	}
	err = fsw_host_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_hostna_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_devna_teardown(fsw);
	VERIFY(err == 0);

	if (ifp != NULL) {
		ifnet_datamov_resume(ifp);
	}
}

int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}

static int
fsw_netem_config(struct nx_flowswitch *fsw, void *data)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	struct if_netem_params *params = data;
	int ret;

	if (ifp == NULL) {
		return ENODEV;
	}

	SK_LOCK_ASSERT_HELD();
#define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
#define fsw_INPUT_NETEM_THREADNAME_LEN       32
	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
	(void) snprintf(netem_name, sizeof(netem_name),
	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
	ret = netem_config(&ifp->if_input_netem, netem_name, ifp, params, fsw,
	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);

	return ret;
}

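/*
 * Nexus configuration entry point for the flowswitch.  Flow add and
 * delete requests are validated up front (non-null flow UUID, flags
 * masked for user space, and a socket-delegate privilege check when
 * adding a flow on behalf of another process) before the command is
 * dispatched to its handler.
 */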
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct nx_spec_req *nsr = data;
	struct nx_flow_req *req = data;
	boolean_t need_check;
	int error = 0;

	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}
		break;

	default:
		break;
	}

	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:        /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:        /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;

	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}

struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet *ifp)
{
	struct nx_flowswitch *fsw = NULL;

	if (ifp->if_na != NULL) {
		fsw = ifp->if_na->nifna_netif->nif_fsw;
	}
	return fsw;
}

static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			atomic_add_32(&fsw->fsw_src_lla_gencnt, 1);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}

static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *fsw = NULL;
	struct flow_entry *fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure the
	 * protocol event received is for bytes in flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better route events.
	 * Also redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}

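/*
 * Add or remove the flowswitch netagent on the attached interface.
 * Adding fails with EEXIST if the agent is already added, and with
 * EBUSY on bridged interfaces (see rdar://107076453); removing fails
 * with ENOENT if it was never added.
 */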
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else if (fsw->fsw_ifp->if_bridge != NULL) {
			/* see rdar://107076453 */
			SK_ERR("%s is bridged, not adding netagent",
			    if_name(fsw->fsw_ifp));
			error = EBUSY;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				    |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}

void
fsw_netagent_update(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NULL;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		goto out;
	}
	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);
	if (fsw->fsw_agent_session == NULL) {
		goto out;
	}
	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
	if (ip_agent || transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
	}
	if (transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
	}
	if (ip_agent) {
		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	} else {
		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	}
	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
		SK_D("flowswitch netagent updated for interface %s",
		    if_name(fsw->fsw_ifp));
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
}

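/*
 * Constructor invoked when a vp adapter comes to occupy a nexus port.
 * Regular (user) ports require an interface on the host port, and
 * pick up a default channel mitigation interval keyed off the
 * interface type (WiFi, cellular, ethernet, or the default).
 */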
static int
fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    const struct nxbind *nxb)
{
#pragma unused(nxb)
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
	    vpna->vpna_pid == nxb->nxb_pid);

	/*
	 * Reject regular channel open requests unless there is
	 * something attached to the host port of the flowswitch.
	 */
	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
		struct nexus_adapter *na = &vpna->vpna_up;
		struct ifnet *ifp = fsw->fsw_ifp;

		if (ifp == NULL) {
			err = ENXIO;
			goto done;
		}

		/* if adapter supports mitigation, set default value */
		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
			if (IFNET_IS_WIFI(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
			} else if (IFNET_IS_CELLULAR(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
			} else if (IFNET_IS_ETHERNET(ifp)) {
				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
			} else {
				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
			}
		}
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);

	return err;
}

static bool
fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	nexus_port_t nx_port = vpna->vpna_nx_port;
	uint32_t purge_cnt;

	ASSERT(fsw == vpna->vpna_fsw);
	ASSERT(nx_port != NEXUS_PORT_ANY);

	/*
	 * If this nexus port was bound to a PID, we just need to look at a
	 * single bucket and iterate from there.  Note that in any case, we
	 * can't just search for a single flow_owner based on the PID itself,
	 * since a given process may be opening multiple channels to the
	 * flowswitch; hence we search for the ones matching this nexus port.
	 *
	 * Close any open flows on the port and remove the flow owner and
	 * nexus port binding.
	 */
	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, nx_port, FALSE);

	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
	    "purged %u", SK_KVA(fsw), (int)nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
	    purge_cnt);

	return purge_cnt != 0;
}

/*
 * Flowswitch nexus port allocator.
 *
 * A nexus port is represented by a bit in the port bitmap; its state is
 * either free or allocated.  A free state implies that the port has no
 * nxbind AND no nexus adapter association.  An allocated state means that
 * either it has a nxbind OR a nexus adapter association.  This routine
 * manages the nexus adapter association with a nexus port; nxbind is
 * handled separately via nx_fsw_port_bind().
 *
 * The caller of this routine may optionally pass in a NULL nexus adapter.
 * In such a case (*vpna is NULL), this routine checks to see if the port
 * has already been associated with an adapter, and returns a reference to
 * that adapter.  No action is taken on a port that doesn't have an adapter
 * associated.  Otherwise (*vpna is non-NULL), this routine associates that
 * adapter with a port that's not already associated with one; the reference
 * to the adapter is untouched here, as the caller is expected to handle it.
 *
 * The flowswitch code invokes this routine each time it is requested to
 * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
 * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
 * This allows for multiple channels to be opened to a nexus port, each
 * time holding a reference to that same nexus adapter.  The releasing of
 * the nexus port only happens when the last channel closes.
 */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw 0x%llx nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}

int
fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
    boolean_t ifattach, boolean_t host)
{
	int err = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	if (ifattach) {
		/* override port to either FSW_VP_{HOST,DEV} */
		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
		/* allocate reserved port for ifattach */
		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
	} else if (host) {
		/* host is valid only for ifattach */
		err = EINVAL;
	} else {
		/* nexus port otherwise (reserve dev and host for ifattach) */
		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
	}

	return err;
}

/*
 * Remove nexus port association from a nexus adapter.  This call is
 * the opposite of fsw_port_alloc(), except that it is called only
 * at nx_fsw_vp_na_dtor() destructor time.  See above notes
 * on fsw_port_alloc().
 */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
}

int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}

int
fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
{
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);

	/*
	 * During defunct, we want to purge all flows associated to this
	 * port and the flow owner as well.  This is accomplished as part
	 * of calling the port's destructor.  However, we still want to
	 * occupy the nexus port since there's a channel open to it.
	 */
	FSW_WLOCK(fsw);
	if (!vpna->vpna_defunct) {
		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
	} else {
		err = EALREADY;
	}
	FSW_WUNLOCK(fsw);

	return err;
}

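/*
 * MIB export of per-flow statistics.  With a flow-ID or 5-tuple
 * filter this copies out at most one sk_stats_flow entry; otherwise
 * it walks all flow entries, including those lingering on the
 * deferred free list.  The space actually required is returned, so a
 * caller may pass out == NULL to size its buffer first.
 */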
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sa.sa_family == AF_INET &&
		    itpl->itpl_remote_sa.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sa.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sa.sa_family,
			    itpl->itpl_remote_sa.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
			return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

	/*
	 * Also return the ones in the deferred free list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
		actual_space += sf_size;
		if (out == NULL || actual_space > len) {
			continue;
		}

		flow_entry_stats_get(fe, sf);
		sf++;
	}
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return actual_space;
}

static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		if (arn->arn_flowadv_obj == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strlcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		sfae = &sfa->sfa_entries[0];
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		sfa = (struct sk_stats_flow_adv *)
		    ((uintptr_t)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}
1765 
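/*
 * Convert an in-kernel flow_owner into its exported
 * sk_stats_flow_owner representation.
 */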
static inline void
fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
    struct sk_stats_flow_owner *sfo)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;

	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strlcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);
	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));

	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
	    fo->fo_name);
	sfo->sfo_pid = fo->fo_pid;
	sfo->sfo_nx_port = fo->fo_nx_port;
	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
}

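/*
 * Copy out one sk_stats_flow_owner record per flow owner, walking
 * each owner bucket's RB tree under that bucket's lock.
 */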
static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_owner *sfo = out;
	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
	struct flow_owner *fo;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * Ideally we'd like to hide the bucket-level details from the
	 * flow library user, but there is no simple way to iterate
	 * flow_owner with buckets/RB_TREE nested.  So keep it as is.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			actual_space += sfo_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fo2sfo(fsw, fo, sfo);
			sfo++;
		}
		FOB_UNLOCK(fob);
	}

	return actual_space;
}

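/*
 * Convert an in-kernel flow_route into its exported
 * sk_stats_flow_route representation, translating each FLOWRTF_*
 * flag to its SFLOWRTF_* equivalent and optionally scrubbing the
 * link-layer destination address.
 */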
static inline void
fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
{
	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);

	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;

	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
	}
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		sfr->sfr_flags |= SFLOWRTF_ONLINK;
	}
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
	}
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
	}
	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
	}
	if (fr->fr_flags & FLOWRTF_DELETED) {
		sfr->sfr_flags |= SFLOWRTF_DELETED;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
	}

	lck_spin_lock(&fr->fr_reflock);
	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
	if (fr->fr_expire != 0) {
		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
	} else {
		sfr->sfr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);

	sfr->sfr_laddr = fr->fr_laddr;
	sfr->sfr_faddr = fr->fr_faddr;
	sfr->sfr_gaddr = fr->fr_gaddr;

	if (ll_scrub) {
		static const uint8_t unspec[ETHER_ADDR_LEN] = { [0] = 2 };
		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
	} else {
		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
		    ETHER_ADDR_LEN);
	}
}

#if CONFIG_MACF
extern int dlil_lladdr_ckreq;
#endif /* CONFIG_MACF */

static size_t
fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_route *sfr = out;
	size_t sfr_size = sizeof(struct sk_stats_flow_route);
	struct flow_route *fr;
	boolean_t ll_scrub;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * To get the link-layer info, the caller must have the following
	 * in their sandbox profile (or not be sandboxed at all), else we
	 * scrub it clean just like dlil_ifaddr_bytes() does:
	 *
	 * (allow system-info (info-type "net.link.addr"))
	 *
	 * If scrubbed, we return 02:00:00:00:00:00.
	 */
#if CONFIG_MACF
	ll_scrub = (dlil_lladdr_ckreq &&
	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
#else /* !CONFIG_MACF */
	ll_scrub = FALSE;
#endif /* !CONFIG_MACF */

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		FRB_RLOCK(frb);
		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
			actual_space += sfr_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
			sfr++;
		}
		FRB_UNLOCK(frb);
	}

	return actual_space;
}

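/*
 * Populate a sk_stats_userstack record from a nexus's saved
 * user-stack statistics, copying only the protocol groups selected
 * by the filter type.
 */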
static inline void
fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
{
	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);
	sus->sus_owner_pid = pid;

	if (filter->nmf_type & NXMIB_IP_STATS) {
		sus->sus_ip  = nxs->nxs_ipstat;
	}

	if (filter->nmf_type & NXMIB_IP6_STATS) {
		sus->sus_ip6 = nxs->nxs_ip6stat;
	}

	if (filter->nmf_type & NXMIB_TCP_STATS) {
		sus->sus_tcp = nxs->nxs_tcpstat;
	}

	if (filter->nmf_type & NXMIB_UDP_STATS) {
		sus->sus_udp = nxs->nxs_udpstat;
	}

	if (filter->nmf_type & NXMIB_QUIC_STATS) {
		sus->sus_quic = nxs->nxs_quicstat;
	}
}

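/*
 * Copy out per-process user networking stack statistics: first the
 * aggregate stats saved from closed ports, then one record per open
 * channel, subject to the optional PID filter.
 */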
static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/* copyout saved stats from closed ports */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a process opens only one channel to a nexus, so
	 * we don't aggregate inet stats per process for now; doing so
	 * would require a substantial amount of code.
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}

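/*
 * Copy out the flowswitch-wide statistics as a single
 * sk_stats_flow_switch record.
 */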
static size_t
fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
{
	struct sk_stats_flow_switch *sfs = out;
	size_t actual_space = sizeof(struct sk_stats_flow_switch);

	if (out != NULL && actual_space <= len) {
		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
		(void) strlcpy(sfs->sfs_if_name,
		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
		sfs->sfs_fsws = fsw->fsw_stats;
	}

	return actual_space;
}

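/*
 * MIB dispatcher: hand the request to the handler matching the
 * filter type.  Each handler returns the space required rather than
 * the space written, so a caller can size its buffer in two passes,
 * e.g. (an illustrative sketch, not an actual caller in this file):
 *
 *	need = fsw_mib_get(fsw, &filter, NULL, 0, p);	-- probe size
 *	buf = <allocate "need" bytes>;
 *	(void) fsw_mib_get(fsw, &filter, buf, need, p);	-- fill buffer
 */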
size_t
fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
	size_t ret;

	switch (filter->nmf_type) {
	case NXMIB_FSW_STATS:
		ret = fsw_mib_get_stats(fsw, out, len);
		break;
	case NXMIB_FLOW:
		ret = fsw_mib_get_flow(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_OWNER:
		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ROUTE:
		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
		break;
	case NXMIB_TCP_STATS:
	case NXMIB_UDP_STATS:
	case NXMIB_IP_STATS:
	case NXMIB_IP6_STATS:
	case NXMIB_USERSTACK_STATS:
		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ADV:
		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
		break;
	default:
		ret = 0;
		break;
	}

	return ret;
}

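/*
 * Fold statistics of the given type into the flowswitch's
 * accumulators, e.g. preserving a port's protocol stats in
 * fsw_closed_na_stats when it goes away.
 */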
void
fsw_fold_stats(struct nx_flowswitch *fsw,
    void *data, nexus_stats_type_t type)
{
	ASSERT(data != NULL);
	FSW_LOCK_ASSERT_HELD(fsw);

	switch (type) {
	case NEXUS_STATS_TYPE_FSW:
	{
		struct __nx_stats_fsw *d, *s;
		d = fsw->fsw_closed_na_stats;
		s = data;
		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
		break;
	}
	case NEXUS_STATS_TYPE_CHAN_ERRORS:
	{
		struct __nx_stats_channel_errors *s = data;
		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

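/*
 * Detach barriers keep the flowswitch's interface and netagent
 * session alive while a thread uses them outside of sk_lock; the
 * expected calling pattern is (an illustrative sketch):
 *
 *	if (!fsw_detach_barrier_add(fsw))
 *		return;			-- detaching; don't touch fsw_ifp
 *	...use fsw->fsw_ifp / fsw->fsw_agent_session...
 *	fsw_detach_barrier_remove(fsw);
 *
 * fsw_detach() below waits for all barriers to drain before tearing
 * the interface down.
 */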
boolean_t
fsw_detach_barrier_add(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	if (__improbable(fsw->fsw_detach_flags != 0 ||
	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return FALSE;
	}
	fsw->fsw_detach_barriers++;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	return TRUE;
}

void
fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
	ASSERT(fsw->fsw_detach_barriers != 0);
	fsw->fsw_detach_barriers--;
	/* if there's a thread waiting to detach the interface, let it know */
	if (__improbable((fsw->fsw_detach_waiters > 0) &&
	    (fsw->fsw_detach_barriers == 0))) {
		fsw->fsw_detach_waiters = 0;
		wakeup(&fsw->fsw_detach_waiters);
	}
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
}

/*
 * Generic resolver for non-Ethernet interfaces.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
		atomic_set_32(&fr->fr_want_probe, 0);
	} else {
		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}

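/*
 * Parse flowswitch boot-args.  One illustrative way to set the knob
 * on a test machine (the value shown is purely an example):
 *
 *	nvram boot-args="fsw_use_dual_sized_pool=0"
 */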
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}

void
fsw_init(void)
{
	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);

	if (!__nx_fsw_inited) {
		fsw_read_boot_args();
		/*
		 * Register callbacks for interface & protocol events.
		 * Use a dummy arg for the callback cookie.
		 */
		__nx_fsw_ifnet_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
		    ifnet_event, fsw_ifnet_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);

		__nx_fsw_protoctl_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
		    protoctl_event, fsw_protoctl_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
		__nx_fsw_inited = 1;
	}
}

void
fsw_uninit(void)
{
	if (__nx_fsw_inited) {
		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
		    __nx_fsw_ifnet_eventhandler_tag);
		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
		    __nx_fsw_protoctl_eventhandler_tag);

		__nx_fsw_inited = 0;
	}
}

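/*
 * Allocate a flowswitch instance along with its closed-port stats
 * block; returns NULL if either zone allocation fails.
 */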
struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)
{
	struct nx_flowswitch *fsw;
	struct __nx_stats_fsw *nsfw;

	SK_LOCK_ASSERT_HELD();

	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
	if (nsfw == NULL) {
		return NULL;
	}

	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
	if (fsw == NULL) {
		zfree(nx_fsw_stats_zone, nsfw);
		return NULL;
	}

	FSW_RWINIT(fsw);
	fsw->fsw_dev_ch = NULL;
	fsw->fsw_host_ch = NULL;
	fsw->fsw_closed_na_stats = nsfw;

	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));

	return fsw;
}

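/*
 * Detach the flowswitch from its interface.  When purge is TRUE
 * (instance teardown), block until all detach barriers drain and
 * clean up everything; otherwise fail with EBUSY if a detach is
 * already in progress.
 */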
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * Return an error if the host port detach is in progress or
	 * already done.  For the case of flowswitch free (i.e. purge
	 * is TRUE) we have to clean up everything, so we will block
	 * if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * Mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit.  Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * Wait until all threads needing access to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * If the NA detach thread and the flowswitch free thread were
	 * both waiting, then the thread which wins the race is
	 * responsible for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release the lock to let
	 * all outstanding agent threads enter, then wait until all of
	 * them have exited the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}

void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}