xref: /xnu-8792.61.2/bsd/skywalk/nexus/flowswitch/fsw.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
54 #include <skywalk/os_skywalk_private.h>
55 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
56 #include <skywalk/nexus/flowswitch/fsw_var.h>
57 #include <skywalk/nexus/netif/nx_netif.h>
58 #include <skywalk/nexus/netif/nx_netif_compat.h>
59 
60 #include <net/bpf.h>
61 #include <net/if.h>
62 #include <net/pktsched/pktsched_netem.h>
63 #include <sys/eventhandler.h>
64 
#if (DEVELOPMENT || DEBUG)
/* expose the chain-enqueue toggle on development/debug kernels only */
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

/*
 * Configures the flowswitch to utilize user packet pool with
 * dual sized buffers.
 * A non-zero value enables the support.
 */
#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX)
uint32_t fsw_use_dual_sized_pool = 1;
#else
uint32_t fsw_use_dual_sized_pool = 0;
#endif

uint32_t fsw_chain_enqueue = 0;
/* module init state and registered ifnet/protoctl event handler tags */
static int __nx_fsw_inited = 0;
static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;

/* zone for struct nx_flowswitch allocations; memory zeroed on free */
static ZONE_DEFINE(nx_fsw_zone, SKMEM_ZONE_PREFIX ".nx.fsw",
    sizeof(struct nx_flowswitch), ZC_ZFREE_CLEARMEM);

/* zone for per-flowswitch statistics blocks; memory zeroed on free */
static ZONE_DEFINE(nx_fsw_stats_zone, SKMEM_ZONE_PREFIX ".nx.fsw.stats",
    sizeof(struct __nx_stats_fsw), ZC_ZFREE_CLEARMEM);

/* allocation tags for the flowswitch's dynamically-sized tables */
#define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);

#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);

#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);

#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);

#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))

/* forward declaration; defined later in this file */
static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge);
113 
/*
 * Attach a channel client to a virtual port (vpna) on this flowswitch.
 * Reuses an existing adapter when one is already bound to the requested
 * nexus port; otherwise creates a new vp adapter, attaches it to the
 * flowswitch, and binds it to the port it was assigned.  On success,
 * *vpna holds a retained adapter reference; on failure *vpna is NULL.
 */
int
fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_vp_adapter **vpna)
{
#pragma unused(ch)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	char *cr_name = chr->cr_name;
	int err = 0;

	SK_LOCK_ASSERT_HELD();
	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
	*vpna = NULL;

	/* if there's an existing adapter on the nexus port then use it */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

	if (err != 0) {
		ASSERT(*vpna == NULL);
		goto out;
	} else if (*vpna != NULL) {
		/*
		 * Use the existing adapter on that port; fsw_port_alloc()
		 * callback has retained a reference count on the adapter.
		 */
		goto out;
	}
	ASSERT(*vpna == NULL);

	/* create a virtual port; callee holds vpna ref */
	err = fsw_vp_na_create(nx, chr, vpna);
	if (err != 0) {
		SK_ERR("vpna create failed (err %d)", err);
		goto out;
	}

	/* attach vp to fsw */
	err = fsw_vp_na_attach(nx, cr_name, &(*vpna)->vpna_up);
	if (err != 0) {
		SK_ERR("vpna \"%s\" fsw attach failed (err %d)",
		    (*vpna)->vpna_up.na_name, err);
		goto out;
	}

	/* bind the newly-created adapter to the port it was assigned */
	FSW_WLOCK(fsw);
	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
	FSW_WUNLOCK(fsw);

out:
	if ((*vpna) != NULL) {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
		    cr_name, (int)(*vpna)->vpna_nx_port, err);

		/* on failure, drop the reference held on our behalf */
		if (err != 0) {
			na_release_locked(&(*vpna)->vpna_up);
			*vpna = NULL;
		}
	}

	return err;
}
180 
181 static int
fsw_nx_check(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)182 fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
183 {
184 #pragma unused(fsw)
185 	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
186 
187 	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
188 		return EINVAL;
189 	}
190 
191 	/* it's a netif below */
192 	return 0;
193 }
194 
/*
 * Handle NXCFG_CMD_FLOW_ADD: validate the flow request, normalize its
 * kernel-only fields, and register the flow with the flowswitch.
 * User-issued requests are internalized on entry and externalized on
 * exit; kernel (bsd) flows bypass that conversion.
 */
static int
fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
    struct nx_flow_req *req)
{
	struct flow_owner *fo;
	int error = 0;

	ASSERT(p != PROC_NULL);

	if (p != kernproc) {
		/* special port shouldn't be bound via this method */
		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
			return EINVAL;
		}
		/* user flows always get tracking and flow-advisory support */
		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
	} else {
		/* no flow track or advisory support for bsd flow */
		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
	}

	/* init kernel only fields */
	if (p != kernproc) {
		nx_flow_req_internalize(req);
	}
	req->nfr_pid = proc_pid(p);
	/* default the effective pid to the issuing process */
	if (req->nfr_epid == -1) {
		req->nfr_epid = proc_pid(p);
	}

	/* bound the demux pattern count before handing off */
	if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
		SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
		return EINVAL;
	}

	fo = fsw_flow_add(fsw, req, &error);
	ASSERT(fo != NULL || error != 0);

	if (error == 0) {
		// user space don't need this flow stats
		flow_stats_release(req->nfr_flow_stats);
	}
	if (p != kernproc) {
		nx_flow_req_externalize(req);
	}

	return error;
}
244 
245 static int
fsw_ctl_flow_del(struct nx_flowswitch * fsw,struct proc * p,struct nx_flow_req * req)246 fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
247     struct nx_flow_req *req)
248 {
249 	int err;
250 
251 	nx_flow_req_internalize(req);
252 	req->nfr_pid = proc_pid(p);
253 	err = fsw_flow_del(fsw, req, TRUE, NULL);
254 
255 	nx_flow_req_externalize(req);
256 	return err;
257 }
258 
259 #if (DEVELOPMENT || DEBUG)
260 static int
261 fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
262 {
263 #pragma unused(oidp, arg2)
264 	struct nx_flowswitch *fsw = arg1;
265 	uint32_t nthreads;
266 	int changed;
267 	int error;
268 
269 	error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
270 	    sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
271 	if (error == 0 && changed != 0) {
272 		error = fsw_rps_set_nthreads(fsw, nthreads);
273 	}
274 	return error;
275 }
276 #endif /* !DEVELOPMENT && !DEBUG */
277 
/*
 * Plumb the flowswitch to the interface owned by the netif host
 * adapter 'hwna': set up family-specific framing/demux callbacks,
 * the IP fragment manager, packet-copy routines, classq, and the
 * netagent.  Called with the SK lock held; takes FSW_WLOCK for the
 * duration and the detach-barrier lock around the ifp publication.
 */
static int
fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	int error = 0;
	struct ifnet *ifp = hwna->na_ifp;
	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
	/* cap the fragment manager at half the kernel metadata objects */
	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;

	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
	    (hwna->na_type == NA_NETIF_COMPAT_HOST));

	SK_LOCK_ASSERT_HELD();

	/*
	 * XXX: we don't support non TXSTART interface.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
		    SK_KVA(ifp));
		return ENOTSUP;
	}

	FSW_WLOCK(fsw);

	/* no interface may already be plumbed to this flowswitch */
	ASSERT(fsw->fsw_ifp == NULL);
	ASSERT(fsw->fsw_nifna == NULL);
	ASSERT(fsw->fsw_resolve == NULL);
	ASSERT(fsw->fsw_frame == NULL);
	ASSERT(fsw->fsw_demux == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);

	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
	if (fsw->fsw_ipfm == NULL) {
		FSW_WUNLOCK(fsw);
		return ENOMEM;
	}

	/* pick framing/resolve callbacks and BPF DLT by interface family */
	switch (ifp->if_family) {
	case IFNET_FAMILY_ETHERNET:
		error = fsw_ethernet_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_EN10MB;
		break;

	case IFNET_FAMILY_CELLULAR:
		error = fsw_cellular_setup(fsw, ifp);
		fsw->fsw_ifp_dlt = DLT_RAW;
		break;

	default:
		/* ipsec/utun tunnels carry raw IP */
		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
		    ifp->if_family == IFNET_FAMILY_UTUN) {
			error = fsw_ip_setup(fsw, ifp);
			fsw->fsw_ifp_dlt = DLT_RAW;
			break;
		}
		error = ENOTSUP;
		break;
	}

	if (error != 0) {
		FSW_WUNLOCK(fsw);
		return error;
	}

	ASSERT(fsw->fsw_resolve != NULL);

	/* multi-buflet pools need the multi-buflet copy routines */
	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
	} else {
		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
	}

	/*
	 * Since it is possible for fsw to refer to the ifp after all
	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
	 * an extra reference to the ifp here.
	 *
	 * We also cache the netif adapter of the interface, as it's
	 * needed for each packet enqueued to the classq.  There is no
	 * need to retain a refcnt for the same reason as above.
	 *
	 * We hold the busy lock across these, just in case an interface
	 * detach and reattach happens, as fsw_flow_bind() relies on the
	 * same lock as well before making its checks.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);

	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
	fsw->fsw_ifp = ifp;
	fsw->fsw_nifna = &ifp->if_na->nifna_up;
	ifp->if_na->nifna_netif->nif_fsw = fsw;
	ifp->if_na->nifna_netif->nif_fsw_nxadv =
	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
	    if_name(ifp), IFNAMSIZ);

	fsw_classq_setup(fsw, hwna);
	fsw->fsw_classq_enabled = TRUE;
	fsw->fsw_src_lla_gencnt = 0;

	/* rename the reaper thread after the newly-attached interface */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, ifp->if_xname, "");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);

	error = fsw_netagent_register(fsw, ifp);
	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "fsw_netagent_register %s (family %u) (err %d)",
	    if_name(ifp), ifp->if_family, error);

	/*
	 * Clear NXF_REJECT to allow new channels to be opened
	 * to this nexus, in case this is an interface reattach.
	 * Otherwise this flag should already be cleared.
	 */
	if (error == 0) {
		atomic_bitclear_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);
	}

	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	/*
	 * Wake up the reaper thread.
	 */
	if (error == 0) {
		fsw_reap_sched(fsw);
	}

	/* init skoid */
	skoid_create(&fsw->fsw_skoid,
	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
	    CTLFLAG_RW);

#if (DEVELOPMENT || DEBUG)
	/* RPS thread-count knob only for native skywalk interfaces */
	if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
		skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
		    fsw_rps_threads_sysctl, fsw, 0);
	}
#endif /* !DEVELOPMENT && !DEBUG */

	FSW_WUNLOCK(fsw);

	return error;
}
431 
/*
 * Undo fsw_setup_ifp(): unregister the netagent, tear down the
 * fragment manager, skoid, classq, and the cached ifp/netif state.
 * Sets NXF_REJECT so existing channels cease to function.  Called
 * with both the SK lock and the flowswitch write lock held.
 */
static void
fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
{
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	FSW_WLOCK_ASSERT_HELD(fsw);
	ifp = fsw->fsw_ifp;
	ASSERT(ifp != NULL);
	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);

	fsw_netagent_unregister(fsw, ifp);

	if (fsw->fsw_ipfm != NULL) {
		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
	}

	skoid_destroy(&fsw->fsw_skoid);

	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
	    ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
	    if_name(ifp), ifp->if_family);

	/* hwna may already be gone if adapters were freed first */
	if (hwna != NULL) {
		fsw_classq_teardown(fsw, hwna);
	}

	/*
	 * Set NXF_REJECT on the nexus, which would cause existing adapters
	 * to be marked similarly; channels associated with them would then
	 * cease to function.
	 */
	atomic_bitset_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);

	/* see notes on fsw_na_attach() about I/O refcnt */
	if (ifp->if_na != NULL) {
		ifp->if_na->nifna_netif->nif_fsw = NULL;
		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
		/* make the NULLs visible before we drop our own state */
		membar_sync();
	}

	/* clear all per-interface callbacks and cached pointers */
	fsw->fsw_ifp = NULL;
	fsw->fsw_nifna = NULL;
	fsw->fsw_resolve = NULL;
	fsw->fsw_frame = NULL;
	fsw->fsw_frame_headroom = 0;
	fsw->fsw_demux = NULL;
	fsw->fsw_classq_enabled = FALSE;
	fsw->fsw_pkt_copy_from_pkt = NULL;
	fsw->fsw_pkt_copy_from_mbuf = NULL;
	fsw->fsw_pkt_copy_to_mbuf = NULL;

	/* drop any netem instance configured on the input path */
	if (ifp->if_input_netem != NULL) {
		netem_destroy(ifp->if_input_netem);
		ifp->if_input_netem = NULL;
	}

	/* rename the reaper thread to reflect the detached state */
	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
}
495 
/*
 * Finish attaching the flowswitch to the netif host adapter opened via
 * fsw_hostna_setup(): validate the underlying ifnet, clear the detach
 * flags under the barrier lock, plumb the interface, and publish the
 * interface index in the nexus provider parameters.
 */
static int
fsw_host_setup(struct nx_flowswitch *fsw)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;

	SK_LOCK_ASSERT_HELD();

	hwna = fsw->fsw_host_ch->ch_na;
	ASSERT(hwna != NULL);


	/* the netif below must have an ifnet attached (dev/host port) */
	if ((ifp = hwna->na_ifp) == NULL) {
		return ENXIO;
	}

	/*
	 * XXX: we don't support multiple rx rings yet.
	 * There are assumptions in fsw_port_flush_enqueue_dst() about
	 * single threaded write to destination rings.
	 */
	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
		    SK_KVA(ifp), hwna->na_num_rx_rings);
		return ENOTSUP;
	}

	/* refuse to set up while a detach is already in flight */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	fsw->fsw_detach_flags = 0;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	int error = fsw_setup_ifp(fsw, hwna);
	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
	if (error != 0) {
		return error;
	}

	/* update the interface index */
	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
	return 0;
}
543 
544 static int
fsw_host_teardown(struct nx_flowswitch * fsw)545 fsw_host_teardown(struct nx_flowswitch *fsw)
546 {
547 	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
548 
549 	SK_LOCK_ASSERT_HELD();
550 	return fsw_detach(fsw, hwna, FALSE);
551 }
552 
553 #if SK_LOG
554 /* Hoisted out of line to reduce kernel stack footprint */
555 SK_LOG_ATTRIBUTE
556 static void
fsw_ctl_attach_log(const struct nx_spec_req * nsr,const struct kern_nexus * nx,int err)557 fsw_ctl_attach_log(const struct nx_spec_req *nsr,
558     const struct kern_nexus *nx, int err)
559 {
560 	uuid_string_t uuidstr, ifuuidstr;
561 	const char *nustr;
562 
563 	if (nsr->nsr_flags & NXSPECREQ_UUID) {
564 		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
565 	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
566 		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
567 		    SK_KVA(nsr->nsr_ifp));
568 		nustr = uuidstr;
569 	} else {
570 		nustr = nsr->nsr_name;
571 	}
572 
573 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
574 	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
575 	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
576 	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
577 }
578 #endif /* SK_LOG */
579 
580 SK_NO_INLINE_ATTRIBUTE
581 static void
fsw_netif_set_callbacks_common(struct nx_flowswitch * fsw,boolean_t set)582 fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
583 {
584 	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
585 
586 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
587 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
588 
589 	if (set) {
590 		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
591 	} else {
592 		netif_hwna_clear_mode(hwna);
593 	}
594 }
595 
596 SK_NO_INLINE_ATTRIBUTE
597 static void
fsw_netif_set_callbacks(struct nx_flowswitch * fsw)598 fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
599 {
600 	fsw_netif_set_callbacks_common(fsw, TRUE);
601 }
602 
603 SK_NO_INLINE_ATTRIBUTE
604 static void
fsw_netif_clear_callbacks(struct nx_flowswitch * fsw)605 fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
606 {
607 	fsw_netif_set_callbacks_common(fsw, FALSE);
608 }
609 
610 SK_NO_INLINE_ATTRIBUTE
611 static void
fsw_dp_start(struct nx_flowswitch * fsw)612 fsw_dp_start(struct nx_flowswitch *fsw)
613 {
614 	ASSERT(fsw->fsw_dev_ch != NULL);
615 	ASSERT(fsw->fsw_host_ch != NULL);
616 
617 	fsw_netif_set_callbacks(fsw);
618 	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
619 	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
620 }
621 
/*
 * Quiesce and stop the datapath.  Marks the flowswitch quiesced (once;
 * EALREADY on repeat), drains in-flight data movement if needed —
 * temporarily dropping the SK lock to do so — then stops the host and
 * device channels and removes the RX callbacks.  On drain, *ifpp
 * returns the ifp whose datamov reference the caller must release
 * via ifnet_datamov_resume().
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
{
	struct ifnet *ifp;

	FSW_WLOCK(fsw);
	if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
		FSW_WUNLOCK(fsw);
		return EALREADY;
	}
	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
	FSW_WUNLOCK(fsw);

	/*
	 * For regular kernel-attached interfaces, quiescing is handled by
	 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
	 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
	 * are constructed on the fly and can also be torn down on the fly.
	 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
	 * can be detached while the interface is still attached.
	 */
	if ((ifp = fsw->fsw_ifp) != NULL &&
	    ifnet_datamov_suspend_if_needed(ifp)) {
		/* drain without holding the SK lock to avoid blocking others */
		SK_UNLOCK();
		ifnet_datamov_drain(ifp);
		/* Reference will be released by caller */
		*ifpp = ifp;
		SK_LOCK();
	}
	ASSERT(fsw->fsw_dev_ch != NULL);
	ASSERT(fsw->fsw_host_ch != NULL);
	/* stop in reverse order of fsw_dp_start() */
	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
	fsw_netif_clear_callbacks(fsw);
	return 0;
}
659 
/*
 * Open a special (config-mode) channel to the netif's host or device
 * port and stash it in fsw_host_ch / fsw_dev_ch respectively.
 * Returns 0 on success, or the error from ch_open_special().
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
    boolean_t host)
{
	struct chreq chr;
	struct kern_channel *ch;
	int err;

	/* build a config-mode channel request against the netif nexus */
	bzero(&chr, sizeof(chr));
	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);

	err = 0;
	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
	if (ch == NULL) {
		SK_ERR("ch_open_special(%s) failed: %d",
		    host ? "host" : "dev", err);
		return err;
	}
	if (host) {
		fsw->fsw_host_ch = ch;
	} else {
		fsw->fsw_dev_ch = ch;
	}
	return 0;
}
689 
/*
 * Close the special channel opened by fsw_netif_port_setup() for the
 * host or device port.  The cached pointer is cleared before the
 * channel is closed and its reference released.  Returns EINVAL if
 * no such channel is open.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
{
	struct kern_channel *ch;

	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
	if (ch == NULL) {
		return EINVAL;
	}
	/* unpublish before closing */
	if (host) {
		fsw->fsw_host_ch = NULL;
	} else {
		fsw->fsw_dev_ch = NULL;
	}
	ch_close_special(ch);
	(void) ch_release_locked(ch);
	return 0;
}
709 
710 SK_NO_INLINE_ATTRIBUTE
711 static int
fsw_devna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)712 fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
713 {
714 	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
715 }
716 
717 SK_NO_INLINE_ATTRIBUTE
718 static int
fsw_hostna_setup(struct nx_flowswitch * fsw,struct kern_nexus * hw_nx)719 fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
720 {
721 	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
722 }
723 
724 SK_NO_INLINE_ATTRIBUTE
725 static int
fsw_devna_teardown(struct nx_flowswitch * fsw)726 fsw_devna_teardown(struct nx_flowswitch *fsw)
727 {
728 	return fsw_netif_port_teardown(fsw, FALSE);
729 }
730 
731 SK_NO_INLINE_ATTRIBUTE
732 static int
fsw_hostna_teardown(struct nx_flowswitch * fsw)733 fsw_hostna_teardown(struct nx_flowswitch *fsw)
734 {
735 	return fsw_netif_port_teardown(fsw, TRUE);
736 }
737 
/*
 * Process NXCFG_CMD_ATTACH: attach a netif nexus (identified by UUID
 * in nsr->nsr_uuid) beneath this flowswitch.  Sets up the device and
 * host channels, plumbs the interface, starts the datapath, and
 * returns the devna UUID in nsr->nsr_if_uuid.  Each setup stage is
 * unwound in reverse on failure.
 */
SK_NO_INLINE_ATTRIBUTE
static int
fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct kern_nexus *hw_nx = NULL;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * The flowswitch only accepts UUID as an identifier, since it
	 * represents the UUID of the kernel object we are trying to
	 * attach to this flowswitch.
	 */
	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
		err = EINVAL;
		goto done;
	}

	/* only one netif may be attached at a time */
	if (fsw->fsw_dev_ch != NULL) {
		ASSERT(fsw->fsw_host_ch != NULL);
		err = EEXIST;
		goto done;
	}

	/* nx_find() retains hw_nx; released at 'done' */
	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
	if (hw_nx == NULL) {
		err = ENOENT;
		goto done;
	} else if (hw_nx == nx) {
		/* refuse to attach a flowswitch to itself */
		err = EINVAL;
		goto done;
	}

	/* preflight check to see if the nexus is attachable to us */
	err = fsw_nx_check(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_devna_setup(fsw, hw_nx);
	if (err != 0) {
		goto done;
	}

	err = fsw_hostna_setup(fsw, hw_nx);
	if (err != 0) {
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	err = fsw_host_setup(fsw);
	if (err != 0) {
		/* unwind in reverse order of setup */
		(void) fsw_hostna_teardown(fsw);
		(void) fsw_devna_teardown(fsw);
		goto done;
	}

	fsw_dp_start(fsw);

	/* return the devna UUID */
	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
done:
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		fsw_ctl_attach_log(nsr, nx, err);
	}
#endif /* SK_LOG */

	if (hw_nx != NULL) {
		nx_release_locked(hw_nx);
	}

	return err;
}
818 
/*
 * Tear down everything fsw_ctl_attach() set up: stop the datapath,
 * detach the interface, and close the host and device channels.
 * No-op if nothing is attached; returns early (without teardown) if
 * the datapath was already quiesced by another caller.
 */
SK_NO_INLINE_ATTRIBUTE
static void
fsw_cleanup(struct nx_flowswitch *fsw)
{
	int err;
	struct ifnet *ifp = NULL;

	if (fsw->fsw_dev_ch == NULL) {
		ASSERT(fsw->fsw_host_ch == NULL);
		return;
	}
	/* ifp is set when fsw_dp_stop() suspended data movement */
	err = fsw_dp_stop(fsw, &ifp);
	if (err != 0) {
		return;
	}
	err = fsw_host_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_hostna_teardown(fsw);
	VERIFY(err == 0);

	err = fsw_devna_teardown(fsw);
	VERIFY(err == 0);

	/* release the datamov suspension taken in fsw_dp_stop() */
	if (ifp != NULL) {
		ifnet_datamov_resume(ifp);
	}
}
847 
/*
 * Process NXCFG_CMD_DETACH: detach the netif currently attached below
 * this flowswitch.  A NULL nsr (destructor path) detaches whatever is
 * attached unconditionally; otherwise nsr->nsr_if_uuid must match the
 * attached devna's UUID (ESRCH if not, ENXIO if nothing is attached).
 */
int
fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
    struct nx_spec_req *nsr)
{
#pragma unused(p)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/*
	 * nsr is NULL when we're called from the destructor, and it
	 * implies that we'll detach everything that is attached.
	 */
	if (nsr == NULL) {
		fsw_cleanup(fsw);
		ASSERT(fsw->fsw_dev_ch == NULL);
		ASSERT(fsw->fsw_host_ch == NULL);
		goto done;
	}

	if (uuid_is_null(nsr->nsr_if_uuid)) {
		err = EINVAL;
		goto done;
	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
		err = ENXIO;
		goto done;
	}

	/* check if the devna uuid is correct */
	if (uuid_compare(nsr->nsr_if_uuid,
	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
		err = ESRCH;
		goto done;
	}
	fsw_cleanup(fsw);

done:
#if SK_LOG
	if (nsr != NULL) {
		uuid_string_t ifuuidstr;
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
		    nsr->nsr_flags, err);
	} else {
		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
	}
#endif /* SK_LOG */

	return err;
}
903 
/*
 * Process NXCFG_CMD_NETEM: create or reconfigure the network-emulation
 * instance on the attached interface's input path.  Returns ENODEV if
 * no interface is plumbed to this flowswitch.
 */
static int
fsw_netem_config(struct nx_flowswitch *fsw, void *data)
{
	struct ifnet *ifp = fsw->fsw_ifp;
	struct if_netem_params *params = data;
	int ret;

	if (ifp == NULL) {
		return ENODEV;
	}

	SK_LOCK_ASSERT_HELD();
#define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
#define fsw_INPUT_NETEM_THREADNAME_LEN       32
	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
	(void) snprintf(netem_name, sizeof(netem_name),
	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
	/* netem dequeues via fsw_dev_input_netem_dequeue in dev-batch sizes */
	ret = netem_config(&ifp->if_input_netem, netem_name, ifp, params, fsw,
	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);

	return ret;
}
926 
/*
 * Flowswitch configuration entry point.  Validates and privilege-checks
 * flow add/delete requests (delegation to another epid/euuid requires
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE), then dispatches nc_cmd to the
 * matching handler.  'data' is interpreted per-command: nx_spec_req
 * for attach/detach, nx_flow_req for flow ops, if_netem_params for
 * netem.
 */
int
fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
    void *data)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	struct nx_spec_req *nsr = data;
	struct nx_flow_req *req = data;
	boolean_t need_check;
	int error = 0;

	switch (nc_cmd) {
	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL:
		if (uuid_is_null(req->nfr_flow_uuid)) {
			error = EINVAL;
			goto done;
		}
		/* user space may only set the publicly-defined flag bits */
		if (p != kernproc) {
			req->nfr_flags &= NXFLOWREQF_MASK;
		}
		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;

		/* deletion needs no delegation privilege check */
		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
			break;
		}

		need_check = FALSE;
		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
			need_check = TRUE;
		} else if (!uuid_is_null(req->nfr_euuid)) {
			uuid_t uuid;

			/* get the UUID of the issuing process */
			proc_getexecutableuuid(p, uuid, sizeof(uuid));

			/*
			 * If this is not issued by a process for its own
			 * executable UUID and if the process does not have
			 * the necessary privilege, reject the request.
			 * The logic is similar to so_set_effective_uuid().
			 */
			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
				need_check = TRUE;
			}
		}
		if (need_check) {
			kauth_cred_t cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred,
			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				goto done;
			}
		}
		break;

	default:
		break;
	}

	/* dispatch to the per-command handler */
	switch (nc_cmd) {
	case NXCFG_CMD_ATTACH:
		error = fsw_ctl_attach(nx, p, nsr);
		break;

	case NXCFG_CMD_DETACH:
		error = fsw_ctl_detach(nx, p, nsr);
		break;

	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
		error = fsw_ctl_flow_add(fsw, p, data);
		break;

	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
		error = fsw_ctl_flow_del(fsw, p, data);
		break;
	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
		error = fsw_netem_config(fsw, data);
		break;

	default:
		SK_ERR("invalid cmd %u", nc_cmd);
		error = EINVAL;
		break;
	}

done:
	return error;
}
1016 
1017 struct nx_flowswitch *
fsw_ifp_to_fsw(struct ifnet * ifp)1018 fsw_ifp_to_fsw(struct ifnet *ifp)
1019 {
1020 	struct nx_flowswitch *fsw = NULL;
1021 
1022 	if (ifp->if_na != NULL) {
1023 		fsw = ifp->if_na->nifna_netif->nif_fsw;
1024 	}
1025 	return fsw;
1026 }
1027 
/*
 * Eventhandler callback for interface events.  Reacts to link-layer
 * address changes (refresh cached source MAC) and low-power mode
 * transitions (schedule a flow reap) on the owning flowswitch.
 */
static void
fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
    struct ifnet *ifp, struct sockaddr *ip_addr __unused,
    intf_event_code_t intf_ev_code)
{
	struct nx_flowswitch *fsw = NULL;

	/* cheap unlocked pre-check; rechecked via fsw_ifp_to_fsw below */
	if (ifp->if_na == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw != NULL) {
		switch (intf_ev_code) {
		case INTF_EVENT_CODE_LLADDR_UPDATE:
			/* only Ethernet-framed interfaces cache a source MAC */
			if ((fsw->fsw_ifp == NULL) ||
			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);
			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
			    if_name(fsw->fsw_ifp));
			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
			    ETHER_ADDR_LEN);
			/* bump generation count so users re-read the lladdr */
			atomic_add_32(&fsw->fsw_src_lla_gencnt, 1);
			break;

		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
			if (fsw->fsw_ifp == NULL) {
				break;
			}

			VERIFY(fsw->fsw_ifp == ifp);

			/* entering low-power mode: reap idle flows sooner */
			if (ifp->if_xflags & IFXF_LOW_POWER) {
				SK_DF(SK_VERB_FSW,
				    "Low power mode updated for %s",
				    if_name(fsw->fsw_ifp));

				fsw_reap_sched(fsw);
			}
			break;

		default:
			break;
		}
	}
	SK_UNLOCK();
}
1079 
/*
 * Eventhandler callback for protocol-control events (e.g. ICMP errors).
 * Matches the event's 5-tuple against a flow entry and forwards the
 * event to the netagent session managing that flow.
 */
static void
fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
    struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
    uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
    struct protoctl_ev_val *p_val)
{
#pragma unused(ee_arg)
	struct nx_flowswitch *fsw = NULL;
	struct flow_entry *fe = NULL;
	boolean_t netagent_update_flow = FALSE;
	uuid_t fe_uuid;

	/* only TCP and UDP flows are tracked by the flowswitch */
	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
		return;
	}

	/*
	 * XXX Right now only handle the event if we have enough
	 * information to match the entire flow.
	 */
	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
		return;
	}

	SK_LOCK();
	fsw = fsw_ifp_to_fsw(ifp);
	if (fsw == NULL) {
		goto out;
	}

	/*
	 * Hold the detach barrier so the flowswitch (and its agent
	 * session) stays alive after SK_UNLOCK below.  On failure,
	 * clear fsw so the epilogue skips the barrier removal.
	 */
	if (!fsw_detach_barrier_add(fsw)) {
		fsw = NULL;
		SK_ERR("netagent detached");
		goto out;
	}

	/* build a full 5-tuple key from the event's addresses/ports */
	struct flow_key fk __sk_aligned(16);
	FLOW_KEY_CLEAR(&fk);
	fk.fk_proto = proto;
	if (p_laddr->sa_family == AF_INET) {
		fk.fk_ipver = IPVERSION;
		fk.fk_src4 = SIN(p_laddr)->sin_addr;
		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
	} else {
		fk.fk_ipver = IPV6_VERSION;
		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
	}
	fk.fk_sport = lport;
	fk.fk_dport = rport;
	fk.fk_mask = FKMASK_5TUPLE;

	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
	if (__improbable(fe == NULL)) {
		goto out;
	}

	/* snapshot the UUID; the netagent call happens after SK_UNLOCK */
	uuid_copy(fe_uuid, fe->fe_uuid);
	/*
	 * If the protocol notification is for TCP, make sure
	 * protocol event received is for bytes in the flight.
	 * XXX Redirect events are not delivered as protocol events
	 * but as better route events.
	 * Also redirect events do not indicate loss of the packet.
	 */
	if (proto != IPPROTO_TCP) {
		p_val->tcp_seq_number = 0;
	}

	netagent_update_flow = TRUE;

out:
	SK_UNLOCK();

	/*
	 * Deliver the event outside SK_LOCK; the detach barrier taken
	 * above keeps fsw_agent_session valid here.
	 */
	if (netagent_update_flow) {
		int error = 0;
#if SK_LOG
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
		    "event %d with value %d and tcp sequence number %d",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
#endif /* SK_LOG */
		if ((error = netagent_update_flow_protoctl_event(
			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
			    p_val->val, p_val->tcp_seq_number)) != 0) {
#if SK_LOG
			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
			    "flow entry \"%s\" for protocol event %d with "
			    "value %d and tcp sequence number %d", error,
			    dbgbuf, protoctl_event_code, p_val->val,
			    p_val->tcp_seq_number);
#endif /* SK_LOG */
		}
	}

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	if (fsw != NULL) {
		fsw_detach_barrier_remove(fsw);
	}
}
1184 
/*
 * Add or remove the flowswitch netagent on the underlying interface.
 *
 * Returns 0 on success; EINVAL if the nexus is not a flowswitch, ENXIO
 * if no agent session exists, EEXIST/ENOENT if the agent is already
 * added/not added respectively.  Caller must hold SK_LOCK.
 */
int
fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
{
	struct nx_flowswitch *fsw = NULL;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		error = EINVAL;
		goto out;
	}

	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);
	/* from here on, "goto out" must drop FSW_WLOCK (fsw != NULL) */

	if (fsw->fsw_agent_session == NULL) {
		error = ENXIO;
		goto out;
	}

	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));

	if (add) {
		if (FSW_NETAGENT_ADDED(fsw)) {
			/* agent already added */
			error = EEXIST;
		} else {
			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
			if (if_is_fsw_netagent_enabled()) {
				fsw->fsw_state_flags
				        |= FSW_STATEF_NETAGENT_ENABLED;
			}
			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent added for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	} else {
		if (!FSW_NETAGENT_ADDED(fsw)) {
			/* agent has not been added */
			error = ENOENT;
		} else {
			/* clear both ADDED and ENABLED together */
			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
			    FSW_STATEF_NETAGENT_ENABLED);
			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
			SK_D("flowswitch netagent removed for interface %s",
			    if_name(fsw->fsw_ifp));
		}
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
	return error;
}
1244 
/*
 * Recompute the netagent flags for this flowswitch's agent based on
 * whether the interface currently needs the IP and/or transport
 * netagent, and push the updated flags to the netagent subsystem.
 * Silently returns on non-flowswitch nexus or missing agent session.
 * Caller must hold SK_LOCK.
 */
void
fsw_netagent_update(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NULL;

	SK_LOCK_ASSERT_HELD();
	VERIFY(nx != NULL);
	VERIFY(NX_PROV(nx) != NULL);
	VERIFY(NX_DOM_PROV(nx) != NULL);

	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
		goto out;
	}
	fsw = NX_FSW_PRIVATE(nx);
	VERIFY(fsw != NULL);
	FSW_WLOCK(fsw);
	/* from here on, "goto out" must drop FSW_WLOCK (fsw != NULL) */
	if (fsw->fsw_agent_session == NULL) {
		goto out;
	}
	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
	/* listener if either agent type is needed */
	if (ip_agent || transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
	}
	if (transport_agent) {
		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
	} else {
		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
	}
	if (ip_agent) {
		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	} else {
		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
	}
	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
		SK_D("flowswitch netagent updated for interface %s",
		    if_name(fsw->fsw_ifp));
	}
out:
	if (fsw != NULL) {
		FSW_UNLOCK(fsw);
	}
}
1292 
1293 static int
fsw_port_ctor(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna,const struct nxbind * nxb)1294 fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1295     const struct nxbind *nxb)
1296 {
1297 #pragma unused(nxb)
1298 	int err = 0;
1299 
1300 	SK_LOCK_ASSERT_HELD();
1301 	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
1302 	    vpna->vpna_pid == nxb->nxb_pid);
1303 
1304 	/*
1305 	 * Reject regular channel open requests unless there is
1306 	 * something attached to the host port of the flowswitch.
1307 	 */
1308 	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
1309 		struct nexus_adapter *na = &vpna->vpna_up;
1310 		struct ifnet *ifp = fsw->fsw_ifp;
1311 
1312 		if (ifp == NULL) {
1313 			err = ENXIO;
1314 			goto done;
1315 		}
1316 
1317 		/* if adapter supports mitigation, set default value */
1318 		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
1319 			if (IFNET_IS_WIFI(ifp)) {
1320 				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
1321 			} else if (IFNET_IS_CELLULAR(ifp)) {
1322 				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
1323 			} else if (IFNET_IS_ETHERNET(ifp)) {
1324 				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
1325 			} else {
1326 				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
1327 			}
1328 		}
1329 	}
1330 
1331 done:
1332 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
1333 	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
1334 	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
1335 	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);
1336 
1337 	return err;
1338 }
1339 
1340 static bool
fsw_port_dtor(struct nx_flowswitch * fsw,const struct nexus_vp_adapter * vpna)1341 fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1342 {
1343 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1344 	nexus_port_t nx_port = vpna->vpna_nx_port;
1345 	uint32_t purge_cnt;
1346 
1347 	ASSERT(fsw == vpna->vpna_fsw);
1348 	ASSERT(nx_port != NEXUS_PORT_ANY);
1349 
1350 	/*
1351 	 * If this nexus port was bound to a PID, we just need to look at a
1352 	 * single bucket and iterate from there.  Note that in any case, we
1353 	 * can't just search for a single flow_owner based on the PID itself,
1354 	 * since a given process may be opening multiple channels to the
1355 	 * flowswitch; hence we search for the ones matching this nexus port.
1356 	 *
1357 	 * Close any open flows on the port and remove the flow owner and
1358 	 * nexus port binding.
1359 	 */
1360 	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1361 	    vpna->vpna_pid, nx_port, FALSE);
1362 
1363 	SK_DF(SK_VERB_FSW,
1364 	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
1365 	    "purged %u", SK_KVA(fsw), (int)nx_port,
1366 	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1367 	    purge_cnt);
1368 
1369 	return purge_cnt != 0;
1370 }
1371 
1372 /*
1373  * Flowswitch nexus port allocator.
1374  *
1375  * A nexus port is represented by a bit in the port bitmap; its state is
1376  * either free or allocated.  A free state implies that the port has no
1377  * nxbind AND no nexus adapter association.  An allocated state means that
 * either it has a nxbind OR a nexus adapter association.  This routine
1379  * manages the nexus adapter association with a nexus port; nxbind is
1380  * handled separately via nx_fsw_port_bind().
1381  *
1382  * The caller of this routine may optionally pass in a NULL nexus adapter.
1383  * In such a case (*vpna is NULL), this routine checks to see if the port
1384  * has already been associated with an adapter, and returns a reference to
1385  * that adapter.  No action is taken on a port that doesn't have an adapter
1386  * associated.  Otherwise (*vpna is non-NULL), this routine associates that
1387  * adapter with a port that's not already associated with one; the reference
1388  * to the adapter is untouched here, as the caller is expected to handle it.
1389  *
1390  * The flowswitch code invokes this routine each time it is requested to
1391  * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
1392  * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1393  * This allows for multiple channels to be opened to a nexus port, each
1394  * time holding a reference to that same nexus adapter.  The releasing of
1395  * the nexus port only happens when the last channel closes.
1396  */
static int
fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
    struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
{
	struct kern_nexus *nx = fsw->fsw_nx;
	/*
	 * NOTE(review): refonly is never set to TRUE here; it only feeds
	 * the debug logs below and the !refonly test — confirm whether a
	 * reference-only lookup path was intended.
	 */
	boolean_t refonly = FALSE;
	int error = 0;

	FSW_WLOCK_ASSERT_HELD(fsw);

	/* allocate (or look up) the port; may return an existing adapter */
	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
	if (error == 0 && *vpna != NULL && !refonly) {
		/* initialize the nexus port and the adapter occupying it */
		(*vpna)->vpna_fsw = fsw;
		(*vpna)->vpna_nx_port = nx_port;
		(*vpna)->vpna_pid = proc_pid(p);
		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
			(*vpna)->vpna_pid_bound = TRUE;
		} else {
			(*vpna)->vpna_pid_bound = FALSE;
		}

		/* undo the allocation if the constructor rejects the port */
		error = fsw_port_ctor(fsw, *vpna, nxb);
		if (error != 0) {
			fsw_port_free(fsw, (*vpna),
			    (*vpna)->vpna_nx_port, FALSE);
		}
	}

#if SK_LOG
	if (*vpna != NULL) {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
		    "%sport %d refonly %u (err %d)",
		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
		    "[reserved] " : "", (int)nx_port, refonly, error);
	} else {
		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
		    "+++ fsw 0x%llx nx_port %d refonly %u "
		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
	}
#endif /* SK_LOG */

	return error;
}
1444 
1445 int
fsw_port_alloc(struct nx_flowswitch * fsw,struct nxbind * nxb,struct nexus_vp_adapter ** vpna,nexus_port_t nx_port,struct proc * p,boolean_t ifattach,boolean_t host)1446 fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1447     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1448     boolean_t ifattach, boolean_t host)
1449 {
1450 	int err = 0;
1451 
1452 	FSW_WLOCK_ASSERT_HELD(fsw);
1453 
1454 	if (ifattach) {
1455 		/* override port to either NX_FSW_{HOST,DEV} */
1456 		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1457 		/* allocate reserved port for ifattach */
1458 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1459 	} else if (host) {
1460 		/* host is valid only for ifattach */
1461 		err = EINVAL;
1462 	} else {
1463 		/* nexus port otherwise (reserve dev and host for ifattach) */
1464 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1465 	}
1466 
1467 	return err;
1468 }
1469 
1470 /*
1471  * Remove nexus port association from a nexus adapter.  This call is
1472  * the opposite of fsw_port_alloc(), except that it is called only
1473  * at nx_fsw_vp_na_dtor() destructor time.  See above notes
1474  * on fsw_port_alloc().
1475  */
void
fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
    nexus_port_t nx_port, boolean_t defunct)
{
	struct kern_nexus *nx = fsw->fsw_nx;

	FSW_WLOCK_ASSERT_HELD(fsw);
	ASSERT(vpna->vpna_fsw == fsw);

	/* mark the adapter/port defunct before purging its flows */
	if (defunct) {
		vpna->vpna_defunct = TRUE;
		nx_port_defunct(nx, nx_port);
	}

	/* purge flows/owner; true when a flow owner was actually removed */
	bool destroyed = fsw_port_dtor(fsw, vpna);
	if (destroyed) {
		/*
		 * If the extension's destructor no longer needs to be
		 * bound to any channel client, release the binding.
		 */
		nx_port_unbind(nx, nx_port);
	}

	/*
	 * If this is a defunct, then stop here as the port is still
	 * occupied by the channel.  We'll come here again later when
	 * the actual close happens.
	 */
	if (defunct) {
		return;
	}

	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);

	/* release the port and reset the adapter's association state */
	nx_port_free(nx, nx_port);
	vpna->vpna_fsw = NULL;
	vpna->vpna_nx_port = NEXUS_PORT_ANY;
	vpna->vpna_pid_bound = FALSE;
	vpna->vpna_pid = -1;
	vpna->vpna_defunct = FALSE;
}
1519 
/*
 * Activation callback for a flowswitch port's nexus adapter.  All
 * three activation modes share the same handling here: for user
 * ports, propagate the mode to any flow-owner resources (e.g. flow
 * advisories) on this port.  Always returns 0.
 */
int
fsw_port_na_activate(struct nx_flowswitch *fsw,
    struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	uint32_t fo_cnt = 0;

	SK_LOCK_ASSERT_HELD();

	/* The following code relies on the static value asserted below */
	_CASSERT(FSW_VP_DEV == 0);
	_CASSERT(FSW_VP_HOST == 1);

	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);

	/* the switch only validates the mode; all cases fall through below */
	switch (mode) {
	case NA_ACTIVATE_MODE_ON:
		break;

	case NA_ACTIVATE_MODE_DEFUNCT:
		break;

	case NA_ACTIVATE_MODE_OFF:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* nothing further to do for special ports */
	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
		goto done;
	}

	/* activate any flow owner related resources (e.g. flowadv), if any */
	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);

done:
	SK_DF(SK_VERB_FSW,
	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);

	return 0;
}
1569 
1570 int
fsw_port_na_defunct(struct nx_flowswitch * fsw,struct nexus_vp_adapter * vpna)1571 fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1572 {
1573 	int err = 0;
1574 
1575 	SK_LOCK_ASSERT_HELD();
1576 	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1577 
1578 	/*
1579 	 * During defunct, we want to purge all flows associated to this
1580 	 * port and the flow owner as well.  This is accomplished as part
1581 	 * of calling the port's destructor.  However, we still want to
1582 	 * occupy the nexus port since there's a channel open to it.
1583 	 */
1584 	FSW_WLOCK(fsw);
1585 	if (!vpna->vpna_defunct) {
1586 		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
1587 	} else {
1588 		err = EALREADY;
1589 	}
1590 	FSW_WUNLOCK(fsw);
1591 
1592 	return err;
1593 }
1594 
/*
 * MIB export of per-flow statistics.  Supports three query shapes:
 * by flow UUID, by 5-tuple, or a full dump (including lingering
 * flows awaiting deferred free).  Always returns the number of bytes
 * a complete answer requires; entries are only written to 'out' while
 * 'len' has room, so callers can size a retry from the return value.
 */
static size_t
fsw_mib_get_flow(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	size_t sf_size = sizeof(struct sk_stats_flow);
	__block size_t actual_space = 0;
	__block struct sk_stats_flow *sf = out;
	struct flow_entry *fe;

	FSW_LOCK_ASSERT_HELD(fsw);

	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
		/* single-flow lookup by UUID */
		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}

			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
		/* single-flow lookup by 5-tuple; both ends must agree on AF */
		struct info_tuple *itpl = &filter->nmf_info_tuple;
		struct flow_key fk;
		bzero(&fk, sizeof(fk));
		if (itpl->itpl_local_sa.sa_family == AF_INET &&
		    itpl->itpl_remote_sa.sa_family == AF_INET) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPVERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
			fk.fk_sport = itpl->itpl_local_sin.sin_port;
			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
		} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
		    itpl->itpl_remote_sa.sa_family == AF_INET6) {
			fk.fk_mask = FKMASK_5TUPLE;
			fk.fk_ipver = IPV6_VERSION;
			fk.fk_proto = itpl->itpl_proto;
			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
		} else {
			SK_ERR("invalid info tuple: local af %d remote af %d",
			    itpl->itpl_local_sa.sa_family,
			    itpl->itpl_remote_sa.sa_family);
			return 0;
		}

		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
		if (fe != NULL) {
			if (out != NULL && len >= sf_size) {
				flow_entry_stats_get(fe, sf);
			}
			flow_entry_release(&fe);
			return sf_size;
		}
		return 0;
	}

	/* full dump: count every flow, write entries while space remains */
	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
		actual_space += sf_size;

		if (out == NULL || actual_space > len) {
		        return;
		}

		flow_entry_stats_get(_fe, sf);
		sf++;
	});

	/*
	 * Also return the ones in deferred free list.
	 */
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
		actual_space += sf_size;
		if (out == NULL || actual_space > len) {
			continue;
		}

		flow_entry_stats_get(fe, sf);
		sf++;
	}
	lck_mtx_unlock(&fsw->fsw_linger_lock);

	return actual_space;
}
1686 
/*
 * MIB export of the flow advisory tables.  Emits one header record
 * per open channel followed by that channel's allocated flowadv
 * entries.  Returns the byte count a full answer requires; records
 * are only written while 'len' has room.
 */
static size_t
fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
#pragma unused(filter)
	uint32_t fae_idx;
	size_t actual_space = 0;
	struct kern_channel *ch = NULL;
	struct sk_stats_flow_adv *sfa = NULL;
	struct sk_stats_flow_adv_ent *sfae = NULL;
	struct __flowadv_entry *fae = NULL;
	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
	uint32_t max_flowadv =
	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;

	SK_LOCK_ASSERT_HELD();

	sfa = out;
	/* copyout flow advisory table (allocated entries only) */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct skmem_arena_nexus *arn;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		ar = na->na_arena;
		arn = skmem_arena_nexus(ar);

		AR_LOCK(ar);
		if (arn->arn_flowadv_obj == NULL) {
			/* no flowadv table; only valid for defunct arenas */
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}
		actual_space += sfa_size;
		/* fill out flowadv_table info */
		if (out != NULL && actual_space <= len) {
			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
			(void) strlcpy(sfa->sfa_if_name,
			    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
			sfa->sfa_owner_pid = ch->ch_pid;
			sfa->sfa_entries_count = 0;
		}

		/* fill out flowadv_entries */
		sfae = &sfa->sfa_entries[0];
		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
			fae = &arn->arn_flowadv_obj[fae_idx];
			/* a null UUID marks an unallocated slot; skip it */
			if (!uuid_is_null(fae->fae_id)) {
				actual_space += sfae_size;
				if (out == NULL || actual_space > len) {
					continue;
				}

				/* fill out entry */
				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
				sfae->sfae_flags = fae->fae_flags;
				sfae++;
				sfa->sfa_entries_count++;
			}
		}
		/* next record starts right after this channel's entries */
		sfa = (struct sk_stats_flow_adv *)
		    ((uintptr_t)out + actual_space);
		AR_UNLOCK(ar);
	}

	return actual_space;
}
1762 
1763 static inline void
fsw_fo2sfo(struct nx_flowswitch * fsw,struct flow_owner * fo,struct sk_stats_flow_owner * sfo)1764 fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1765     struct sk_stats_flow_owner *sfo)
1766 {
1767 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1768 
1769 	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
1770 	(void) strlcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name,
1771 	    IFNAMSIZ);
1772 	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1773 
1774 	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
1775 	    fo->fo_name);
1776 	sfo->sfo_pid = fo->fo_pid;
1777 	sfo->sfo_nx_port = fo->fo_nx_port;
1778 	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1779 	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1780 }
1781 
1782 static size_t
fsw_mib_get_flow_owner(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len)1783 fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
1784     struct nexus_mib_filter *filter, void *out, size_t len)
1785 {
1786 #pragma unused(filter)
1787 	uint32_t i;
1788 	size_t actual_space = 0;
1789 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1790 	struct sk_stats_flow_owner *sfo = out;
1791 	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
1792 	struct flow_owner *fo;
1793 
1794 	FSW_LOCK_ASSERT_HELD(fsw);
1795 
1796 	/*
1797 	 * Ideally we'd like to hide the bucket level details from flow library
1798 	 * user, but there is no simple way to iterate flow_owner with
1799 	 * buckets/RB_TREE nested. So keep it as is.
1800 	 */
1801 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
1802 		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
1803 		FOB_LOCK(fob);
1804 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1805 			actual_space += sfo_size;
1806 			if (out == NULL || actual_space > len) {
1807 				continue;
1808 			}
1809 
1810 			fsw_fo2sfo(fsw, fo, sfo);
1811 			sfo++;
1812 		}
1813 		FOB_UNLOCK(fob);
1814 	}
1815 
1816 	return actual_space;
1817 }
1818 
1819 static inline void
fsw_fr2sfr(struct nx_flowswitch * fsw,struct flow_route * fr,struct sk_stats_flow_route * sfr,boolean_t ll_scrub)1820 fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
1821     struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
1822 {
1823 	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
1824 	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
1825 	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
1826 	    IFNAMSIZ);
1827 
1828 	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
1829 	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;
1830 
1831 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
1832 		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
1833 	}
1834 	if (fr->fr_flags & FLOWRTF_ONLINK) {
1835 		sfr->sfr_flags |= SFLOWRTF_ONLINK;
1836 	}
1837 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
1838 		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
1839 	}
1840 	if (fr->fr_flags & FLOWRTF_RESOLVED) {
1841 		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
1842 	}
1843 	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
1844 		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
1845 	}
1846 	if (fr->fr_flags & FLOWRTF_DELETED) {
1847 		sfr->sfr_flags |= SFLOWRTF_DELETED;
1848 	}
1849 	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
1850 		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
1851 	}
1852 	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
1853 		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
1854 	}
1855 
1856 	lck_spin_lock(&fr->fr_reflock);
1857 	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
1858 	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
1859 	if (fr->fr_expire != 0) {
1860 		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
1861 	} else {
1862 		sfr->sfr_expire = 0;
1863 	}
1864 	lck_spin_unlock(&fr->fr_reflock);
1865 
1866 	sfr->sfr_laddr = fr->fr_laddr;
1867 	sfr->sfr_faddr = fr->fr_faddr;
1868 	sfr->sfr_gaddr = fr->fr_gaddr;
1869 
1870 	if (ll_scrub) {
1871 		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
1872 		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
1873 	} else {
1874 		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
1875 		    ETHER_ADDR_LEN);
1876 	}
1877 }
1878 
1879 #if CONFIG_MACF
1880 extern int dlil_lladdr_ckreq;
1881 #endif /* CONFIG_MACF */
1882 
1883 static size_t
fsw_mib_get_flow_route(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)1884 fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
1885     struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
1886 {
1887 #pragma unused(filter)
1888 	uint32_t i;
1889 	size_t actual_space = 0;
1890 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1891 	struct sk_stats_flow_route *sfr = out;
1892 	size_t sfo_size = sizeof(struct sk_stats_flow_route);
1893 	struct flow_route *fr;
1894 	boolean_t ll_scrub;
1895 
1896 	FSW_LOCK_ASSERT_HELD(fsw);
1897 
1898 	/*
1899 	 * To get the link-layer info, the caller must have the following
1900 	 * in their sandbox profile (or not be sandboxed at all), else we
1901 	 * scrub it clean just like dlil_ifaddr_bytes() does:
1902 	 *
1903 	 * (allow system-info (info-type "net.link.addr"))
1904 	 *
1905 	 * If scrubbed, we return 02:00:00:00:00:00.
1906 	 */
1907 #if CONFIG_MACF
1908 	ll_scrub = (dlil_lladdr_ckreq &&
1909 	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
1910 #else /* !CONFIG_MACF */
1911 	ll_scrub = FALSE;
1912 #endif /* !CONFIG_MACF */
1913 
1914 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
1915 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
1916 		FRB_RLOCK(frb);
1917 		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
1918 			actual_space += sfo_size;
1919 			if (out == NULL || actual_space > len) {
1920 				continue;
1921 			}
1922 
1923 			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
1924 			sfr++;
1925 		}
1926 		FRB_UNLOCK(frb);
1927 	}
1928 
1929 	return actual_space;
1930 }
1931 
1932 static inline void
fsw_nxs2nus(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,pid_t pid,struct __nx_stats_fsw * nxs,struct sk_stats_userstack * sus)1933 fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
1934     pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
1935 {
1936 	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
1937 	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
1938 	    IFNAMSIZ);
1939 	sus->sus_owner_pid = pid;
1940 
1941 	if (filter->nmf_type & NXMIB_IP_STATS) {
1942 		sus->sus_ip  = nxs->nxs_ipstat;
1943 	}
1944 
1945 	if (filter->nmf_type & NXMIB_IP6_STATS) {
1946 		sus->sus_ip6 = nxs->nxs_ip6stat;
1947 	}
1948 
1949 	if (filter->nmf_type & NXMIB_TCP_STATS) {
1950 		sus->sus_tcp = nxs->nxs_tcpstat;
1951 	}
1952 
1953 	if (filter->nmf_type & NXMIB_UDP_STATS) {
1954 		sus->sus_udp = nxs->nxs_udpstat;
1955 	}
1956 
1957 	if (filter->nmf_type & NXMIB_QUIC_STATS) {
1958 		sus->sus_quic = nxs->nxs_quicstat;
1959 	}
1960 }
1961 
1962 static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len)1963 fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
1964     struct nexus_mib_filter *filter, void *out, size_t len)
1965 {
1966 	size_t actual_space = 0;
1967 	struct kern_channel *ch;
1968 	struct __nx_stats_fsw *nxs;
1969 	struct sk_stats_userstack *sus = out;
1970 	size_t sus_size = sizeof(struct sk_stats_userstack);
1971 
1972 	SK_LOCK_ASSERT_HELD();
1973 
1974 	/* copyout saved stats from closed ports */
1975 	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
1976 	    (filter->nmf_pid == 0)) ||
1977 	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
1978 		actual_space += sus_size;
1979 		if (out != NULL && actual_space <= len) {
1980 			nxs = fsw->fsw_closed_na_stats;
1981 			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
1982 			sus++;
1983 		}
1984 	}
1985 
1986 	/*
1987 	 * XXX Currently a proc only opens one channel to nexus so we don't do
1988 	 * per proc aggregation of inet stats now as this needs lots of code
1989 	 */
1990 	/* copyout per process stats */
1991 	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
1992 		struct skmem_arena *ar;
1993 		struct nexus_adapter *na;
1994 
1995 		/* ch_lock isn't needed here since sk_lock is held */
1996 		if ((ch->ch_flags & CHANF_CLOSING) ||
1997 		    (na = ch->ch_na) == NULL) {
1998 			/* channel is closing */
1999 			continue;
2000 		}
2001 
2002 		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
2003 		    filter->nmf_pid != ch->ch_pid) {
2004 			continue;
2005 		}
2006 
2007 		ar = na->na_arena;
2008 
2009 		AR_LOCK(ar);
2010 		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
2011 		if (nxs == NULL) {
2012 			ASSERT(ar->ar_flags & ARF_DEFUNCT);
2013 			AR_UNLOCK(ar);
2014 			continue;
2015 		}
2016 
2017 		actual_space += sus_size;
2018 		if (out == NULL || actual_space > len) {
2019 			AR_UNLOCK(ar);
2020 			continue;
2021 		}
2022 
2023 		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
2024 		sus++;
2025 		AR_UNLOCK(ar);
2026 	}
2027 
2028 	return actual_space;
2029 }
2030 
2031 static size_t
fsw_mib_get_stats(struct nx_flowswitch * fsw,void * out,size_t len)2032 fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
2033 {
2034 	struct sk_stats_flow_switch *sfs = out;
2035 	size_t actual_space = sizeof(struct sk_stats_flow_switch);
2036 
2037 	if (out != NULL && actual_space <= len) {
2038 		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
2039 		(void) strlcpy(sfs->sfs_if_name,
2040 		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
2041 		sfs->sfs_fsws = fsw->fsw_stats;
2042 	}
2043 
2044 	return actual_space;
2045 }
2046 
2047 size_t
fsw_mib_get(struct nx_flowswitch * fsw,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)2048 fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
2049     void *out, size_t len, struct proc *p)
2050 {
2051 	size_t ret;
2052 
2053 	switch (filter->nmf_type) {
2054 	case NXMIB_FSW_STATS:
2055 		ret = fsw_mib_get_stats(fsw, out, len);
2056 		break;
2057 	case NXMIB_FLOW:
2058 		ret = fsw_mib_get_flow(fsw, filter, out, len);
2059 		break;
2060 	case NXMIB_FLOW_OWNER:
2061 		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
2062 		break;
2063 	case NXMIB_FLOW_ROUTE:
2064 		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
2065 		break;
2066 	case NXMIB_TCP_STATS:
2067 	case NXMIB_UDP_STATS:
2068 	case NXMIB_IP_STATS:
2069 	case NXMIB_IP6_STATS:
2070 	case NXMIB_USERSTACK_STATS:
2071 		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
2072 		break;
2073 	case NXMIB_FLOW_ADV:
2074 		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
2075 		break;
2076 	default:
2077 		ret = 0;
2078 		break;
2079 	}
2080 
2081 	return ret;
2082 }
2083 
2084 void
fsw_fold_stats(struct nx_flowswitch * fsw,void * data,nexus_stats_type_t type)2085 fsw_fold_stats(struct nx_flowswitch *fsw,
2086     void *data, nexus_stats_type_t type)
2087 {
2088 	ASSERT(data != NULL);
2089 	FSW_LOCK_ASSERT_HELD(fsw);
2090 
2091 	switch (type) {
2092 	case NEXUS_STATS_TYPE_FSW:
2093 	{
2094 		struct __nx_stats_fsw *d, *s;
2095 		d = fsw->fsw_closed_na_stats;
2096 		s = data;
2097 		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
2098 		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
2099 		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
2100 		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
2101 		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
2102 		break;
2103 	}
2104 	case NEXUS_STATS_TYPE_CHAN_ERRORS:
2105 	{
2106 		struct __nx_stats_channel_errors *s = data;
2107 		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
2108 		break;
2109 	}
2110 	default:
2111 		VERIFY(0);
2112 		/* NOTREACHED */
2113 		__builtin_unreachable();
2114 	}
2115 }
2116 
2117 boolean_t
fsw_detach_barrier_add(struct nx_flowswitch * fsw)2118 fsw_detach_barrier_add(struct nx_flowswitch *fsw)
2119 {
2120 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2121 	if (__improbable(fsw->fsw_detach_flags != 0 ||
2122 	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
2123 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2124 		return FALSE;
2125 	}
2126 	fsw->fsw_detach_barriers++;
2127 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2128 
2129 	return TRUE;
2130 }
2131 
2132 void
fsw_detach_barrier_remove(struct nx_flowswitch * fsw)2133 fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
2134 {
2135 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2136 	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
2137 	ASSERT(fsw->fsw_detach_barriers != 0);
2138 	fsw->fsw_detach_barriers--;
2139 	/* if there's a thread waiting to detach the interface, let it know */
2140 	if (__improbable((fsw->fsw_detach_waiters > 0) &&
2141 	    (fsw->fsw_detach_barriers == 0))) {
2142 		fsw->fsw_detach_waiters = 0;
2143 		wakeup(&fsw->fsw_detach_waiters);
2144 	}
2145 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2146 }
2147 
/*
 * Generic resolver for non-Ethernet interfaces.
 *
 * No link-layer address resolution is performed here; this routine only
 * (re)validates the flow route and marks it FLOWRTF_RESOLVED.  Returns 0
 * on success, or a nonzero errno (e.g. EHOSTUNREACH, or an error from
 * flow_route_configure()) on failure, in which case the route is cleaned up.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or
	 * gateway route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary: no target
	 * route, route is down, or a reconfigure was requested.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			atomic_add_32(&fr->fr_want_configure, 1);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers; flags may have changed under configure */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	/* neither on-link nor via a gateway: route is unusable */
	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
		atomic_set_32(&fr->fr_want_probe, 0);
	} else {
		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}
2229 
/*
 * Read flowswitch-related boot-args; called once from fsw_init().
 * Leaves fsw_use_dual_sized_pool untouched if the boot-arg is absent.
 */
static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}
2236 
2237 void
fsw_init(void)2238 fsw_init(void)
2239 {
2240 	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
2241 	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);
2242 
2243 	if (!__nx_fsw_inited) {
2244 		fsw_read_boot_args();
2245 		/*
2246 		 * Register callbacks for interface & protocol events
2247 		 * Use dummy arg for callback cookie.
2248 		 */
2249 		__nx_fsw_ifnet_eventhandler_tag =
2250 		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
2251 		    ifnet_event, fsw_ifnet_event_callback,
2252 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2253 		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);
2254 
2255 		__nx_fsw_protoctl_eventhandler_tag =
2256 		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
2257 		    protoctl_event, fsw_protoctl_event_callback,
2258 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2259 		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
2260 		__nx_fsw_inited = 1;
2261 	}
2262 }
2263 
2264 void
fsw_uninit(void)2265 fsw_uninit(void)
2266 {
2267 	if (__nx_fsw_inited) {
2268 		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
2269 		    __nx_fsw_ifnet_eventhandler_tag);
2270 		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
2271 		    __nx_fsw_protoctl_eventhandler_tag);
2272 
2273 		__nx_fsw_inited = 0;
2274 	}
2275 }
2276 
2277 struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)2278 fsw_alloc(zalloc_flags_t how)
2279 {
2280 	struct nx_flowswitch *fsw;
2281 	struct __nx_stats_fsw *nsfw;
2282 
2283 	SK_LOCK_ASSERT_HELD();
2284 
2285 	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
2286 	if (nsfw == NULL) {
2287 		return NULL;
2288 	}
2289 
2290 	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
2291 	if (fsw == NULL) {
2292 		zfree(nx_fsw_stats_zone, nsfw);
2293 		return NULL;
2294 	}
2295 
2296 	FSW_RWINIT(fsw);
2297 	fsw->fsw_dev_ch = NULL;
2298 	fsw->fsw_host_ch = NULL;
2299 	fsw->fsw_closed_na_stats = nsfw;
2300 
2301 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
2302 
2303 	return fsw;
2304 }
2305 
/*
 * Detach the flowswitch from its interface.
 *
 * When purge is TRUE (flowswitch is being freed) this blocks until all
 * detach barriers drain and always completes; when purge is FALSE it
 * returns EBUSY if a detach is already in progress or done.  Returns 0
 * on success.
 */
static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * return error if the host port detach is in progress
	 * or already detached.
	 * For the case of flowswitch free (i.e. purge is TRUE) we have to
	 * cleanup everything, so we will block if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit. Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * wait until all threads needing accesses to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * if the NA detach thread as well as the flowswitch free thread were
	 * both waiting, then the thread which wins the race is responsible
	 * for doing the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		/* tear down the interface binding, if still present */
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release lock to let all
	 * outstanding agent threads to enter, followed by waiting until
	 * all of them exit the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}
2390 
2391 void
fsw_free(struct nx_flowswitch * fsw)2392 fsw_free(struct nx_flowswitch *fsw)
2393 {
2394 	int err;
2395 
2396 	SK_LOCK_ASSERT_HELD();
2397 	ASSERT(fsw != NULL);
2398 
2399 	err = fsw_detach(fsw, NULL, TRUE);
2400 	VERIFY(err == 0);
2401 
2402 	fsw_dp_dtor(fsw);
2403 
2404 	ASSERT(fsw->fsw_dev_ch == NULL);
2405 	ASSERT(fsw->fsw_host_ch == NULL);
2406 	ASSERT(fsw->fsw_closed_na_stats != NULL);
2407 	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
2408 	fsw->fsw_closed_na_stats = NULL;
2409 	FSW_RWDESTROY(fsw);
2410 
2411 	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
2412 	zfree(nx_fsw_zone, fsw);
2413 }
2414