xref: /xnu-8019.80.24/bsd/skywalk/nexus/flowswitch/fsw.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 #include <skywalk/os_skywalk_private.h>
54 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
55 #include <skywalk/nexus/flowswitch/fsw_var.h>
56 #include <skywalk/nexus/netif/nx_netif.h>
57 #include <skywalk/nexus/netif/nx_netif_compat.h>
58 
59 #include <net/bpf.h>
60 #include <net/if.h>
61 #include <net/pktsched/pktsched_netem.h>
62 #include <sys/eventhandler.h>
63 
64 #if (DEVELOPMENT || DEBUG)
65 SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
66     CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
67 #endif /* !DEVELOPMENT && !DEBUG */
68 
69 uint32_t fsw_chain_enqueue = 0;
70 static int __nx_fsw_inited = 0;
71 static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
72 static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;
73 
74 static ZONE_DECLARE(nx_fsw_zone, SKMEM_ZONE_PREFIX ".nx.fsw",
75     sizeof(struct nx_flowswitch), ZC_ZFREE_CLEARMEM);
76 
77 static ZONE_DECLARE(nx_fsw_stats_zone, SKMEM_ZONE_PREFIX ".nx.fsw.stats",
78     sizeof(struct __nx_stats_fsw), ZC_ZFREE_CLEARMEM);
79 
80 #define SKMEM_TAG_FSW_PORTS     "com.apple.skywalk.fsw.ports"
81 kern_allocation_name_t skmem_tag_fsw_ports;
82 
83 #define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
84 kern_allocation_name_t skmem_tag_fsw_fob_hash;
85 
86 #define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
87 kern_allocation_name_t skmem_tag_fsw_frb_hash;
88 
89 #define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
90 kern_allocation_name_t skmem_tag_fsw_frib_hash;
91 
92 #define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
93 kern_allocation_name_t skmem_tag_fsw_frag_mgr;
94 
95 /* 64-bit mask with range */
96 #define BMASK64(_beg, _end)     \
97 	((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
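/*
 * Worked example (illustrative; assumes NX_FSW_CHUNK_FREE is the all-ones
 * pattern ~0ULL, i.e. every chunk bit marked free):
 *
 *	BMASK64(2, 5) == ((~0ULL >> (63 - 5)) & ~((1ULL << 2) - 1))
 *	              == (0x3fULL & ~0x3ULL) == 0x3c
 *
 * i.e. a 64-bit mask with bits 2 through 5 (inclusive) set.
 */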
98 
99 static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
100     boolean_t purge);
101 
102 int
103 fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
104     struct chreq *chr, struct nxbind *nxb, struct proc *p,
105     struct nexus_vp_adapter **vpna)
106 {
107 #pragma unused(ch)
108 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
109 	char *cr_name = chr->cr_name;
110 	int err = 0;
111 
112 	SK_LOCK_ASSERT_HELD();
113 	ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
114 	*vpna = NULL;
115 
116 	/* if there's an existing adapter on the nexus port then use it */
117 	FSW_WLOCK(fsw);
118 	err = fsw_port_alloc(fsw, nxb, vpna, chr->cr_port, p, FALSE, FALSE);
119 	FSW_WUNLOCK(fsw);
120 
121 	if (err != 0) {
122 		ASSERT(*vpna == NULL);
123 		goto out;
124 	} else if (*vpna != NULL) {
125 		/*
126 		 * Use the existing adapter on that port; fsw_port_alloc()
127 		 * callback has retained a reference count on the adapter.
128 		 */
129 		goto out;
130 	}
131 	ASSERT(*vpna == NULL);
132 
133 	/* create a virtual port; callee holds vpna ref */
134 	err = fsw_vp_na_create(nx, chr, vpna);
135 	if (err != 0) {
136 		SK_ERR("vpna create failed (err %d)", err);
137 		goto out;
138 	}
139 
140 	/* attach vp to fsw */
141 	err = fsw_vp_na_attach(nx, cr_name, &(*vpna)->vpna_up);
142 	if (err != 0) {
143 		SK_ERR("vpna \"%s\" fsw attach failed (err %d)",
144 		    (*vpna)->vpna_up.na_name, err);
145 		goto out;
146 	}
147 
148 	FSW_WLOCK(fsw);
149 	err = fsw_port_alloc(fsw, nxb, vpna, (*vpna)->vpna_nx_port, p, FALSE, FALSE);
150 	FSW_WUNLOCK(fsw);
151 
152 out:
153 	if ((*vpna) != NULL) {
154 		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
155 		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
156 		    "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
157 		    SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
158 		    cr_name, (int)(*vpna)->vpna_nx_port, err);
159 
160 		if (err != 0) {
161 			na_release_locked(&(*vpna)->vpna_up);
162 			*vpna = NULL;
163 		}
164 	}
165 
166 	return err;
167 }
168 
169 static int
170 fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
171 {
172 #pragma unused(fsw)
173 	nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
174 
175 	if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
176 		return EINVAL;
177 	}
178 
179 	/* it's a netif below */
180 	return 0;
181 }
182 
183 static int
184 fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
185     struct nx_flow_req *req)
186 {
187 	struct flow_owner *fo;
188 	int error = 0;
189 
190 	ASSERT(p != PROC_NULL);
191 
192 	if (p != kernproc) {
193 		/* special port shouldn't be bound via this method */
194 		if (req->nfr_nx_port < FSW_VP_USER_MIN) {
195 			return EINVAL;
196 		}
197 		req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
198 	} else {
199 		/* no flow track or advisory support for BSD flows */
200 		ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
201 		ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
202 		ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
203 	}
204 
205 	/* init kernel only fields */
206 	if (p != kernproc) {
207 		nx_flow_req_internalize(req);
208 	}
209 	req->nfr_pid = proc_pid(p);
210 	if (req->nfr_epid == -1) {
211 		req->nfr_epid = proc_pid(p);
212 	}
213 
214 	fo = fsw_flow_add(fsw, req, &error);
215 	ASSERT(fo != NULL || error != 0);
216 
217 	if (error == 0) {
218 		// user space doesn't need this flow stats object
219 		flow_stats_release(req->nfr_flow_stats);
220 	}
221 	if (p != kernproc) {
222 		nx_flow_req_externalize(req);
223 	}
224 
225 	return error;
226 }
227 
228 static int
229 fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
230     struct nx_flow_req *req)
231 {
232 	int err;
233 
234 	nx_flow_req_internalize(req);
235 	req->nfr_pid = proc_pid(p);
236 	err = fsw_flow_del(fsw, req, TRUE, NULL);
237 
238 	nx_flow_req_externalize(req);
239 	return err;
240 }
241 
242 static int
243 fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
244 {
245 	int error = 0;
246 	struct ifnet *ifp = hwna->na_ifp;
247 	struct kern_pbufpool *pp = skmem_arena_nexus(hwna->na_arena)->arn_rx_pp;
248 	size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;
249 
250 	ASSERT((hwna->na_type == NA_NETIF_HOST) ||
251 	    (hwna->na_type == NA_NETIF_COMPAT_HOST));
252 
253 	SK_LOCK_ASSERT_HELD();
254 
255 	/*
256 	 * XXX: we don't support non-TXSTART interfaces.
257 	 * There are assumptions in fsw_port_flush_enqueue_dst() about
258 	 * single-threaded writes to destination rings.
259 	 */
260 	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
261 		SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
262 		    SK_KVA(ifp));
263 		return ENOTSUP;
264 	}
265 
266 	FSW_WLOCK(fsw);
267 
268 	ASSERT(fsw->fsw_ifp == NULL);
269 	ASSERT(fsw->fsw_nifna == NULL);
270 	ASSERT(fsw->fsw_resolve == NULL);
271 	ASSERT(fsw->fsw_frame == NULL);
272 	ASSERT(fsw->fsw_demux == NULL);
273 	ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
274 	ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
275 	ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);
276 
277 	fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
278 	if (fsw->fsw_ipfm == NULL) {
279 		FSW_WUNLOCK(fsw);
280 		return ENOMEM;
281 	}
282 
283 	switch (ifp->if_family) {
284 	case IFNET_FAMILY_ETHERNET:
285 		error = fsw_ethernet_setup(fsw, ifp);
286 		fsw->fsw_ifp_dlt = DLT_EN10MB;
287 		break;
288 
289 	case IFNET_FAMILY_CELLULAR:
290 		error = fsw_cellular_setup(fsw, ifp);
291 		fsw->fsw_ifp_dlt = DLT_RAW;
292 		break;
293 
294 	default:
295 		if (ifp->if_family == IFNET_FAMILY_IPSEC ||
296 		    ifp->if_family == IFNET_FAMILY_UTUN) {
297 			error = fsw_ip_setup(fsw, ifp);
298 			fsw->fsw_ifp_dlt = DLT_RAW;
299 			break;
300 		}
301 		error = ENOTSUP;
302 		break;
303 	}
304 
305 	if (error != 0) {
306 		FSW_WUNLOCK(fsw);
307 		return error;
308 	}
309 
310 	ASSERT(fsw->fsw_resolve != NULL);
311 
312 	if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
313 	    srp_max_frags > 1 || pp->pp_max_frags > 1) {
314 		fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
315 		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
316 		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
317 	} else {
318 		fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
319 		fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
320 		fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
321 	}
322 
323 	/*
324 	 * Since it is possible for fsw to refer to the ifp after all
325 	 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
326 	 * an extra reference to the ifp here.
327 	 *
328 	 * We also cache the netif adapter of the interface, as it's
329 	 * needed for each packet enqueued to the classq.  There is no
330 	 * need to retain a refcnt for the same reason as above.
331 	 *
332 	 * We hold the busy lock across these, just in case an interface
333 	 * detach and reattach happens, as fsw_flow_bind() relies on the
334 	 * same lock as well before making its checks.
335 	 */
336 	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
337 
338 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
339 	fsw->fsw_ifp = ifp;
340 	fsw->fsw_nifna = &ifp->if_na->nifna_up;
341 	ifp->if_na->nifna_netif->nif_fsw = fsw;
342 	ifp->if_na->nifna_netif->nif_fsw_nxadv =
343 	    fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
344 	(void) strlcpy(fsw->fsw_flow_mgr->fm_name,
345 	    if_name(ifp), IFNAMSIZ);
346 
347 	fsw_classq_setup(fsw, hwna);
348 	fsw->fsw_classq_enabled = TRUE;
349 	fsw->fsw_src_lla_gencnt = 0;
350 
351 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
352 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
353 	    FSW_REAP_THREADNAME, ifp->if_xname, "");
354 	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
355 
356 	error = fsw_netagent_register(fsw, ifp);
357 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
358 	    "fsw_netagent_register %s (family %u) (err %d)",
359 	    if_name(ifp), ifp->if_family, error);
360 
361 	/*
362 	 * Clear NXF_REJECT to allow new channels to be opened
363 	 * to this nexus, in case this is an interface reattach.
364 	 * Otherwise this flag should already be cleared.
365 	 */
366 	if (error == 0) {
367 		atomic_bitclear_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);
368 	}
369 
370 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
371 
372 	/*
373 	 * Wake up the reaper thread.
374 	 */
375 	if (error == 0) {
376 		fsw_reap_sched(fsw);
377 	}
378 
379 	/* init skoid */
380 	skoid_create(&fsw->fsw_skoid,
381 	    SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
382 	    CTLFLAG_RW);
383 
384 	FSW_WUNLOCK(fsw);
385 
386 	return error;
387 }
388 
389 static void
390 fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
391 {
392 	struct ifnet *ifp;
393 
394 	SK_LOCK_ASSERT_HELD();
395 
396 	FSW_WLOCK_ASSERT_HELD(fsw);
397 	ifp = fsw->fsw_ifp;
398 	ASSERT(ifp != NULL);
399 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
400 
401 	fsw_netagent_unregister(fsw, ifp);
402 
403 	if (fsw->fsw_ipfm != NULL) {
404 		fsw_ip_frag_mgr_destroy(fsw->fsw_ipfm);
405 	}
406 
407 	skoid_destroy(&fsw->fsw_skoid);
408 
409 	SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
410 	    ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
411 	    if_name(ifp), ifp->if_family);
412 
413 	if (hwna != NULL) {
414 		fsw_classq_teardown(fsw, hwna);
415 	}
416 
417 	/*
418 	 * Set NXF_REJECT on the nexus, which would cause existing adapters
419 	 * to be marked similarly; channels associated with them would then
420 	 * cease to function.
421 	 */
422 	atomic_bitset_32(&fsw->fsw_nx->nx_flags, NXF_REJECT);
423 
424 	/* see notes on fsw_na_attach() about I/O refcnt */
425 	if (ifp->if_na != NULL) {
426 		ifp->if_na->nifna_netif->nif_fsw = NULL;
427 		ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
428 		membar_sync();
429 	}
430 
431 	fsw->fsw_ifp = NULL;
432 	fsw->fsw_nifna = NULL;
433 	fsw->fsw_resolve = NULL;
434 	fsw->fsw_frame = NULL;
435 	fsw->fsw_frame_headroom = 0;
436 	fsw->fsw_demux = NULL;
437 	fsw->fsw_classq_enabled = FALSE;
438 	fsw->fsw_pkt_copy_from_pkt = NULL;
439 	fsw->fsw_pkt_copy_from_mbuf = NULL;
440 	fsw->fsw_pkt_copy_to_mbuf = NULL;
441 
442 	if (ifp->if_input_netem != NULL) {
443 		netem_destroy(ifp->if_input_netem);
444 		ifp->if_input_netem = NULL;
445 	}
446 
447 	ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
448 	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
449 	    FSW_REAP_THREADNAME, if_name(ifp), "_detached");
450 	thread_set_thread_name(fsw->fsw_reap_thread, fsw->fsw_reap_name);
451 }
452 
453 static int
454 fsw_host_setup(struct nx_flowswitch *fsw)
455 {
456 	struct nexus_adapter *hwna;
457 	struct ifnet *ifp;
458 
459 	SK_LOCK_ASSERT_HELD();
460 
461 	hwna = fsw->fsw_host_ch->ch_na;
462 	ASSERT(hwna != NULL);
463 
464 
465 	/* the netif below must have an ifnet attached (dev/host port) */
466 	if ((ifp = hwna->na_ifp) == NULL) {
467 		return ENXIO;
468 	}
469 
470 	/*
471 	 * XXX: we don't support multiple rx rings yet.
472 	 * There are assumptions in fsw_port_flush_enqueue_dst() about
473 	 * single-threaded writes to destination rings.
474 	 */
475 	if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
476 		SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
477 		    SK_KVA(ifp), hwna->na_num_rx_rings);
478 		return ENOTSUP;
479 	}
480 
481 	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
482 	if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
483 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
484 		return EBUSY;
485 	}
486 	fsw->fsw_detach_flags = 0;
487 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
488 
489 	int error = fsw_setup_ifp(fsw, hwna);
490 	ASSERT(error != 0 || fsw->fsw_ifp != NULL);
491 	if (error != 0) {
492 		return error;
493 	}
494 
495 	/* update the interface index */
496 	ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
497 	NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
498 	return 0;
499 }
500 
501 static int
502 fsw_host_teardown(struct nx_flowswitch *fsw)
503 {
504 	struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
505 
506 	SK_LOCK_ASSERT_HELD();
507 	return fsw_detach(fsw, hwna, FALSE);
508 }
509 
510 #if SK_LOG
511 /* Hoisted out of line to reduce kernel stack footprint */
512 SK_LOG_ATTRIBUTE
513 static void
514 fsw_ctl_attach_log(const struct nx_spec_req *nsr,
515     const struct kern_nexus *nx, int err)
516 {
517 	uuid_string_t uuidstr, ifuuidstr;
518 	const char *nustr;
519 
520 	if (nsr->nsr_flags & NXSPECREQ_UUID) {
521 		nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
522 	} else if (nsr->nsr_flags & NXSPECREQ_IFP) {
523 		(void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
524 		    SK_KVA(nsr->nsr_ifp));
525 		nustr = uuidstr;
526 	} else {
527 		nustr = nsr->nsr_name;
528 	}
529 
530 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
531 	    "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
532 	    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
533 	    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
534 }
535 #endif /* SK_LOG */
536 
537 SK_NO_INLINE_ATTRIBUTE
538 static void
539 fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
540 {
541 	struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
542 
543 	ASSERT(hwna->na_type == NA_NETIF_DEV ||
544 	    hwna->na_type == NA_NETIF_COMPAT_DEV);
545 
546 	if (set) {
547 		netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
548 	} else {
549 		netif_hwna_clear_mode(hwna);
550 	}
551 }
552 
553 SK_NO_INLINE_ATTRIBUTE
554 static void
555 fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
556 {
557 	fsw_netif_set_callbacks_common(fsw, TRUE);
558 }
559 
560 SK_NO_INLINE_ATTRIBUTE
561 static void
562 fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
563 {
564 	fsw_netif_set_callbacks_common(fsw, FALSE);
565 }
566 
567 SK_NO_INLINE_ATTRIBUTE
568 static void
569 fsw_dp_start(struct nx_flowswitch *fsw)
570 {
571 	ASSERT(fsw->fsw_dev_ch != NULL);
572 	ASSERT(fsw->fsw_host_ch != NULL);
573 
574 	fsw_netif_set_callbacks(fsw);
575 	na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
576 	na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
577 }
578 
579 SK_NO_INLINE_ATTRIBUTE
580 static void
581 fsw_dp_stop(struct nx_flowswitch *fsw)
582 {
583 	ASSERT(fsw->fsw_dev_ch != NULL);
584 	ASSERT(fsw->fsw_host_ch != NULL);
585 
586 	na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
587 	na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
588 	fsw_netif_clear_callbacks(fsw);
589 
590 	FSW_WLOCK(fsw);
591 	fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
592 	FSW_WUNLOCK(fsw);
593 }
594 
595 SK_NO_INLINE_ATTRIBUTE
596 static int
597 fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
598     boolean_t host)
599 {
600 	struct chreq chr;
601 	struct kern_channel *ch;
602 	int err;
603 
604 	bzero(&chr, sizeof(chr));
605 	uuid_copy(chr.cr_spec_uuid, hw_nx->nx_uuid);
606 	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
607 	chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
608 	chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);
609 
610 	err = 0;
611 	ch = ch_open_special(hw_nx, &chr, FALSE, &err);
612 	if (ch == NULL) {
613 		SK_ERR("ch_open_special(%s) failed: %d",
614 		    host ? "host" : "dev", err);
615 		return err;
616 	}
617 	if (host) {
618 		fsw->fsw_host_ch = ch;
619 	} else {
620 		fsw->fsw_dev_ch = ch;
621 	}
622 	return 0;
623 }
624 
625 SK_NO_INLINE_ATTRIBUTE
626 static int
627 fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
628 {
629 	struct kern_channel *ch;
630 
631 	ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
632 	if (ch == NULL) {
633 		return EINVAL;
634 	}
635 	if (host) {
636 		fsw->fsw_host_ch = NULL;
637 	} else {
638 		fsw->fsw_dev_ch = NULL;
639 	}
640 	ch_close_special(ch);
641 	(void) ch_release_locked(ch);
642 	return 0;
643 }
644 
645 SK_NO_INLINE_ATTRIBUTE
646 static int
647 fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
648 {
649 	return fsw_netif_port_setup(fsw, hw_nx, FALSE);
650 }
651 
652 SK_NO_INLINE_ATTRIBUTE
653 static int
654 fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
655 {
656 	return fsw_netif_port_setup(fsw, hw_nx, TRUE);
657 }
658 
659 SK_NO_INLINE_ATTRIBUTE
660 static int
661 fsw_devna_teardown(struct nx_flowswitch *fsw)
662 {
663 	return fsw_netif_port_teardown(fsw, FALSE);
664 }
665 
666 SK_NO_INLINE_ATTRIBUTE
667 static int
668 fsw_hostna_teardown(struct nx_flowswitch *fsw)
669 {
670 	return fsw_netif_port_teardown(fsw, TRUE);
671 }
672 
673 /* Process NXCFG_CMD_ATTACH */
674 SK_NO_INLINE_ATTRIBUTE
675 static int
676 fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
677 {
678 #pragma unused(p)
679 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
680 	struct kern_nexus *hw_nx = NULL;
681 	int err = 0;
682 
683 	SK_LOCK_ASSERT_HELD();
684 
685 	/*
686 	 * The flowswitch only accepts UUID as an identifier, since it
687 	 * represents the UUID of the kernel object we are trying to
688 	 * attach to this flowswitch.
689 	 */
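	/*
	 * Illustrative request setup (a hedged sketch; "netif_nx_uuid" is
	 * a hypothetical variable holding the netif nexus instance UUID):
	 *
	 *	nsr->nsr_flags = NXSPECREQ_UUID;
	 *	uuid_copy(nsr->nsr_uuid, netif_nx_uuid);
	 *
	 * Anything else (NXSPECREQ_IFP, or a null UUID) is rejected below.
	 */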
690 	if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
691 	    NXSPECREQ_UUID || uuid_is_null(nsr->nsr_uuid)) {
692 		err = EINVAL;
693 		goto done;
694 	}
695 
696 	if (fsw->fsw_dev_ch != NULL) {
697 		ASSERT(fsw->fsw_host_ch != NULL);
698 		err = EEXIST;
699 		goto done;
700 	}
701 
702 	hw_nx = nx_find(nsr->nsr_uuid, TRUE);
703 	if (hw_nx == NULL) {
704 		err = ENOENT;
705 		goto done;
706 	} else if (hw_nx == nx) {
707 		err = EINVAL;
708 		goto done;
709 	}
710 
711 	/* preflight check to see if the nexus is attachable to us */
712 	err = fsw_nx_check(fsw, hw_nx);
713 	if (err != 0) {
714 		goto done;
715 	}
716 
717 	err = fsw_devna_setup(fsw, hw_nx);
718 	if (err != 0) {
719 		goto done;
720 	}
721 
722 	err = fsw_hostna_setup(fsw, hw_nx);
723 	if (err != 0) {
724 		(void) fsw_devna_teardown(fsw);
725 		goto done;
726 	}
727 
728 	err = fsw_host_setup(fsw);
729 	if (err != 0) {
730 		(void) fsw_hostna_teardown(fsw);
731 		(void) fsw_devna_teardown(fsw);
732 		goto done;
733 	}
734 
735 	fsw_dp_start(fsw);
736 
737 	/* return the devna UUID */
738 	uuid_copy(nsr->nsr_if_uuid, fsw->fsw_dev_ch->ch_na->na_uuid);
739 	ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
740 done:
741 #if SK_LOG
742 	if (__improbable(sk_verbose != 0)) {
743 		fsw_ctl_attach_log(nsr, nx, err);
744 	}
745 #endif /* SK_LOG */
746 
747 	if (hw_nx != NULL) {
748 		nx_release_locked(hw_nx);
749 	}
750 
751 	return err;
752 }
753 
754 SK_NO_INLINE_ATTRIBUTE
755 static void
756 fsw_cleanup(struct nx_flowswitch *fsw)
757 {
758 	int err;
759 
760 	if (fsw->fsw_dev_ch == NULL) {
761 		ASSERT(fsw->fsw_host_ch == NULL);
762 		return;
763 	}
764 	fsw_dp_stop(fsw);
765 	err = fsw_host_teardown(fsw);
766 	VERIFY(err == 0);
767 
768 	err = fsw_hostna_teardown(fsw);
769 	VERIFY(err == 0);
770 
771 	err = fsw_devna_teardown(fsw);
772 	VERIFY(err == 0);
773 }
774 
775 int
776 fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
777     struct nx_spec_req *nsr)
778 {
779 #pragma unused(p)
780 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
781 	int err = 0;
782 
783 	SK_LOCK_ASSERT_HELD();
784 
785 	/*
786 	 * nsr is NULL when we're called from the destructor, and it
787 	 * implies that we'll detach everything that is attached.
788 	 */
789 	if (nsr == NULL) {
790 		fsw_cleanup(fsw);
791 		ASSERT(fsw->fsw_dev_ch == NULL);
792 		ASSERT(fsw->fsw_host_ch == NULL);
793 		goto done;
794 	}
795 
796 	if (uuid_is_null(nsr->nsr_if_uuid)) {
797 		err = EINVAL;
798 		goto done;
799 	} else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
800 		err = ENXIO;
801 		goto done;
802 	}
803 
804 	/* check if the devna uuid is correct */
805 	if (uuid_compare(nsr->nsr_if_uuid,
806 	    fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
807 		err = ESRCH;
808 		goto done;
809 	}
810 	fsw_cleanup(fsw);
811 
812 done:
813 #if SK_LOG
814 	if (nsr != NULL) {
815 		uuid_string_t ifuuidstr;
816 		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
817 		    "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
818 		    SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
819 		    sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
820 		    nsr->nsr_flags, err);
821 	} else {
822 		SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
823 		    "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
824 		    NX_DOM_PROV(nx)->nxdom_prov_name, err);
825 	}
826 #endif /* SK_LOG */
827 
828 	return err;
829 }
830 
831 static int
832 fsw_netem_config(struct nx_flowswitch *fsw, void *data)
833 {
834 	struct ifnet *ifp = fsw->fsw_ifp;
835 	struct if_netem_params *params = data;
836 	int ret;
837 
838 	if (ifp == NULL) {
839 		return ENODEV;
840 	}
841 
842 	SK_LOCK_ASSERT_HELD();
843 #define fsw_INPUT_NETEM_THREADNAME   "if_input_netem_%s@fsw"
844 #define fsw_INPUT_NETEM_THREADNAME_LEN       32
845 	char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
846 	(void) snprintf(netem_name, sizeof(netem_name),
847 	    fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
848 	ret = netem_config(&ifp->if_input_netem, netem_name, params, fsw,
849 	    fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);
850 
851 	return ret;
852 }
853 
854 int
855 fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
856     void *data)
857 {
858 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
859 	struct nx_spec_req *nsr = data;
860 	struct nx_flow_req *req = data;
861 	boolean_t need_check;
862 	int error = 0;
863 
864 	switch (nc_cmd) {
865 	case NXCFG_CMD_FLOW_ADD:
866 	case NXCFG_CMD_FLOW_DEL:
867 		if (uuid_is_null(req->nfr_flow_uuid)) {
868 			error = EINVAL;
869 			goto done;
870 		}
871 		if (p != kernproc) {
872 			req->nfr_flags &= NXFLOWREQF_MASK;
873 		}
874 		req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
875 
876 		if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
877 			break;
878 		}
879 
880 		need_check = FALSE;
881 		if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
882 			need_check = TRUE;
883 		} else if (!uuid_is_null(req->nfr_euuid)) {
884 			uuid_t uuid;
885 
886 			/* get the UUID of the issuing process */
887 			proc_getexecutableuuid(p, uuid, sizeof(uuid));
888 
889 			/*
890 			 * If this is not issued by a process for its own
891 			 * executable UUID and if the process does not have
892 			 * the necessary privilege, reject the request.
893 			 * The logic is similar to so_set_effective_uuid().
894 			 */
895 			if (uuid_compare(req->nfr_euuid, uuid) != 0) {
896 				need_check = TRUE;
897 			}
898 		}
899 		if (need_check) {
900 			kauth_cred_t cred = kauth_cred_proc_ref(p);
901 			error = priv_check_cred(cred,
902 			    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
903 			kauth_cred_unref(&cred);
904 			if (error != 0) {
905 				goto done;
906 			}
907 		}
908 		break;
909 
910 	default:
911 		break;
912 	}
913 
914 	switch (nc_cmd) {
915 	case NXCFG_CMD_ATTACH:
916 		error = fsw_ctl_attach(nx, p, nsr);
917 		break;
918 
919 	case NXCFG_CMD_DETACH:
920 		error = fsw_ctl_detach(nx, p, nsr);
921 		break;
922 
923 	case NXCFG_CMD_FLOW_ADD:       /* struct nx_flow_req */
924 		error = fsw_ctl_flow_add(fsw, p, data);
925 		break;
926 
927 	case NXCFG_CMD_FLOW_DEL:     /* struct nx_flow_req */
928 		error = fsw_ctl_flow_del(fsw, p, data);
929 		break;
930 	case NXCFG_CMD_NETEM:           /* struct if_netem_params */
931 		error = fsw_netem_config(fsw, data);
932 		break;
933 
934 	default:
935 		SK_ERR("invalid cmd %u", nc_cmd);
936 		error = EINVAL;
937 		break;
938 	}
939 
940 done:
941 	return error;
942 }
943 
944 struct nx_flowswitch *
945 fsw_ifp_to_fsw(struct ifnet *ifp)
946 {
947 	struct nx_flowswitch *fsw = NULL;
948 
949 	if (ifp->if_na != NULL) {
950 		fsw = ifp->if_na->nifna_netif->nif_fsw;
951 	}
952 	return fsw;
953 }
954 
955 static void
956 fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
957     struct ifnet *ifp, struct sockaddr *ip_addr __unused,
958     intf_event_code_t intf_ev_code)
959 {
960 	struct nx_flowswitch *fsw = NULL;
961 
962 	if (ifp->if_na == NULL) {
963 		return;
964 	}
965 
966 	SK_LOCK();
967 	fsw = fsw_ifp_to_fsw(ifp);
968 	if (fsw != NULL) {
969 		switch (intf_ev_code) {
970 		case INTF_EVENT_CODE_LLADDR_UPDATE:
971 			if ((fsw->fsw_ifp == NULL) ||
972 			    (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
973 				break;
974 			}
975 
976 			VERIFY(fsw->fsw_ifp == ifp);
977 			SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
978 			    if_name(fsw->fsw_ifp));
979 			(void) ifnet_lladdr_copy_bytes(ifp, fsw->fsw_ether_shost,
980 			    ETHER_ADDR_LEN);
981 			atomic_add_32(&fsw->fsw_src_lla_gencnt, 1);
982 			break;
983 
984 		case INTF_EVENT_CODE_LOW_POWER_UPDATE:
985 			if (fsw->fsw_ifp == NULL) {
986 				break;
987 			}
988 
989 			VERIFY(fsw->fsw_ifp == ifp);
990 
991 			if (ifp->if_xflags & IFXF_LOW_POWER) {
992 				SK_DF(SK_VERB_FSW,
993 				    "Low power mode updated for %s",
994 				    if_name(fsw->fsw_ifp));
995 
996 				fsw_reap_sched(fsw);
997 			}
998 			break;
999 
1000 		default:
1001 			break;
1002 		}
1003 	}
1004 	SK_UNLOCK();
1005 }
1006 
1007 static void
1008 fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
1009     struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
1010     uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
1011     struct protoctl_ev_val *p_val)
1012 {
1013 #pragma unused(ee_arg)
1014 	struct nx_flowswitch *fsw = NULL;
1015 	struct flow_entry *fe = NULL;
1016 	boolean_t netagent_update_flow = FALSE;
1017 	uuid_t fe_uuid;
1018 
1019 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1020 		return;
1021 	}
1022 
1023 	/*
1024 	 * XXX Right now only handle the event if we have enough
1025 	 * information to match the entire flow.
1026 	 */
1027 	if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
1028 		return;
1029 	}
1030 
1031 	SK_LOCK();
1032 	fsw = fsw_ifp_to_fsw(ifp);
1033 	if (fsw == NULL) {
1034 		goto out;
1035 	}
1036 
1037 	if (!fsw_detach_barrier_add(fsw)) {
1038 		fsw = NULL;
1039 		SK_ERR("netagent detached");
1040 		goto out;
1041 	}
1042 
1043 	struct flow_key fk __sk_aligned(16);
1044 	FLOW_KEY_CLEAR(&fk);
1045 	fk.fk_proto = proto;
1046 	if (p_laddr->sa_family == AF_INET) {
1047 		fk.fk_ipver = IPVERSION;
1048 		fk.fk_src4 = SIN(p_laddr)->sin_addr;
1049 		fk.fk_dst4 = SIN(p_raddr)->sin_addr;
1050 	} else {
1051 		fk.fk_ipver = IPV6_VERSION;
1052 		fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
1053 		fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
1054 	}
1055 	fk.fk_sport = lport;
1056 	fk.fk_dport = rport;
1057 	fk.fk_mask = FKMASK_5TUPLE;
1058 
1059 	fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
1060 	if (__improbable(fe == NULL)) {
1061 		goto out;
1062 	}
1063 
1064 	uuid_copy(fe_uuid, fe->fe_uuid);
1065 	/*
1066 	 * If the protocol notification is for TCP, make sure the
1067 	 * protocol event received is for bytes in flight.
1068 	 * XXX Redirect events are not delivered as protocol events
1069 	 * but as better route events.
1070 	 * Also redirect events do not indicate loss of the packet.
1071 	 */
1072 	if (proto != IPPROTO_TCP) {
1073 		p_val->tcp_seq_number = 0;
1074 	}
1075 
1076 	netagent_update_flow = TRUE;
1077 
1078 out:
1079 	SK_UNLOCK();
1080 
1081 	if (netagent_update_flow) {
1082 		int error = 0;
1083 #if SK_LOG
1084 		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
1085 		SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
1086 		    "event %d with value %d and tcp sequence number %d",
1087 		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
1088 		    protoctl_event_code, p_val->val, p_val->tcp_seq_number);
1089 #endif /* SK_LOG */
1090 		if ((error = netagent_update_flow_protoctl_event(
1091 			    fsw->fsw_agent_session, fe_uuid, protoctl_event_code,
1092 			    p_val->val, p_val->tcp_seq_number)) != 0) {
1093 #if SK_LOG
1094 			SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
1095 			    "flow entry \"%s\" for protocol event %d with "
1096 			    "value %d and tcp sequence number %d", error,
1097 			    dbgbuf, protoctl_event_code, p_val->val,
1098 			    p_val->tcp_seq_number);
1099 #endif /* SK_LOG */
1100 		}
1101 	}
1102 
1103 	if (fe != NULL) {
1104 		flow_entry_release(&fe);
1105 	}
1106 
1107 	if (fsw != NULL) {
1108 		fsw_detach_barrier_remove(fsw);
1109 	}
1110 }
1111 
1112 int
1113 fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
1114 {
1115 	struct nx_flowswitch *fsw = NULL;
1116 	int error = 0;
1117 
1118 	SK_LOCK_ASSERT_HELD();
1119 	VERIFY(nx != NULL);
1120 	VERIFY(NX_PROV(nx) != NULL);
1121 	VERIFY(NX_DOM_PROV(nx) != NULL);
1122 
1123 	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
1124 		error = EINVAL;
1125 		goto out;
1126 	}
1127 
1128 	fsw = NX_FSW_PRIVATE(nx);
1129 	VERIFY(fsw != NULL);
1130 	FSW_WLOCK(fsw);
1131 
1132 	if (fsw->fsw_agent_session == NULL) {
1133 		error = ENXIO;
1134 		goto out;
1135 	}
1136 
1137 	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
1138 
1139 	if (add) {
1140 		if (FSW_NETAGENT_ADDED(fsw)) {
1141 			/* agent already added */
1142 			error = EEXIST;
1143 		} else {
1144 			fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
1145 			if (if_is_fsw_netagent_enabled()) {
1146 				fsw->fsw_state_flags
1147 				        |= FSW_STATEF_NETAGENT_ENABLED;
1148 			}
1149 			if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
1150 			SK_D("flowswitch netagent added for interface %s",
1151 			    if_name(fsw->fsw_ifp));
1152 		}
1153 	} else {
1154 		if (!FSW_NETAGENT_ADDED(fsw)) {
1155 			/* agent has not been added */
1156 			error = ENOENT;
1157 		} else {
1158 			fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
1159 			    FSW_STATEF_NETAGENT_ENABLED);
1160 			if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
1161 			SK_D("flowswitch netagent removed for interface %s",
1162 			    if_name(fsw->fsw_ifp));
1163 		}
1164 	}
1165 out:
1166 	if (fsw != NULL) {
1167 		FSW_UNLOCK(fsw);
1168 	}
1169 	return error;
1170 }
1171 
1172 void
1173 fsw_netagent_update(struct kern_nexus *nx)
1174 {
1175 	struct nx_flowswitch *fsw = NULL;
1176 
1177 	SK_LOCK_ASSERT_HELD();
1178 	VERIFY(nx != NULL);
1179 	VERIFY(NX_PROV(nx) != NULL);
1180 	VERIFY(NX_DOM_PROV(nx) != NULL);
1181 
1182 	if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
1183 		goto out;
1184 	}
1185 	fsw = NX_FSW_PRIVATE(nx);
1186 	VERIFY(fsw != NULL);
1187 	FSW_WLOCK(fsw);
1188 	if (fsw->fsw_agent_session == NULL) {
1189 		goto out;
1190 	}
1191 	ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
1192 	uint32_t flags = netagent_get_flags(fsw->fsw_agent_uuid);
1193 	const bool ip_agent = ifnet_needs_fsw_ip_netagent(fsw->fsw_ifp);
1194 	const bool transport_agent = ifnet_needs_fsw_transport_netagent(fsw->fsw_ifp);
1195 	if (ip_agent || transport_agent) {
1196 		flags |= NETAGENT_FLAG_NEXUS_LISTENER;
1197 	} else {
1198 		flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
1199 	}
1200 	if (transport_agent) {
1201 		flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
1202 	} else {
1203 		flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
1204 	}
1205 	if (ip_agent) {
1206 		flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1207 	} else {
1208 		flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1209 	}
1210 	if (netagent_set_flags(fsw->fsw_agent_uuid, flags) == 0) {
1211 		SK_D("flowswitch netagent updated for interface %s",
1212 		    if_name(fsw->fsw_ifp));
1213 	}
1214 out:
1215 	if (fsw != NULL) {
1216 		FSW_UNLOCK(fsw);
1217 	}
1218 }
1219 
1220 static int
1221 fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1222     const struct nxbind *nxb)
1223 {
1224 #pragma unused(nxb)
1225 	int err = 0;
1226 
1227 	SK_LOCK_ASSERT_HELD();
1228 	ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
1229 	    vpna->vpna_pid == nxb->nxb_pid);
1230 
1231 	/*
1232 	 * Reject regular channel open requests unless there is
1233 	 * something attached to the host port of the flowswitch.
1234 	 */
1235 	if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
1236 		struct nexus_adapter *na = &vpna->vpna_up;
1237 		struct ifnet *ifp = fsw->fsw_ifp;
1238 
1239 		if (ifp == NULL) {
1240 			err = ENXIO;
1241 			goto done;
1242 		}
1243 
1244 		/* if adapter supports mitigation, set default value */
1245 		if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
1246 			if (IFNET_IS_WIFI(ifp)) {
1247 				na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
1248 			} else if (IFNET_IS_CELLULAR(ifp)) {
1249 				na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
1250 			} else if (IFNET_IS_ETHERNET(ifp)) {
1251 				na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
1252 			} else {
1253 				na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
1254 			}
1255 		}
1256 	}
1257 
1258 done:
1259 	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
1260 	    "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
1261 	    "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
1262 	    vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);
1263 
1264 	return err;
1265 }
1266 
1267 static bool
1268 fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1269 {
1270 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1271 	nexus_port_t nx_port = vpna->vpna_nx_port;
1272 	uint32_t purge_cnt;
1273 
1274 	ASSERT(fsw == vpna->vpna_fsw);
1275 	ASSERT(nx_port != NEXUS_PORT_ANY);
1276 
1277 	/*
1278 	 * If this nexus port was bound to a PID, we just need to look at a
1279 	 * single bucket and iterate from there.  Note that in any case, we
1280 	 * can't just search for a single flow_owner based on the PID itself,
1281 	 * since a given process may be opening multiple channels to the
1282 	 * flowswitch; hence we search for the ones matching this nexus port.
1283 	 *
1284 	 * Close any open flows on the port and remove the flow owner and
1285 	 * nexus port binding.
1286 	 */
1287 	purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1288 	    vpna->vpna_pid, nx_port, FALSE);
1289 
1290 	SK_DF(SK_VERB_FSW,
1291 	    "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
1292 	    "purged %u", SK_KVA(fsw), (int)nx_port,
1293 	    vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1294 	    purge_cnt);
1295 
1296 	return purge_cnt != 0;
1297 }
1298 
1299 /*
1300  * Flowswitch nexus port allocator.
1301  *
1302  * A nexus port is represented by a bit in the port bitmap; its state is
1303  * either free or allocated.  A free state implies that the port has no
1304  * nxbind AND no nexus adapter association.  An allocated state means that
1305  * either it has a nxbind OR a nexus adapter assocation.  This routine
1306  * either it has a nxbind OR a nexus adapter association.  This routine
1307  * handled separately via nx_fsw_port_bind().
1308  *
1309  * The caller of this routine may optionally pass in a NULL nexus adapter.
1310  * In such a case (*vpna is NULL), this routine checks to see if the port
1311  * has already been associated with an adapter, and returns a reference to
1312  * that adapter.  No action is taken on a port that doesn't have an adapter
1313  * associated.  Otherwise (*vpna is non-NULL), this routine associates that
1314  * adapter with a port that's not already associated with one; the reference
1315  * to the adapter is untouched here, as the caller is expected to handle it.
1316  *
1317  * The flowswitch code invokes this routine each time it is requested to
1318  * find an adapter via nx_fsw_na_find().  The counterpart of this routine,
1319  * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1320  * This allows for multiple channels to be opened to a nexus port, each
1321  * time holding a reference to that same nexus adapter.  The releasing of
1322  * the nexus port only happens when the last channel closes.
1323  */
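/*
 * Illustrative sketch of the two calling patterns described above, modeled
 * on fsw_attach_vp() (error handling elided):
 *
 *	struct nexus_vp_adapter *vpna = NULL;
 *
 *	// lookup: on return, *vpna holds a reference to the adapter
 *	// already occupying nx_port, or remains NULL if the port is vacant
 *	err = fsw_port_alloc(fsw, nxb, &vpna, nx_port, p, FALSE, FALSE);
 *
 *	// associate: with *vpna non-NULL, that adapter is bound to
 *	// nx_port; its reference is left for the caller to manage
 *	err = fsw_port_alloc(fsw, nxb, &vpna, nx_port, p, FALSE, FALSE);
 */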
1324 static int
1325 fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
1326     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
1327 {
1328 	struct kern_nexus *nx = fsw->fsw_nx;
1329 	boolean_t refonly = FALSE;
1330 	int error = 0;
1331 
1332 	FSW_WLOCK_ASSERT_HELD(fsw);
1333 
1334 	error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
1335 	if (error == 0 && *vpna != NULL && !refonly) {
1336 		/* initialize the nexus port and the adapter occupying it */
1337 		(*vpna)->vpna_fsw = fsw;
1338 		(*vpna)->vpna_nx_port = nx_port;
1339 		(*vpna)->vpna_pid = proc_pid(p);
1340 		if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
1341 			ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
1342 			(*vpna)->vpna_pid_bound = TRUE;
1343 		} else {
1344 			(*vpna)->vpna_pid_bound = FALSE;
1345 		}
1346 
1347 		error = fsw_port_ctor(fsw, *vpna, nxb);
1348 		if (error != 0) {
1349 			fsw_port_free(fsw, (*vpna),
1350 			    (*vpna)->vpna_nx_port, FALSE);
1351 		}
1352 	}
1353 
1354 #if SK_LOG
1355 	if (*vpna != NULL) {
1356 		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
1357 		    "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
1358 		    "%sport %d refonly %u (err %d)",
1359 		    (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
1360 		    nx_fsw_dom_port_is_reserved(nx, nx_port) ?
1361 		    "[reserved] " : "", (int)nx_port, refonly, error);
1362 	} else {
1363 		SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
1364 		    "+++ fsw 0x%llx nx_port %d refonly %u "
1365 		    "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
1366 	}
1367 #endif /* SK_LOG */
1368 
1369 	return error;
1370 }
1371 
1372 int
1373 fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1374     struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1375     boolean_t ifattach, boolean_t host)
1376 {
1377 	int err = 0;
1378 
1379 	FSW_WLOCK_ASSERT_HELD(fsw);
1380 
1381 	if (ifattach) {
1382 		/* override port to either NX_FSW_{HOST,DEV} */
1383 		nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1384 		/* allocate reserved port for ifattach */
1385 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1386 	} else if (host) {
1387 		/* host is valid only for ifattach */
1388 		err = EINVAL;
1389 	} else {
1390 		/* nexus port otherwise (reserve dev and host for ifattach) */
1391 		err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1392 	}
1393 
1394 	return err;
1395 }
1396 
1397 /*
1398  * Remove nexus port association from a nexus adapter.  This call is
1399  * the opposite of fsw_port_alloc(), except that it is called only
1400  * at nx_fsw_vp_na_dtor() destructor time.  See above notes
1401  * on fsw_port_alloc().
1402  */
1403 void
1404 fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1405     nexus_port_t nx_port, boolean_t defunct)
1406 {
1407 	struct kern_nexus *nx = fsw->fsw_nx;
1408 
1409 	FSW_WLOCK_ASSERT_HELD(fsw);
1410 	ASSERT(vpna->vpna_fsw == fsw);
1411 
1412 	if (defunct) {
1413 		vpna->vpna_defunct = TRUE;
1414 		nx_port_defunct(nx, nx_port);
1415 	}
1416 
1417 	bool destroyed = fsw_port_dtor(fsw, vpna);
1418 	if (destroyed) {
1419 		/*
1420 		 * If the extension's destructor no longer needs to be
1421 		 * bound to any channel client, release the binding.
1422 		 */
1423 		nx_port_unbind(nx, nx_port);
1424 	}
1425 
1426 	/*
1427 	 * If this is a defunct, then stop here as the port is still
1428 	 * occupied by the channel.  We'll come here again later when
1429 	 * the actual close happens.
1430 	 */
1431 	if (defunct) {
1432 		return;
1433 	}
1434 
1435 	SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
1436 	    "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
1437 	    SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);
1438 
1439 	nx_port_free(nx, nx_port);
1440 	vpna->vpna_fsw = NULL;
1441 	vpna->vpna_nx_port = NEXUS_PORT_ANY;
1442 	vpna->vpna_pid_bound = FALSE;
1443 	vpna->vpna_pid = -1;
1444 	vpna->vpna_defunct = FALSE;
1445 }
1446 
1447 int
1448 fsw_port_na_activate(struct nx_flowswitch *fsw,
1449     struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
1450 {
1451 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1452 	uint32_t fo_cnt = 0;
1453 
1454 	SK_LOCK_ASSERT_HELD();
1455 
1456 	/* The following code relies on the static value asserted below */
1457 	_CASSERT(FSW_VP_DEV == 0);
1458 	_CASSERT(FSW_VP_HOST == 1);
1459 
1460 	ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
1461 	ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);
1462 
1463 	switch (mode) {
1464 	case NA_ACTIVATE_MODE_ON:
1465 		break;
1466 
1467 	case NA_ACTIVATE_MODE_DEFUNCT:
1468 		break;
1469 
1470 	case NA_ACTIVATE_MODE_OFF:
1471 		break;
1472 
1473 	default:
1474 		VERIFY(0);
1475 		/* NOTREACHED */
1476 		__builtin_unreachable();
1477 	}
1478 
1479 	/* nothing further to do for special ports */
1480 	if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
1481 		goto done;
1482 	}
1483 
1484 	/* activate any flow owner related resources (e.g. flowadv), if any */
1485 	fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
1486 	    vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);
1487 
1488 done:
1489 	SK_DF(SK_VERB_FSW,
1490 	    "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
1491 	    SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
1492 	    vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);
1493 
1494 	return 0;
1495 }
1496 
1497 int
1498 fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1499 {
1500 	int err = 0;
1501 
1502 	SK_LOCK_ASSERT_HELD();
1503 	ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1504 
1505 	/*
1506 	 * During defunct, we want to purge all flows associated to this
1507 	 * port and the flow owner as well.  This is accomplished as part
1508 	 * of calling the port's destructor.  However, we still want to
1509 	 * occupy the nexus port since there's a channel open to it.
1510 	 */
1511 	FSW_WLOCK(fsw);
1512 	if (!vpna->vpna_defunct) {
1513 		fsw_port_free(fsw, vpna, vpna->vpna_nx_port, TRUE);
1514 	} else {
1515 		err = EALREADY;
1516 	}
1517 	FSW_WUNLOCK(fsw);
1518 
1519 	return err;
1520 }
1521 
1522 static size_t
1523 fsw_mib_get_flow(struct nx_flowswitch *fsw,
1524     struct nexus_mib_filter *filter, void *out, size_t len)
1525 {
1526 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1527 	size_t sf_size = sizeof(struct sk_stats_flow);
1528 	__block size_t actual_space = 0;
1529 	__block struct sk_stats_flow *sf = out;
1530 	struct flow_entry *fe;
1531 
1532 	FSW_LOCK_ASSERT_HELD(fsw);
1533 
1534 	if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
1535 		fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
1536 		if (fe != NULL) {
1537 			if (out != NULL && len >= sf_size) {
1538 				flow_entry_stats_get(fe, sf);
1539 			}
1540 
1541 			flow_entry_release(&fe);
1542 			return sf_size;
1543 		}
1544 		return 0;
1545 	} else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
1546 		struct info_tuple *itpl = &filter->nmf_info_tuple;
1547 		struct flow_key fk;
1548 		bzero(&fk, sizeof(fk));
1549 		if (itpl->itpl_local_sa.sa_family == AF_INET &&
1550 		    itpl->itpl_remote_sa.sa_family == AF_INET) {
1551 			fk.fk_mask = FKMASK_5TUPLE;
1552 			fk.fk_ipver = IPVERSION;
1553 			fk.fk_proto = itpl->itpl_proto;
1554 			fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
1555 			fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
1556 			fk.fk_sport = itpl->itpl_local_sin.sin_port;
1557 			fk.fk_dport = itpl->itpl_remote_sin.sin_port;
1558 		} else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
1559 		    itpl->itpl_remote_sa.sa_family == AF_INET6) {
1560 			fk.fk_mask = FKMASK_5TUPLE;
1561 			fk.fk_ipver = IPV6_VERSION;
1562 			fk.fk_proto = itpl->itpl_proto;
1563 			fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
1564 			fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
1565 			fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
1566 			fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
1567 		} else {
1568 			SK_ERR("invalid info tuple: local af %d remote af %d",
1569 			    itpl->itpl_local_sa.sa_family,
1570 			    itpl->itpl_remote_sa.sa_family);
1571 			return 0;
1572 		}
1573 
1574 		fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
1575 		if (fe != NULL) {
1576 			if (out != NULL && len >= sf_size) {
1577 				flow_entry_stats_get(fe, sf);
1578 			}
1579 			flow_entry_release(&fe);
1580 			return sf_size;
1581 		}
1582 		return 0;
1583 	}
1584 
1585 	flow_mgr_foreach_flow(fsw->fsw_flow_mgr, ^(struct flow_entry *_fe) {
1586 		actual_space += sf_size;
1587 
1588 		if (out == NULL || actual_space > len) {
1589 		        return;
1590 		}
1591 
1592 		flow_entry_stats_get(_fe, sf);
1593 		sf++;
1594 	});
1595 
1596 	/*
1597 	 * Also return the ones on the deferred free list.
1598 	 */
1599 	lck_mtx_lock(&fsw->fsw_linger_lock);
1600 	TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
1601 		actual_space += sf_size;
1602 		if (out == NULL || actual_space > len) {
1603 			continue;
1604 		}
1605 
1606 		flow_entry_stats_get(fe, sf);
1607 		sf++;
1608 	}
1609 	lck_mtx_unlock(&fsw->fsw_linger_lock);
1610 
1611 	return actual_space;
1612 }
1613 
1614 static size_t
1615 fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
1616     struct nexus_mib_filter *filter, void *out, size_t len)
1617 {
1618 #pragma unused(filter)
1619 	uint32_t fae_idx;
1620 	size_t actual_space = 0;
1621 	struct kern_channel *ch = NULL;
1622 	struct sk_stats_flow_adv *sfa = NULL;
1623 	struct sk_stats_flow_adv_ent *sfae = NULL;
1624 	struct __flowadv_entry *fae = NULL;
1625 	size_t sfa_size = sizeof(struct sk_stats_flow_adv);
1626 	size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
1627 	uint32_t max_flowadv =
1628 	    fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;
1629 
1630 	SK_LOCK_ASSERT_HELD();
1631 
1632 	sfa = out;
1633 	/* copyout flow advisory table (allocated entries only) */
1634 	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
1635 		struct skmem_arena *ar;
1636 		struct skmem_arena_nexus *arn;
1637 		struct nexus_adapter *na;
1638 
1639 		/* ch_lock isn't needed here since sk_lock is held */
1640 		if ((ch->ch_flags & CHANF_CLOSING) ||
1641 		    (na = ch->ch_na) == NULL) {
1642 			/* channel is closing */
1643 			continue;
1644 		}
1645 
1646 		ar = na->na_arena;
1647 		arn = skmem_arena_nexus(ar);
1648 
1649 		AR_LOCK(ar);
1650 		if (arn->arn_flowadv_obj == NULL) {
1651 			ASSERT(ar->ar_flags & ARF_DEFUNCT);
1652 			AR_UNLOCK(ar);
1653 			continue;
1654 		}
1655 		actual_space += sfa_size;
1656 		/* fill out flowadv_table info */
1657 		if (out != NULL && actual_space <= len) {
1658 			uuid_copy(sfa->sfa_nx_uuid, fsw->fsw_nx->nx_uuid);
1659 			(void) strlcpy(sfa->sfa_if_name,
1660 			    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
1661 			sfa->sfa_owner_pid = ch->ch_pid;
1662 			sfa->sfa_entries_count = 0;
1663 		}
1664 
1665 		/* fill out flowadv_entries */
1666 		sfae = &sfa->sfa_entries[0];
1667 		for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
1668 			fae = &arn->arn_flowadv_obj[fae_idx];
1669 			if (!uuid_is_null(fae->fae_id)) {
1670 				actual_space += sfae_size;
1671 				if (out == NULL || actual_space > len) {
1672 					continue;
1673 				}
1674 
1675 				/* fill out entry */
1676 				uuid_copy(sfae->sfae_flow_id, fae->fae_id);
1677 				sfae->sfae_flags = fae->fae_flags;
1678 				sfae++;
1679 				sfa->sfa_entries_count++;
1680 			}
1681 		}
1682 		sfa = (struct sk_stats_flow_adv *)
1683 		    ((uintptr_t)out + actual_space);
1684 		AR_UNLOCK(ar);
1685 	}
1686 
1687 	return actual_space;
1688 }
1689 
1690 static inline void
1691 fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1692     struct sk_stats_flow_owner *sfo)
1693 {
1694 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1695 
1696 	uuid_copy(sfo->sfo_nx_uuid, fsw->fsw_nx->nx_uuid);
1697 	(void) strlcpy(sfo->sfo_if_name, fsw->fsw_flow_mgr->fm_name,
1698 	    IFNAMSIZ);
1699 	sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1700 
1701 	(void) snprintf(sfo->sfo_name, sizeof(sfo->sfo_name), "%s",
1702 	    fo->fo_name);
1703 	sfo->sfo_pid = fo->fo_pid;
1704 	sfo->sfo_nx_port = fo->fo_nx_port;
1705 	sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1706 	sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1707 }
1708 
1709 static size_t
1710 fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
1711     struct nexus_mib_filter *filter, void *out, size_t len)
1712 {
1713 #pragma unused(filter)
1714 	uint32_t i;
1715 	size_t actual_space = 0;
1716 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1717 	struct sk_stats_flow_owner *sfo = out;
1718 	size_t sfo_size = sizeof(struct sk_stats_flow_owner);
1719 	struct flow_owner *fo;
1720 
1721 	FSW_LOCK_ASSERT_HELD(fsw);
1722 
1723 	/*
1724 	 * Ideally we'd like to hide the bucket-level details from the flow
1725 	 * library user, but there is no simple way to iterate flow_owner with
1726 	 * nested buckets/RB_TREEs, so keep it as is.
1727 	 */
1728 	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
1729 		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
1730 		FOB_LOCK(fob);
1731 		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1732 			actual_space += sfo_size;
1733 			if (out == NULL || actual_space > len) {
1734 				continue;
1735 			}
1736 
1737 			fsw_fo2sfo(fsw, fo, sfo);
1738 			sfo++;
1739 		}
1740 		FOB_UNLOCK(fob);
1741 	}
1742 
1743 	return actual_space;
1744 }
1745 
1746 static inline void
1747 fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
1748     struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
1749 {
1750 	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
1751 	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
1752 	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
1753 	    IFNAMSIZ);
1754 
1755 	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
1756 	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;
1757 
1758 	if (fr->fr_flags & FLOWRTF_ATTACHED) {
1759 		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
1760 	}
1761 	if (fr->fr_flags & FLOWRTF_ONLINK) {
1762 		sfr->sfr_flags |= SFLOWRTF_ONLINK;
1763 	}
1764 	if (fr->fr_flags & FLOWRTF_GATEWAY) {
1765 		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
1766 	}
1767 	if (fr->fr_flags & FLOWRTF_RESOLVED) {
1768 		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
1769 	}
1770 	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
1771 		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
1772 	}
1773 	if (fr->fr_flags & FLOWRTF_DELETED) {
1774 		sfr->sfr_flags |= SFLOWRTF_DELETED;
1775 	}
1776 	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
1777 		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
1778 	}
1779 	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
1780 		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
1781 	}
1782 
1783 	lck_spin_lock(&fr->fr_reflock);
1784 	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
1785 	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
1786 	if (fr->fr_expire != 0) {
1787 		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
1788 	} else {
1789 		sfr->sfr_expire = 0;
1790 	}
1791 	lck_spin_unlock(&fr->fr_reflock);
1792 
1793 	sfr->sfr_laddr = fr->fr_laddr;
1794 	sfr->sfr_faddr = fr->fr_faddr;
1795 	sfr->sfr_gaddr = fr->fr_gaddr;
1796 
1797 	if (ll_scrub) {
1798 		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
1799 		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
1800 	} else {
1801 		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
1802 		    ETHER_ADDR_LEN);
1803 	}
1804 }
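
/*
 * The flag-by-flag FLOWRTF_* -> SFLOWRTF_* translation above keeps the
 * kernel-internal and exported flag namespaces decoupled.  A table-driven
 * equivalent (a sketch, not the code used here) would look like:
 *
 *	static const struct {
 *		uint32_t fr;	// FLOWRTF_* bit
 *		uint32_t sfr;	// corresponding SFLOWRTF_* bit
 *	} fr2sfr_map[] = {
 *		{ FLOWRTF_ATTACHED, SFLOWRTF_ATTACHED },
 *		{ FLOWRTF_ONLINK, SFLOWRTF_ONLINK },
 *		{ FLOWRTF_GATEWAY, SFLOWRTF_GATEWAY },
 *		// ... remaining pairs follow the same pattern
 *	};
 *	for (size_t i = 0; i < sizeof(fr2sfr_map) / sizeof(fr2sfr_map[0]); i++) {
 *		if (fr->fr_flags & fr2sfr_map[i].fr) {
 *			sfr->sfr_flags |= fr2sfr_map[i].sfr;
 *		}
 *	}
 */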
1805 
1806 #if CONFIG_MACF
1807 extern int dlil_lladdr_ckreq;
1808 #endif /* CONFIG_MACF */
1809 
1810 static size_t
1811 fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
1812     struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
1813 {
1814 #pragma unused(filter)
1815 	uint32_t i;
1816 	size_t actual_space = 0;
1817 	struct flow_mgr *fm = fsw->fsw_flow_mgr;
1818 	struct sk_stats_flow_route *sfr = out;
1819 	size_t sfr_size = sizeof(struct sk_stats_flow_route);
1820 	struct flow_route *fr;
1821 	boolean_t ll_scrub;
1822 
1823 	FSW_LOCK_ASSERT_HELD(fsw);
1824 
1825 	/*
1826 	 * To get the link-layer info, the caller must have the following
1827 	 * in their sandbox profile (or not be sandboxed at all), else we
1828 	 * scrub it clean just like dlil_ifaddr_bytes() does:
1829 	 *
1830 	 * (allow system-info (info-type "net.link.addr"))
1831 	 *
1832 	 * If scrubbed, we return 02:00:00:00:00:00.
1833 	 */
1834 #if CONFIG_MACF
1835 	ll_scrub = (dlil_lladdr_ckreq &&
1836 	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
1837 #else /* !CONFIG_MACF */
1838 	ll_scrub = FALSE;
1839 #endif /* !CONFIG_MACF */
1840 
1841 	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
1842 		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
1843 		FRB_RLOCK(frb);
1844 		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
1845 			actual_space += sfr_size;
1846 			if (out == NULL || actual_space > len) {
1847 				continue;
1848 			}
1849 
1850 			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
1851 			sfr++;
1852 		}
1853 		FRB_UNLOCK(frb);
1854 	}
1855 
1856 	return actual_space;
1857 }
1858 
1859 static inline void
1860 fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
1861     pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
1862 {
1863 	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
1864 	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
1865 	    IFNAMSIZ);
1866 	sus->sus_owner_pid = pid;
1867 
1868 	if (filter->nmf_type & NXMIB_IP_STATS) {
1869 		sus->sus_ip  = nxs->nxs_ipstat;
1870 	}
1871 
1872 	if (filter->nmf_type & NXMIB_IP6_STATS) {
1873 		sus->sus_ip6 = nxs->nxs_ip6stat;
1874 	}
1875 
1876 	if (filter->nmf_type & NXMIB_TCP_STATS) {
1877 		sus->sus_tcp = nxs->nxs_tcpstat;
1878 	}
1879 
1880 	if (filter->nmf_type & NXMIB_UDP_STATS) {
1881 		sus->sus_udp = nxs->nxs_udpstat;
1882 	}
1883 
1884 	if (filter->nmf_type & NXMIB_QUIC_STATS) {
1885 		sus->sus_quic = nxs->nxs_quicstat;
1886 	}
1887 }
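
/*
 * fsw_nxs2nus() copies only the stat groups selected by the caller's
 * filter, so a struct sk_stats_userstack may be only partially filled.
 * For example (hypothetical filter setup), a caller interested in just
 * TCP and UDP would set:
 *
 *	filter.nmf_type = NXMIB_TCP_STATS | NXMIB_UDP_STATS;
 *
 * and the IP, IPv6 and QUIC members of the output record stay untouched.
 */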
1888 
1889 static size_t
1890 fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
1891     struct nexus_mib_filter *filter, void *out, size_t len)
1892 {
1893 	size_t actual_space = 0;
1894 	struct kern_channel *ch;
1895 	struct __nx_stats_fsw *nxs;
1896 	struct sk_stats_userstack *sus = out;
1897 	size_t sus_size = sizeof(struct sk_stats_userstack);
1898 
1899 	SK_LOCK_ASSERT_HELD();
1900 
1901 	/* copyout saved stats from closed ports (unless filtering on a nonzero pid) */
1902 	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
1903 	    (filter->nmf_pid == 0)) ||
1904 	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
1905 		actual_space += sus_size;
1906 		if (out != NULL && actual_space <= len) {
1907 			nxs = fsw->fsw_closed_na_stats;
1908 			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
1909 			sus++;
1910 		}
1911 	}
1912 
1913 	/*
1914 	 * XXX Currently a proc opens only one channel to a nexus, so we don't
1915 	 * do per-proc aggregation of inet stats for now, as that needs a lot of code
1916 	 */
1917 	/* copyout per process stats */
1918 	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
1919 		struct skmem_arena *ar;
1920 		struct nexus_adapter *na;
1921 
1922 		/* ch_lock isn't needed here since sk_lock is held */
1923 		if ((ch->ch_flags & CHANF_CLOSING) ||
1924 		    (na = ch->ch_na) == NULL) {
1925 			/* channel is closing */
1926 			continue;
1927 		}
1928 
1929 		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
1930 		    filter->nmf_pid != ch->ch_pid) {
1931 			continue;
1932 		}
1933 
1934 		ar = na->na_arena;
1935 
1936 		AR_LOCK(ar);
1937 		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
1938 		if (nxs == NULL) {
1939 			ASSERT(ar->ar_flags & ARF_DEFUNCT);
1940 			AR_UNLOCK(ar);
1941 			continue;
1942 		}
1943 
1944 		actual_space += sus_size;
1945 		if (out == NULL || actual_space > len) {
1946 			AR_UNLOCK(ar);
1947 			continue;
1948 		}
1949 
1950 		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
1951 		sus++;
1952 		AR_UNLOCK(ar);
1953 	}
1954 
1955 	return actual_space;
1956 }
1957 
1958 static size_t
1959 fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
1960 {
1961 	struct sk_stats_flow_switch *sfs = out;
1962 	size_t actual_space = sizeof(struct sk_stats_flow_switch);
1963 
1964 	if (out != NULL && actual_space <= len) {
1965 		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
1966 		(void) strlcpy(sfs->sfs_if_name,
1967 		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
1968 		sfs->sfs_fsws = fsw->fsw_stats;
1969 	}
1970 
1971 	return actual_space;
1972 }
1973 
1974 size_t
1975 fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
1976     void *out, size_t len, struct proc *p)
1977 {
1978 	size_t ret;
1979 
1980 	switch (filter->nmf_type) {
1981 	case NXMIB_FSW_STATS:
1982 		ret = fsw_mib_get_stats(fsw, out, len);
1983 		break;
1984 	case NXMIB_FLOW:
1985 		ret = fsw_mib_get_flow(fsw, filter, out, len);
1986 		break;
1987 	case NXMIB_FLOW_OWNER:
1988 		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
1989 		break;
1990 	case NXMIB_FLOW_ROUTE:
1991 		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
1992 		break;
1993 	case NXMIB_TCP_STATS:
1994 	case NXMIB_UDP_STATS:
1995 	case NXMIB_IP_STATS:
1996 	case NXMIB_IP6_STATS:
1997 	case NXMIB_USERSTACK_STATS:
1998 		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
1999 		break;
2000 	case NXMIB_FLOW_ADV:
2001 		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
2002 		break;
2003 	default:
2004 		ret = 0;
2005 		break;
2006 	}
2007 
2008 	return ret;
2009 }
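
/*
 * Dispatch note: the per-protocol types (NXMIB_TCP_STATS, NXMIB_UDP_STATS,
 * NXMIB_IP_STATS, NXMIB_IP6_STATS) all funnel into
 * fsw_mib_get_userstack_stats(), which reuses the same bits via
 * fsw_nxs2nus() to pick the stat groups to copy; an unrecognized type
 * reports a required size of 0, i.e. "nothing to copy" rather than an
 * error.
 */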
2010 
2011 void
2012 fsw_fold_stats(struct nx_flowswitch *fsw,
2013     void *data, nexus_stats_type_t type)
2014 {
2015 	ASSERT(data != NULL);
2016 	FSW_LOCK_ASSERT_HELD(fsw);
2017 
2018 	switch (type) {
2019 	case NEXUS_STATS_TYPE_FSW:
2020 	{
2021 		struct __nx_stats_fsw *d, *s;
2022 		d = fsw->fsw_closed_na_stats;
2023 		s = data;
2024 		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
2025 		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
2026 		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
2027 		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
2028 		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
2029 		break;
2030 	}
2031 	case NEXUS_STATS_TYPE_CHAN_ERRORS:
2032 	{
2033 		struct __nx_stats_channel_errors *s = data;
2034 		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
2035 		break;
2036 	}
2037 	default:
2038 		VERIFY(0);
2039 		/* NOTREACHED */
2040 		__builtin_unreachable();
2041 	}
2042 }
2043 
2044 boolean_t
2045 fsw_detach_barrier_add(struct nx_flowswitch *fsw)
2046 {
2047 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2048 	if (__improbable(fsw->fsw_detach_flags != 0 ||
2049 	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
2050 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2051 		return FALSE;
2052 	}
2053 	fsw->fsw_detach_barriers++;
2054 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2055 
2056 	return TRUE;
2057 }
2058 
2059 void
2060 fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
2061 {
2062 	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
2063 	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
2064 	ASSERT(fsw->fsw_detach_barriers != 0);
2065 	fsw->fsw_detach_barriers--;
2066 	/* if there's a thread waiting to detach the interface, let it know */
2067 	if (__improbable((fsw->fsw_detach_waiters > 0) &&
2068 	    (fsw->fsw_detach_barriers == 0))) {
2069 		fsw->fsw_detach_waiters = 0;
2070 		wakeup(&fsw->fsw_detach_waiters);
2071 	}
2072 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2073 }
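
/*
 * Typical detach-barrier usage (a sketch; the error code is the
 * caller's choice): threads that need fsw->fsw_ifp or the netagent
 * session to stay alive bracket the access with the add/remove pair:
 *
 *	if (!fsw_detach_barrier_add(fsw)) {
 *		return ENXIO;	// detaching or already detached
 *	}
 *	// ... safe to use fsw->fsw_ifp / fsw->fsw_agent_session ...
 *	fsw_detach_barrier_remove(fsw);
 *
 * fsw_detach() below waits for fsw_detach_barriers to drain before
 * tearing down, so the bracketed region cannot race with the teardown.
 */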
2074 
2075 /*
2076  * Generic resolver for non-Ethernet interfaces.
2077  */
2078 int
2079 fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
2080     struct __kern_packet *pkt)
2081 {
2082 #pragma unused(pkt)
2083 #if SK_LOG
2084 	char dst_s[MAX_IPv6_STR_LEN];
2085 #endif /* SK_LOG */
2086 	struct ifnet *ifp = fsw->fsw_ifp;
2087 	struct rtentry *tgt_rt = NULL;
2088 	int err = 0;
2089 
2090 	ASSERT(fr != NULL);
2091 	ASSERT(ifp != NULL);
2092 
2093 	FR_LOCK(fr);
2094 	/*
2095 	 * If the destination is on-link, we use the final destination
2096 	 * address as target.  If it's off-link, we use the gateway
2097 	 * address instead.  Point tgt_rt to the destination or
2098 	 * gateway route accordingly.
2099 	 */
2100 	if (fr->fr_flags & FLOWRTF_ONLINK) {
2101 		tgt_rt = fr->fr_rt_dst;
2102 	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
2103 		tgt_rt = fr->fr_rt_gw;
2104 	}
2105 
2106 	/*
2107 	 * Perform another routing table lookup if necessary.
2108 	 */
2109 	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
2110 	    fr->fr_want_configure) {
2111 		if (fr->fr_want_configure == 0) {
2112 			atomic_add_32(&fr->fr_want_configure, 1);
2113 		}
2114 		err = flow_route_configure(fr, ifp, NULL);
2115 		if (err != 0) {
2116 			SK_ERR("failed to configure route to %s on %s (err %d)",
2117 			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
2118 			    sizeof(dst_s)), ifp->if_xname, err);
2119 			goto done;
2120 		}
2121 
2122 		/* refresh pointers */
2123 		if (fr->fr_flags & FLOWRTF_ONLINK) {
2124 			tgt_rt = fr->fr_rt_dst;
2125 		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
2126 			tgt_rt = fr->fr_rt_gw;
2127 		}
2128 	}
2129 
2130 	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
2131 		err = EHOSTUNREACH;
2132 		SK_ERR("invalid route for %s on %s (err %d)",
2133 		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
2134 		    sizeof(dst_s)), ifp->if_xname, err);
2135 		goto done;
2136 	}
2137 
2138 	ASSERT(tgt_rt != NULL);
2139 
2140 done:
2141 	if (__probable(err == 0)) {
2142 		/*
2143 		 * There's no actual resolution taking place here, so just
2144 		 * mark it with FLOWRTF_RESOLVED for consistency.
2145 		 */
2146 		atomic_bitset_32(&fr->fr_flags, FLOWRTF_RESOLVED);
2147 		atomic_set_32(&fr->fr_want_probe, 0);
2148 	} else {
2149 		atomic_bitclear_32(&fr->fr_flags, FLOWRTF_RESOLVED);
2150 		flow_route_cleanup(fr);
2151 	}
2152 	FR_UNLOCK(fr);
2153 
2154 	return err;
2155 }
2156 
2157 static void
2158 fsw_mem_init(void)
2159 {
2160 	ASSERT(skmem_tag_fsw_ports == NULL);
2161 	skmem_tag_fsw_ports =
2162 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_PORTS, 0);
2163 	ASSERT(skmem_tag_fsw_ports != NULL);
2164 
2165 	ASSERT(skmem_tag_fsw_fob_hash == NULL);
2166 	skmem_tag_fsw_fob_hash =
2167 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_FOB_HASH, 0);
2168 	ASSERT(skmem_tag_fsw_fob_hash != NULL);
2169 
2170 	ASSERT(skmem_tag_fsw_frb_hash == NULL);
2171 	skmem_tag_fsw_frb_hash =
2172 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_FRB_HASH, 0);
2173 	ASSERT(skmem_tag_fsw_frb_hash != NULL);
2174 
2175 	ASSERT(skmem_tag_fsw_frib_hash == NULL);
2176 	skmem_tag_fsw_frib_hash =
2177 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_FRIB_HASH, 0);
2178 	ASSERT(skmem_tag_fsw_frib_hash != NULL);
2179 
2180 	ASSERT(skmem_tag_fsw_frag_mgr == NULL);
2181 	skmem_tag_fsw_frag_mgr =
2182 	    kern_allocation_name_allocate(SKMEM_TAG_FSW_FRAG_MGR, 0);
2183 	ASSERT(skmem_tag_fsw_frag_mgr != NULL);
2184 }
2185 
2186 void
2187 fsw_init(void)
2188 {
2189 	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
2190 	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);
2191 
2192 	if (!__nx_fsw_inited) {
2193 		fsw_mem_init();
2194 
2195 		/*
2196 		 * Register callbacks for interface & protocol events.
2197 		 * Use a dummy arg for the callback cookie.
2198 		 */
2199 		__nx_fsw_ifnet_eventhandler_tag =
2200 		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
2201 		    ifnet_event, fsw_ifnet_event_callback,
2202 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2203 		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);
2204 
2205 		__nx_fsw_protoctl_eventhandler_tag =
2206 		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
2207 		    protoctl_event, fsw_protoctl_event_callback,
2208 		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
2209 		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
2210 		__nx_fsw_inited = 1;
2211 	}
2212 }
2213 
2214 static void
2215 fsw_mem_uninit(void)
2216 {
2217 	if (skmem_tag_fsw_ports != NULL) {
2218 		kern_allocation_name_release(skmem_tag_fsw_ports);
2219 		skmem_tag_fsw_ports = NULL;
2220 	}
2221 	if (skmem_tag_fsw_fob_hash != NULL) {
2222 		kern_allocation_name_release(skmem_tag_fsw_fob_hash);
2223 		skmem_tag_fsw_fob_hash = NULL;
2224 	}
2225 	if (skmem_tag_fsw_frb_hash != NULL) {
2226 		kern_allocation_name_release(skmem_tag_fsw_frb_hash);
2227 		skmem_tag_fsw_frb_hash = NULL;
2228 	}
2229 	if (skmem_tag_fsw_frib_hash != NULL) {
2230 		kern_allocation_name_release(skmem_tag_fsw_frib_hash);
2232 		skmem_tag_fsw_frib_hash = NULL;
2233 	}
2234 	if (skmem_tag_fsw_frag_mgr != NULL) {
2235 		kern_allocation_name_release(skmem_tag_fsw_frag_mgr);
2236 		skmem_tag_fsw_frag_mgr = NULL;
2237 	}
2238 }
2239 
2240 void
2241 fsw_uninit(void)
2242 {
2243 	if (__nx_fsw_inited) {
2244 		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
2245 		    __nx_fsw_ifnet_eventhandler_tag);
2246 		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
2247 		    __nx_fsw_protoctl_eventhandler_tag);
2248 		fsw_mem_uninit();
2249 
2250 		__nx_fsw_inited = 0;
2251 	}
2252 }
2253 
2254 struct nx_flowswitch *
2255 fsw_alloc(zalloc_flags_t how)
2256 {
2257 	struct nx_flowswitch *fsw;
2258 	struct __nx_stats_fsw *nsfw;
2259 
2260 	SK_LOCK_ASSERT_HELD();
2261 
2262 	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
2263 	if (nsfw == NULL) {
2264 		return NULL;
2265 	}
2266 
2267 	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
2268 	if (fsw == NULL) {
2269 		zfree(nx_fsw_stats_zone, nsfw);
2270 		return NULL;
2271 	}
2272 
2273 	FSW_RWINIT(fsw);
2274 	fsw->fsw_dev_ch = NULL;
2275 	fsw->fsw_host_ch = NULL;
2276 	fsw->fsw_closed_na_stats = nsfw;
2277 
2278 	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
2279 
2280 	return fsw;
2281 }
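
/*
 * Allocation-order note: the stats block is allocated before the fsw
 * object so that a failure of the second zalloc unwinds with a single
 * zfree(); both use Z_ZERO, so every field not explicitly initialized
 * above starts out zeroed.
 */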
2282 
2283 static int
2284 fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
2285     boolean_t purge)
2286 {
2287 	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
2288 	boolean_t do_dtor = FALSE;
2289 
2290 	SK_LOCK_ASSERT_HELD();
2291 
2292 	/*
2293 	 * return an error if the host port detach is in progress
2294 	 * or already detached.
2295 	 * For the case of flowswitch free (i.e. purge is TRUE) we have to
2296 	 * cleanup everything, so we will block if needed.
2297 	 */
2298 	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
2299 	if (!purge && fsw->fsw_detach_flags != 0) {
2300 		SK_ERR("fsw detaching");
2301 		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2302 		return EBUSY;
2303 	}
2304 	VERIFY(purge || fsw->fsw_detach_flags == 0);
2305 	/*
2306 	 * mark the flowswitch as detaching and release sk_lock while
2307 	 * waiting for other threads to exit. Maintain lock/unlock
2308 	 * ordering between the two locks.
2309 	 */
2310 	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
2311 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2312 	SK_UNLOCK();
2313 
2314 	/*
2315 	 * wait until all threads needing accesses to the flowswitch
2316 	 * netagent get out, and mark this as detached to prevent
2317 	 * further access requests from being admitted.
2318 	 */
2319 	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
2320 	while (fsw->fsw_detach_barriers != 0) {
2321 		fsw->fsw_detach_waiters++;
2322 		(void) msleep(&fsw->fsw_detach_waiters,
2323 		    &fsw->fsw_detach_barrier_lock,
2324 		    (PZERO + 1), __FUNCTION__, NULL);
2325 	}
2326 	VERIFY(fsw->fsw_detach_barriers == 0);
2327 	VERIFY(fsw->fsw_detach_flags != 0);
2328 	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
2329 	/*
2330 	 * if both the NA detach thread and the flowswitch free thread were
2331 	 * waiting, then whichever thread wins the race is responsible
2332 	 * for doing the dtor work.
2333 	 */
2334 	if (fsw->fsw_detach_flags == 0) {
2335 		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
2336 		do_dtor = TRUE;
2337 	}
2338 	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
2339 	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
2340 	SK_LOCK();
2341 
2342 	FSW_WLOCK(fsw);
2343 	if (do_dtor) {
2344 		if (fsw->fsw_ifp != NULL) {
2345 			fsw_teardown_ifp(fsw, hwna);
2346 			ASSERT(fsw->fsw_ifp == NULL);
2347 			ASSERT(fsw->fsw_nifna == NULL);
2348 		}
2349 		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
2350 		nx_prov->nxprov_params->nxp_ifindex = 0;
2351 		/* free any flow entries in the deferred list */
2352 		fsw_linger_purge(fsw);
2353 	}
2354 	/*
2355 	 * If we are destroying the instance, release the lock to let all
2356 	 * outstanding agent threads enter, then wait until all of them
2357 	 * have exited the critical section before continuing.
2358 	 */
2359 	if (purge) {
2360 		FSW_UNLOCK(fsw);
2361 		flow_mgr_terminate(fsw->fsw_flow_mgr);
2362 		FSW_WLOCK(fsw);
2363 	}
2364 	FSW_WUNLOCK(fsw);
2365 	return 0;
2366 }
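
/*
 * fsw_detach() in brief: (1) set FSW_DETACHF_DETACHING under the barrier
 * lock, refusing concurrent non-purge detaches with EBUSY; (2) drop
 * sk_lock and sleep until fsw_detach_barriers drains; (3) the first
 * thread to find fsw_detach_flags cleared sets FSW_DETACHF_DETACHED and
 * does the dtor work, so a racing NA detach and flowswitch free perform
 * that work exactly once.
 */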
2367 
2368 void
2369 fsw_free(struct nx_flowswitch *fsw)
2370 {
2371 	int err;
2372 
2373 	SK_LOCK_ASSERT_HELD();
2374 	ASSERT(fsw != NULL);
2375 
2376 	err = fsw_detach(fsw, NULL, TRUE);
2377 	VERIFY(err == 0);
2378 
2379 	fsw_dp_dtor(fsw);
2380 
2381 	ASSERT(fsw->fsw_dev_ch == NULL);
2382 	ASSERT(fsw->fsw_host_ch == NULL);
2383 	ASSERT(fsw->fsw_closed_na_stats != NULL);
2384 	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
2385 	fsw->fsw_closed_na_stats = NULL;
2386 	FSW_RWDESTROY(fsw);
2387 
2388 	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
2389 	zfree(nx_fsw_zone, fsw);
2390 }
2391