xref: /xnu-8020.140.41/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 
/*
 * This module implements the flow switch for Skywalk.
 *
 * --- FLOW SWITCH ---
 *
 * For each switch, a lock protects deletion of ports.  When configuring
 * a new port or deleting an existing one, the lock is acquired in
 * exclusive mode (after holding SK_LOCK).  When forwarding, the lock is
 * acquired in shared mode (without SK_LOCK).  The lock is held throughout
 * the entire forwarding cycle, during which the thread may incur a page
 * fault.  Hence it is important that sleepable shared locks are used.
 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
 * copied from source to destination, and then the lock is acquired again
 * and the receive ring is updated.  (A similar thing is done on the tx
 * ring for NIC and host stack ports attached to the switch.)
 *
 * When a netif is attached to a flowswitch, two kernel channels are opened:
 * the device and host channels.  The device channel provides the device
 * datapath.  The host channel is not used in the datapath; it is there
 * only to provide some callbacks for activating the hostna (e.g.
 * intercepting host packets).
 */
79 
80 #include <net/bpf.h>
81 #include <netinet/tcp_seq.h>
82 #include <skywalk/os_skywalk_private.h>
83 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
84 #include <skywalk/nexus/flowswitch/fsw_var.h>
85 #include <skywalk/nexus/upipe/nx_user_pipe.h>
86 #include <skywalk/nexus/netif/nx_netif.h>
87 #include <skywalk/nexus/nexus_var.h>
88 #include <sys/protosw.h>
89 #include <sys/domain.h>
90 
91 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
92     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");
93 
94 static void nx_fsw_dom_init(struct nxdom *);
95 static void nx_fsw_dom_terminate(struct nxdom *);
96 static void nx_fsw_dom_fini(struct nxdom *);
97 static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
98 static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
99     struct nxbind *, void *);
100 static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
101 static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
102     struct kern_nexus *, struct kern_channel *, struct chreq *,
103     struct kern_channel *, struct nxbind *, struct proc *);
104 static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
105     struct kern_nexus *, struct kern_channel *);
106 static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
107     struct kern_nexus *, struct kern_channel *, struct proc *);
108 static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
109     struct kern_nexus *, struct kern_channel *, boolean_t);
110 
111 static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
112 static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
113     const struct nxprov_params *, struct nxprov_adjusted_params *);
114 static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
115     const uint32_t, const struct nxprov_params *, struct nxprov_params *,
116     struct skmem_region_params[SKMEM_REGIONS]);
117 static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
118     struct kern_nexus *, struct nexus_adapter *);
119 static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
120     struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
121     kauth_cred_t);
122 static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
123 static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
124 static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
125 static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
126     struct nexus_mib_filter *, void *, size_t, struct proc *);
127 
128 struct nxdom nx_flowswitch_dom_s = {
129 	.nxdom_prov_head =
130     STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
131 	.nxdom_type =           NEXUS_TYPE_FLOW_SWITCH,
132 	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
133 	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
134 	.nxdom_name =           "flowswitch",
135 	.nxdom_ports = {
136 		.nb_def = NX_FSW_VP_MAX,
137 		.nb_min = NX_FSW_VP_MIN,
138 		.nb_max = NX_FSW_VP_MAX,
139 	},
140 	.nxdom_tx_rings = {
141 		.nb_def = 1,
142 		.nb_min = 1,
143 		.nb_max = NX_FSW_MAXRINGS,
144 	},
145 	.nxdom_rx_rings = {
146 		.nb_def = 1,
147 		.nb_min = 1,
148 		.nb_max = NX_FSW_MAXRINGS,
149 	},
150 	.nxdom_tx_slots = {
151 		.nb_def = NX_FSW_TXRINGSIZE,
152 		.nb_min = NX_FSW_MINSLOTS,
153 		.nb_max = NX_FSW_MAXSLOTS,
154 	},
155 	.nxdom_rx_slots = {
156 		.nb_def = NX_FSW_RXRINGSIZE,
157 		.nb_min = NX_FSW_MINSLOTS,
158 		.nb_max = NX_FSW_MAXSLOTS,
159 	},
160 	.nxdom_buf_size = {
161 		.nb_def = NX_FSW_BUFSIZE,
162 		.nb_min = NX_FSW_MINBUFSIZE,
163 		.nb_max = NX_FSW_MAXBUFSIZE,
164 	},
165 	.nxdom_meta_size = {
166 		.nb_def = NX_FSW_UMD_SIZE,
167 		.nb_min = NX_FSW_UMD_SIZE,
168 		.nb_max = NX_METADATA_USR_MAX_SZ,
169 	},
170 	.nxdom_stats_size = {
171 		.nb_def = 0,
172 		.nb_min = 0,
173 		.nb_max = NX_STATS_MAX_SZ,
174 	},
175 	.nxdom_pipes = {
176 		.nb_def = 0,
177 		.nb_min = 0,
178 		.nb_max = NX_UPIPE_MAXPIPES,
179 	},
180 	.nxdom_flowadv_max = {
181 		.nb_def = 0,
182 		.nb_min = 0,
183 		.nb_max = NX_FLOWADV_MAX,
184 	},
185 	.nxdom_nexusadv_size = {
186 		.nb_def = 0,
187 		.nb_min = 0,
188 		.nb_max = NX_NEXUSADV_MAX_SZ,
189 	},
190 	.nxdom_capabilities = {
191 		.nb_def = NXPCAP_USER_CHANNEL,
192 		.nb_min = 0,
193 		.nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
194     NXPCAP_USER_CHANNEL),
195 	},
196 	.nxdom_qmap = {
197 		.nb_def = NEXUS_QMAP_TYPE_INVALID,
198 		.nb_min = NEXUS_QMAP_TYPE_INVALID,
199 		.nb_max = NEXUS_QMAP_TYPE_INVALID,
200 	},
201 	.nxdom_max_frags = {
202 		.nb_def = NX_PBUF_FRAGS_DEFAULT,
203 		.nb_min = NX_PBUF_FRAGS_MIN,
204 		.nb_max = NX_PBUF_FRAGS_MAX,
205 	},
206 	.nxdom_init =           nx_fsw_dom_init,
207 	.nxdom_terminate =      nx_fsw_dom_terminate,
208 	.nxdom_fini =           nx_fsw_dom_fini,
209 	.nxdom_connect =        nx_fsw_dom_connect,
210 	.nxdom_find_port =      nx_fsw_dom_find_port,
211 	.nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
212 	.nxdom_bind_port =      nx_fsw_dom_bind_port,
213 	.nxdom_unbind_port =    nx_fsw_dom_unbind_port,
214 	.nxdom_disconnect =     nx_fsw_dom_disconnect,
215 	.nxdom_defunct =        nx_fsw_dom_defunct,
216 	.nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
217 };
218 
219 struct kern_nexus_domain_provider nx_fsw_prov_s = {
220 	.nxdom_prov_name =              NEXUS_PROVIDER_FLOW_SWITCH,
221 	.nxdom_prov_flags =             NXDOMPROVF_DEFAULT,
222 	.nxdom_prov_cb = {
223 		.dp_cb_init =           nx_fsw_prov_init,
224 		.dp_cb_fini =           nx_fsw_prov_fini,
225 		.dp_cb_params =         nx_fsw_prov_params,
226 		.dp_cb_mem_new =        nx_fsw_prov_mem_new,
227 		.dp_cb_config =         nx_fsw_prov_config,
228 		.dp_cb_nx_ctor =        nx_fsw_prov_nx_ctor,
229 		.dp_cb_nx_dtor =        nx_fsw_prov_nx_dtor,
230 		.dp_cb_nx_mem_info =    NULL,   /* not supported */
231 		.dp_cb_nx_mib_get =     nx_fsw_prov_mib_get,
232 		.dp_cb_nx_stop =        NULL,
233 	},
234 };
235 
236 
237 static void
nx_fsw_dom_init(struct nxdom * nxdom)238 nx_fsw_dom_init(struct nxdom *nxdom)
239 {
240 	SK_LOCK_ASSERT_HELD();
241 	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
242 
243 	/* Generic initialization */
244 	fsw_init();
245 	fsw_dp_init();
246 
247 	(void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
248 }
249 
250 static void
nx_fsw_dom_terminate(struct nxdom * nxdom)251 nx_fsw_dom_terminate(struct nxdom *nxdom)
252 {
253 	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
254 
255 	SK_LOCK_ASSERT_HELD();
256 
257 	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
258 	    nxdom_prov_link, tnxdp) {
259 		(void) nxdom_prov_del(nxdom_prov);
260 	}
261 
262 	fsw_dp_uninit();
263 
264 	/* Generic uninitialization */
265 	fsw_uninit();
266 }
267 
/*
 * Domain fini callback (nxdom_fini); the flow switch has nothing to
 * release at this stage, so this is intentionally empty.
 */
static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
	/* nothing to do */
}
273 
274 static int
nx_fsw_prov_init(struct kern_nexus_domain_provider * nxdom_prov)275 nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
276 {
277 #pragma unused(nxdom_prov)
278 	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
279 	return 0;
280 }
281 
282 static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)283 nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
284     const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
285 {
286 #pragma unused(nxdom_prov, nxp)
287 	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
288 	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);
289 
290 	*(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
291 	*(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
292 	VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
293 	*(adj->adj_flowadv_max) = sk_max_flows;
294 	*(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
295 	*(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
296 	if (sk_cksum_tx != 0) {
297 		*(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
298 	}
299 	*(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
300 	    ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
301 	    2 : 1;
302 	*(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
303 	    NX_FSW_AFRINGSIZE;
304 
305 	if (adj->adj_buf_srp->srp_r_seg_size == 0) {
306 		adj->adj_buf_srp->srp_r_seg_size = skmem_usr_buf_seg_size;
307 	}
308 	if (!SKMEM_MEM_CONSTRAINED_DEVICE &&
309 	    (adj->adj_buf_srp->srp_r_seg_size < NX_FSW_BUF_SEG_SIZE)) {
310 		adj->adj_buf_srp->srp_r_seg_size = NX_FSW_BUF_SEG_SIZE;
311 	}
312 
313 	/* enable magazines layer for metadata */
314 	*(adj->adj_md_magazines) = TRUE;
315 
316 	if (*(adj->adj_max_frags) > 1) {
317 		uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE ?
318 		    NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
319 		uint32_t magazine_max_objs;
320 
321 		*(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
322 		    sk_fsw_max_bufs : fsw_maxbufs;
323 
324 		/*
325 		 * Given that packet objects are the ones cached, use the
326 		 * metadata size to determine the extra amount of objects
327 		 * at magazine layer.
328 		 */
329 		magazine_max_objs = skmem_cache_magazine_max(
330 			NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
331 			METADATA_PREAMBLE_SZ);
332 
333 		/*
334 		 * Adjust the max buffers to account for the increase
335 		 * associated with per-CPU caching.
336 		 */
337 		if (skmem_allow_magazines() &&
338 		    magazine_max_objs < *(adj->adj_max_buffers)) {
339 			*(adj->adj_max_buffers) -= magazine_max_objs;
340 		}
341 	}
342 	return 0;
343 }
344 
345 static int
nx_fsw_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS])346 nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
347     const uint32_t req, const struct nxprov_params *nxp0,
348     struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS])
349 {
350 	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
351 
352 	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
353 	           nxdom, nxdom, nxdom, nx_fsw_prov_params_adjust);
354 }
355 
356 static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)357 nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
358     struct kern_nexus *nx, struct nexus_adapter *na)
359 {
360 #pragma unused(nxdom_prov)
361 	int err = 0;
362 	struct skmem_region_params srp_tmp[SKMEM_REGIONS];
363 	struct skmem_region_params *srp = NX_PROV(nx)->nxprov_region_params;
364 
365 	SK_DF(SK_VERB_FSW,
366 	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
367 	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
368 	    SK_KVA(na));
369 
370 	ASSERT(na->na_type == NA_FLOWSWITCH_VP);
371 	ASSERT(na->na_arena == NULL);
372 	ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
373 
374 	srp = srp_tmp;
375 	memcpy(srp, NX_PROV(nx)->nxprov_region_params, sizeof(srp_tmp));
376 	srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
377 	srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
378 	if ((na->na_flags & NAF_LOW_LATENCY) != 0) {
379 		srp[SKMEM_REGION_KMD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
380 		srp[SKMEM_REGION_UMD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
381 		srp[SKMEM_REGION_BUF].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
382 		srp[SKMEM_REGION_RXFKSD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
383 		srp[SKMEM_REGION_RXFUSD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
384 	}
385 
386 	/*
387 	 * Each port in the flow switch is isolated from one another;
388 	 * use NULL for the packet buffer pool references to indicate
389 	 * this, since otherwise we'd be sharing the same pp for the
390 	 * entire switch (maybe for a future, special use case?)
391 	 *
392 	 * This means that clients calling kern_nexus_get_pbufpool()
393 	 * will get NULL, but this is fine based on current design
394 	 * of providing port isolation, and also since we don't expose
395 	 * the flow switch to external kernel clients.
396 	 */
397 	na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
398 	    !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
399 	ASSERT(na->na_arena != NULL || err != 0);
400 
401 	return err;
402 }
403 
404 static int
nx_fsw_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)405 nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
406     struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
407     struct proc *p, kauth_cred_t cred)
408 {
409 #pragma unused(nxdom_prov)
410 	struct sockopt sopt;
411 	int err = 0;
412 
413 	SK_LOCK_ASSERT_HELD();
414 
415 	/* proceed only if the client possesses flow switch entitlement */
416 	if ((err = skywalk_priv_check_cred(p, cred,
417 	    PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
418 		goto done;
419 	}
420 
421 	if (ncr->nc_req == USER_ADDR_NULL) {
422 		err = EINVAL;
423 		goto done;
424 	}
425 
426 	/* to make life easier for handling copies */
427 	bzero(&sopt, sizeof(sopt));
428 	sopt.sopt_dir = sopt_dir;
429 	sopt.sopt_val = ncr->nc_req;
430 	sopt.sopt_valsize = ncr->nc_req_len;
431 	sopt.sopt_p = p;
432 
433 	/* avoid _MALLOCing at the cost of this ugly switch block */
434 	switch (ncr->nc_cmd) {
435 	case NXCFG_CMD_ATTACH:
436 	case NXCFG_CMD_DETACH: {
437 		struct nx_spec_req nsr;
438 
439 		bzero(&nsr, sizeof(nsr));
440 		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
441 		if (err != 0) {
442 			goto done;
443 		}
444 
445 		/*
446 		 * Null-terminate in case this has an interface name;
447 		 * the union is already large enough for uuid_t.
448 		 */
449 		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
450 		if (p != kernproc) {
451 			nsr.nsr_flags &= NXSPECREQ_MASK;
452 		}
453 
454 		err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
455 		if (err != 0) {
456 			goto done;
457 		}
458 
459 		/* XXX: [email protected] -- can this copyout fail? */
460 		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
461 		break;
462 	}
463 
464 	case NXCFG_CMD_FLOW_ADD:
465 	case NXCFG_CMD_FLOW_DEL: {
466 		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
467 		    offsetof(struct nx_flow_req, _nfr_common_field_end));
468 		struct nx_flow_req nfr;
469 
470 		bzero(&nfr, sizeof(nfr));
471 		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
472 		if (err != 0) {
473 			goto done;
474 		}
475 
476 		err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
477 		if (err != 0) {
478 			goto done;
479 		}
480 
481 		/* XXX: [email protected] -- can this copyout fail? */
482 		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
483 		break;
484 	}
485 
486 	case NXCFG_CMD_NETEM: {
487 		struct if_netem_params inp;
488 
489 		bzero(&inp, sizeof(inp));
490 		err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
491 		if (err != 0) {
492 			goto done;
493 		}
494 		err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
495 		if (err != 0) {
496 			goto done;
497 		}
498 		break;
499 	}
500 
501 	default:
502 		err = EINVAL;
503 		goto done;
504 	}
505 
506 done:
507 	SK_DF(err ? SK_VERB_ERROR: SK_VERB_FSW,
508 	    "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
509 	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
510 	return err;
511 }
512 
513 static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)514 nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
515 {
516 #pragma unused(nxdom_prov)
517 	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
518 }
519 
520 static int
nx_fsw_prov_nx_ctor(struct kern_nexus * nx)521 nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
522 {
523 	struct nx_flowswitch *fsw;
524 
525 	SK_LOCK_ASSERT_HELD();
526 
527 	ASSERT(nx->nx_arg == NULL);
528 
529 	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
530 
531 	fsw = fsw_alloc(Z_WAITOK);
532 	nx->nx_arg = fsw;
533 	fsw->fsw_nx = nx;
534 	fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
535 	fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;
536 
537 	FSW_WLOCK(fsw);
538 
539 	fsw_dp_ctor(fsw);
540 
541 	FSW_WUNLOCK(fsw);
542 
543 	SK_D("create new fsw 0x%llx for nexus 0x%llx",
544 	    SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));
545 
546 	return 0;
547 }
548 
549 static void
nx_fsw_prov_nx_dtor(struct kern_nexus * nx)550 nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
551 {
552 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
553 	int err;
554 
555 	SK_LOCK_ASSERT_HELD();
556 
557 	SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
558 	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));
559 
560 	err = fsw_ctl_detach(nx, current_proc(), NULL);
561 	ASSERT(err == 0);       /* this cannot fail */
562 	ASSERT(fsw->fsw_dev_ch == NULL);
563 	ASSERT(fsw->fsw_host_ch == NULL);
564 
565 	SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
566 	fsw_free(fsw);
567 	nx->nx_arg = NULL;
568 }
569 
570 static size_t
nx_fsw_prov_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)571 nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
572     void *out, size_t len, struct proc *p)
573 {
574 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
575 
576 	/* this check doesn't require holding fsw_lock */
577 	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
578 	    (uuid_compare(filter->nmf_nx_uuid,
579 	    fsw->fsw_nx->nx_uuid)) != 0) {
580 		return 0;
581 	}
582 
583 	/* intercept NXMIB_FSW_STATS here since it's for flowswitch */
584 	FSW_RLOCK(fsw);
585 	len = fsw_mib_get(fsw, filter, out, len, p);
586 	FSW_UNLOCK(fsw);
587 
588 	return len;
589 }
590 
591 boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus * nx,nexus_port_t nx_port)592 nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
593 {
594 #pragma unused(nx)
595 	return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
596 }
597 
598 static int
nx_fsw_dom_find_port(struct kern_nexus * nx,boolean_t rsvd,nexus_port_t * nx_port)599 nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
600     nexus_port_t *nx_port)
601 {
602 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
603 	nexus_port_t first, last, port;
604 	int error;
605 
606 	ASSERT(nx_port != NULL);
607 
608 	port = *nx_port;
609 	ASSERT(port == NEXUS_PORT_ANY);
610 
611 	if (rsvd) {
612 		first = 0;
613 		last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
614 	} else {
615 		first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
616 		last = NXDOM_MAX(NX_DOM(nx), ports);
617 	}
618 	ASSERT(first <= last);
619 
620 	FSW_WLOCK(fsw);
621 	if (__improbable(first == last)) {
622 		error = ENOSPC;
623 	} else {
624 		error = nx_port_find(nx, first, last - 1, &port);
625 		ASSERT(error != 0 || (port >= first && port < last));
626 	}
627 	FSW_WUNLOCK(fsw);
628 
629 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
630 	    "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
631 	    nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
632 	    (int)port, first, (last - 1), error);
633 
634 	if (error == 0) {
635 		*nx_port = port;
636 	}
637 
638 	return error;
639 }
640 
641 static int
nx_fsw_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)642 nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
643     struct nxbind *nxb, void *info)
644 {
645 #pragma unused(info)
646 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
647 	nexus_port_t first, last, port;
648 	int error;
649 
650 	ASSERT(nx_port != NULL);
651 	ASSERT(nxb != NULL);
652 
653 	port = *nx_port;
654 
655 	/* can't bind reserved ports to client credentials */
656 	if (nx_fsw_dom_port_is_reserved(nx, port)) {
657 		return EDOM;
658 	}
659 
660 	/*
661 	 * Allow clients to bind to regular ports (non-reserved);
662 	 * reserved ports aren't subject to bind/unbind, since
663 	 * they are used for internal purposes.
664 	 */
665 	first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
666 	last = NXDOM_MAX(NX_DOM(nx), ports);
667 	ASSERT(first <= last);
668 
669 	FSW_WLOCK(fsw);
670 	if (__improbable(first == last)) {
671 		error = ENOSPC;
672 	} else if (port != NEXUS_PORT_ANY) {
673 		error = nx_port_bind(nx, port, nxb);
674 	} else {
675 		error = nx_port_find(nx, first, last - 1, &port);
676 		ASSERT(error != 0 || (port >= first && port < last));
677 		if (error == 0) {
678 			error = nx_port_bind(nx, port, nxb);
679 		}
680 	}
681 	FSW_WUNLOCK(fsw);
682 
683 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
684 	    "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
685 	    nx->nx_prov->nxprov_params->nxp_name, (int)port,
686 	    first, (last - 1), error);
687 
688 	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
689 	if (error == 0) {
690 		*nx_port = port;
691 	}
692 
693 	return error;
694 }
695 
696 static int
nx_fsw_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)697 nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
698 {
699 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
700 	int error;
701 
702 	FSW_WLOCK(fsw);
703 	error = nx_port_unbind(nx, nx_port);
704 	FSW_WUNLOCK(fsw);
705 
706 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
707 	    "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
708 	    nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);
709 
710 	return error;
711 }
712 
713 static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct kern_channel * ch0,struct nxbind * nxb,struct proc * p)714 nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
715     struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
716     struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
717 {
718 #pragma unused(nxdom_prov)
719 	nexus_port_t port = chr->cr_port;
720 	int err = 0;
721 
722 	SK_LOCK_ASSERT_HELD();
723 
724 	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
725 	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
726 	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
727 	ASSERT(!(ch->ch_flags & CHANF_HOST));
728 
729 	if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
730 		err = EDOM;
731 		goto done;
732 	}
733 
734 	if (chr->cr_mode & CHMODE_EVENT_RING) {
735 		SK_ERR("event ring is not supported for flowswitch");
736 		err = ENOTSUP;
737 		goto done;
738 	}
739 
740 	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
741 	if (ch->ch_flags & CHANF_KERNEL) {
742 		uuid_string_t uuidstr;
743 		ASSERT(!uuid_is_null(chr->cr_spec_uuid));
744 		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
745 		    "%s_%llu:%s", NX_FSW_NAME, nx->nx_id,
746 		    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr));
747 		chr->cr_ring_set = RING_SET_DEFAULT;
748 		if (chr->cr_mode & CHMODE_HOST) {
749 			atomic_bitset_32(&ch->ch_flags, CHANF_HOST);
750 		}
751 		err = na_connect_spec(nx, ch, chr, p);
752 	} else {
753 		ASSERT(port != NEXUS_PORT_ANY);
754 		if (chr->cr_mode & CHMODE_HOST) {
755 			/* not allowed unless kernel (special) channel */
756 			err = EINVAL;
757 			goto done;
758 		}
759 		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
760 		    "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
761 		chr->cr_ring_set = RING_SET_DEFAULT;
762 		err = na_connect(nx, ch, chr, ch0, nxb, p);
763 	}
764 
765 done:
766 	return err;
767 }
768 
769 static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)770 nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
771     struct kern_nexus *nx, struct kern_channel *ch)
772 {
773 #pragma unused(nxdom_prov)
774 	SK_LOCK_ASSERT_HELD();
775 
776 	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
777 	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
778 	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
779 
780 	if (ch->ch_flags & CHANF_KERNEL) {
781 		na_disconnect_spec(nx, ch);
782 	} else {
783 		na_disconnect(nx, ch);
784 	}
785 }
786 
787 static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)788 nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
789     struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
790 {
791 #pragma unused(nxdom_prov)
792 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
793 
794 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
795 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
796 	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
797 
798 	/*
799 	 * Hold the flowswitch lock as writer; this prevents all data path
800 	 * accesses to the flowswitch, and allows us to mark the rings with
801 	 * CKRF_DEFUNCT.  Unlike some other nexus types, the flowswitch
802 	 * doesn't utilize kr_{enter,exit} for serialization, at present.
803 	 */
804 	FSW_WLOCK(fsw);
805 	na_ch_rings_defunct(ch, p);
806 	FSW_WUNLOCK(fsw);
807 }
808 
809 static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)810 nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
811     struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
812 {
813 #pragma unused(nxdom_prov)
814 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
815 	int err = 0;
816 
817 	if (!locked) {
818 		SK_LOCK_ASSERT_NOTHELD();
819 		SK_LOCK();
820 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
821 	} else {
822 		SK_LOCK_ASSERT_HELD();
823 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
824 	}
825 
826 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
827 	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
828 	ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);
829 
830 	err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));
831 
832 	if (err == 0) {
833 		na_defunct(nx, ch, ch->ch_na, locked);
834 	}
835 
836 	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
837 	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
838 	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
839 	    ch->ch_info->cinfo_nx_port,
840 	    (int)ch->ch_info->cinfo_ch_ring_id, err);
841 
842 	if (!locked) {
843 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
844 		SK_UNLOCK();
845 	} else {
846 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
847 		SK_LOCK_ASSERT_HELD();
848 	}
849 }
850 
#if SK_LOG
/*
 * Logs the channel request seen by nx_fsw_na_find().  A " (skipped)"
 * suffix is appended when the request name does not start with
 * NX_FSW_NAME.
 *
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;
	const char *suffix = "";

	/* same prefix test that nx_fsw_na_find() applies */
	if (strncmp(chr->cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
		suffix = " (skipped)";
	}

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
	    chr->cr_endpoint, create, suffix);
}
#endif /* SK_LOG */
868 
869 /*
870  * Try to get a reference to a Nexus adapter attached to a flow switch.
871  * If the adapter is found (or is created), this function returns 0, a
872  * non NULL pointer is returned into *na, and the caller holds a
873  * reference to the adapter.
874  * If an adapter is not found, then no reference is grabbed and the
875  * function returns an error code, or 0 if there is just a flow switch prefix
876  * mismatch. Therefore the caller holds a reference when
877  * (*na != NULL && return == 0).
878  */
879 int
nx_fsw_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** na,boolean_t create)880 nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
881     struct chreq *chr, struct nxbind *nxb, struct proc *p,
882     struct nexus_adapter **na, boolean_t create)
883 {
884 #pragma unused(ch)
885 	struct nexus_vp_adapter *vpna = NULL;
886 	char *cr_name = chr->cr_name;
887 	struct nx_flowswitch *fsw;
888 	int error = 0;
889 
890 	SK_LOCK_ASSERT_HELD();
891 	*na = NULL;     /* default return value */
892 
893 #if SK_LOG
894 	if (__improbable(sk_verbose != 0)) {
895 		nx_fsw_na_find_log(chr, create);
896 	}
897 #endif /* SK_LOG */
898 
899 	/* first try to see if this is a flow switch port. */
900 	if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
901 		return 0;  /* no error, but no flow switch prefix */
902 	}
903 	ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
904 	fsw = NX_FSW_PRIVATE(nx);
905 	ASSERT(fsw != NULL);
906 
907 	if (!create) {
908 		return ENXIO;
909 	}
910 
911 	/*
912 	 * The flowswitch VP is only attachable from a user channel so none of
913 	 * these flags should be set.
914 	 */
915 	ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
916 	error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
917 	ASSERT(vpna == NULL || error == 0);
918 
919 	if (error == 0) {
920 		/* use reference held by nx_fsw_attach_vp above */
921 		*na = &vpna->vpna_up;
922 		SK_DF(SK_VERB_FSW,
923 		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
924 		    (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
925 		    cr_name, (int)vpna->vpna_nx_port);
926 	}
927 
928 	return error;
929 }
930 
931 int
nx_fsw_netagent_add(struct kern_nexus * nx)932 nx_fsw_netagent_add(struct kern_nexus *nx)
933 {
934 	return fsw_netagent_add_remove(nx, TRUE);
935 }
936 
937 int
nx_fsw_netagent_remove(struct kern_nexus * nx)938 nx_fsw_netagent_remove(struct kern_nexus *nx)
939 {
940 	return fsw_netagent_add_remove(nx, FALSE);
941 }
942 
/*
 * Public wrapper that forwards to fsw_netagent_update() for this nexus.
 */
void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
	return;
}
948