xref: /xnu-8792.61.2/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  *   1. Redistributions of source code must retain the above copyright
36  *      notice, this list of conditions and the following disclaimer.
37  *   2. Redistributions in binary form must reproduce the above copyright
38  *      notice, this list of conditions and the following disclaimer in the
39  *      documentation and/or other materials provided with the distribution.
40  *
41  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  */
53 
54 
55 /*
56  * This module implements the flow switch for Skywalk
57  *
58  * --- FLOW SWITCH ---
59  *
60  * For each switch, a lock protects deletion of ports. When configuring
61  * or deleting a new port, the lock is acquired in exclusive mode (after
62  * holding SK_LOCK).  When forwarding, the lock is acquired in shared
63  * mode (without SK_LOCK).  The lock is held throughout the entire
64  * forwarding cycle, during which the thread may incur a page fault.
65  * Hence it is important that sleepable shared locks are used.
66  *
67  * On the rx ring, the per-port lock is grabbed initially to reserve
68  * a number of slots in the ring, then the lock is released, packets are
69  * copied from source to destination, and then the lock is acquired again
70  * and the receive ring is updated.  (A similar thing is done on the tx
71  * ring for NIC and host stack ports attached to the switch)
72  *
73  * When a netif is attached to a flowswitch, two kernel channels are opened:
74  * The device and host channels. The device channel provides the device
75  * datapath. The host channel is not used in the datapath. It is there
76  * only for providing some callbacks for activating the hostna (e.g.
77  * intercepting host packets).
78  */
79 
80 #include <net/bpf.h>
81 #include <netinet/tcp_seq.h>
82 #include <skywalk/os_skywalk_private.h>
83 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
84 #include <skywalk/nexus/flowswitch/fsw_var.h>
85 #include <skywalk/nexus/upipe/nx_user_pipe.h>
86 #include <skywalk/nexus/netif/nx_netif.h>
87 #include <skywalk/nexus/nexus_var.h>
88 #include <sys/protosw.h>
89 #include <sys/domain.h>
90 
91 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
92     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");
93 
94 static void nx_fsw_dom_init(struct nxdom *);
95 static void nx_fsw_dom_terminate(struct nxdom *);
96 static void nx_fsw_dom_fini(struct nxdom *);
97 static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
98 static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
99     struct nxbind *, void *);
100 static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
101 static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
102     struct kern_nexus *, struct kern_channel *, struct chreq *,
103     struct kern_channel *, struct nxbind *, struct proc *);
104 static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
105     struct kern_nexus *, struct kern_channel *);
106 static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
107     struct kern_nexus *, struct kern_channel *, struct proc *);
108 static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
109     struct kern_nexus *, struct kern_channel *, boolean_t);
110 
111 static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
112 static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
113     const struct nxprov_params *, struct nxprov_adjusted_params *);
114 static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
115     const uint32_t, const struct nxprov_params *, struct nxprov_params *,
116     struct skmem_region_params[SKMEM_REGIONS], uint32_t);
117 static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
118     struct kern_nexus *, struct nexus_adapter *);
119 static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
120     struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
121     kauth_cred_t);
122 static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
123 static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
124 static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
125 static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
126     struct nexus_mib_filter *, void *, size_t, struct proc *);
127 
128 struct nxdom nx_flowswitch_dom_s = {
129 	.nxdom_prov_head =
130     STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
131 	.nxdom_type =           NEXUS_TYPE_FLOW_SWITCH,
132 	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
133 	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
134 	.nxdom_name =           "flowswitch",
135 	.nxdom_ports = {
136 		.nb_def = NX_FSW_VP_MAX,
137 		.nb_min = NX_FSW_VP_MIN,
138 		.nb_max = NX_FSW_VP_MAX,
139 	},
140 	.nxdom_tx_rings = {
141 		.nb_def = 1,
142 		.nb_min = 1,
143 		.nb_max = NX_FSW_MAXRINGS,
144 	},
145 	.nxdom_rx_rings = {
146 		.nb_def = 1,
147 		.nb_min = 1,
148 		.nb_max = NX_FSW_MAXRINGS,
149 	},
150 	.nxdom_tx_slots = {
151 		.nb_def = NX_FSW_TXRINGSIZE,
152 		.nb_min = NX_FSW_MINSLOTS,
153 		.nb_max = NX_FSW_MAXSLOTS,
154 	},
155 	.nxdom_rx_slots = {
156 		.nb_def = NX_FSW_RXRINGSIZE,
157 		.nb_min = NX_FSW_MINSLOTS,
158 		.nb_max = NX_FSW_MAXSLOTS,
159 	},
160 	.nxdom_buf_size = {
161 		.nb_def = NX_FSW_BUFSIZE,
162 		.nb_min = NX_FSW_MINBUFSIZE,
163 		.nb_max = NX_FSW_MAXBUFSIZE,
164 	},
165 	.nxdom_large_buf_size = {
166 		.nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
167 		.nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
168 		.nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
169 	},
170 	.nxdom_meta_size = {
171 		.nb_def = NX_FSW_UMD_SIZE,
172 		.nb_min = NX_FSW_UMD_SIZE,
173 		.nb_max = NX_METADATA_USR_MAX_SZ,
174 	},
175 	.nxdom_stats_size = {
176 		.nb_def = 0,
177 		.nb_min = 0,
178 		.nb_max = NX_STATS_MAX_SZ,
179 	},
180 	.nxdom_pipes = {
181 		.nb_def = 0,
182 		.nb_min = 0,
183 		.nb_max = NX_UPIPE_MAXPIPES,
184 	},
185 	.nxdom_flowadv_max = {
186 		.nb_def = 0,
187 		.nb_min = 0,
188 		.nb_max = NX_FLOWADV_MAX,
189 	},
190 	.nxdom_nexusadv_size = {
191 		.nb_def = 0,
192 		.nb_min = 0,
193 		.nb_max = NX_NEXUSADV_MAX_SZ,
194 	},
195 	.nxdom_capabilities = {
196 		.nb_def = NXPCAP_USER_CHANNEL,
197 		.nb_min = 0,
198 		.nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
199     NXPCAP_USER_CHANNEL),
200 	},
201 	.nxdom_qmap = {
202 		.nb_def = NEXUS_QMAP_TYPE_INVALID,
203 		.nb_min = NEXUS_QMAP_TYPE_INVALID,
204 		.nb_max = NEXUS_QMAP_TYPE_INVALID,
205 	},
206 	.nxdom_max_frags = {
207 		.nb_def = NX_PBUF_FRAGS_DEFAULT,
208 		.nb_min = NX_PBUF_FRAGS_MIN,
209 		.nb_max = NX_PBUF_FRAGS_MAX,
210 	},
211 	.nxdom_init =           nx_fsw_dom_init,
212 	.nxdom_terminate =      nx_fsw_dom_terminate,
213 	.nxdom_fini =           nx_fsw_dom_fini,
214 	.nxdom_connect =        nx_fsw_dom_connect,
215 	.nxdom_find_port =      nx_fsw_dom_find_port,
216 	.nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
217 	.nxdom_bind_port =      nx_fsw_dom_bind_port,
218 	.nxdom_unbind_port =    nx_fsw_dom_unbind_port,
219 	.nxdom_disconnect =     nx_fsw_dom_disconnect,
220 	.nxdom_defunct =        nx_fsw_dom_defunct,
221 	.nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
222 };
223 
224 struct kern_nexus_domain_provider nx_fsw_prov_s = {
225 	.nxdom_prov_name =              NEXUS_PROVIDER_FLOW_SWITCH,
226 	.nxdom_prov_flags =             NXDOMPROVF_DEFAULT,
227 	.nxdom_prov_cb = {
228 		.dp_cb_init =           nx_fsw_prov_init,
229 		.dp_cb_fini =           nx_fsw_prov_fini,
230 		.dp_cb_params =         nx_fsw_prov_params,
231 		.dp_cb_mem_new =        nx_fsw_prov_mem_new,
232 		.dp_cb_config =         nx_fsw_prov_config,
233 		.dp_cb_nx_ctor =        nx_fsw_prov_nx_ctor,
234 		.dp_cb_nx_dtor =        nx_fsw_prov_nx_dtor,
235 		.dp_cb_nx_mem_info =    NULL,   /* not supported */
236 		.dp_cb_nx_mib_get =     nx_fsw_prov_mib_get,
237 		.dp_cb_nx_stop =        NULL,
238 	},
239 };
240 
241 
242 static void
nx_fsw_dom_init(struct nxdom * nxdom)243 nx_fsw_dom_init(struct nxdom *nxdom)
244 {
245 	SK_LOCK_ASSERT_HELD();
246 	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
247 
248 	/* Generic initialization */
249 	fsw_init();
250 	fsw_dp_init();
251 
252 	(void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
253 }
254 
255 static void
nx_fsw_dom_terminate(struct nxdom * nxdom)256 nx_fsw_dom_terminate(struct nxdom *nxdom)
257 {
258 	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
259 
260 	SK_LOCK_ASSERT_HELD();
261 
262 	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
263 	    nxdom_prov_link, tnxdp) {
264 		(void) nxdom_prov_del(nxdom_prov);
265 	}
266 
267 	fsw_dp_uninit();
268 
269 	/* Generic uninitialization */
270 	fsw_uninit();
271 }
272 
/*
 * Domain fini callback; the flow switch has nothing to do here.
 */
static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
	/* intentionally empty */
}
278 
279 static int
nx_fsw_prov_init(struct kern_nexus_domain_provider * nxdom_prov)280 nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
281 {
282 #pragma unused(nxdom_prov)
283 	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
284 	return 0;
285 }
286 
287 static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)288 nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
289     const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
290 {
291 #pragma unused(nxdom_prov, nxp)
292 	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
293 	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);
294 
295 	*(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
296 	*(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
297 	VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
298 	*(adj->adj_flowadv_max) = sk_max_flows;
299 	*(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
300 	*(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
301 	if (sk_cksum_tx != 0) {
302 		*(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
303 	}
304 	*(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
305 	    ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
306 	    2 : 1;
307 	*(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
308 	    NX_FSW_AFRINGSIZE;
309 
310 	if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
311 	    (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
312 		*(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
313 	}
314 
315 	if (*(adj->adj_max_frags) > 1) {
316 		uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
317 		    NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
318 		uint32_t magazine_max_objs;
319 
320 		*(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
321 		    sk_fsw_max_bufs : fsw_maxbufs;
322 
323 		/*
324 		 * Given that packet objects are the ones cached, use the
325 		 * metadata size to determine the extra amount of objects
326 		 * at magazine layer.
327 		 */
328 		magazine_max_objs = skmem_cache_magazine_max(
329 			NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
330 			METADATA_PREAMBLE_SZ);
331 
332 		/*
333 		 * Adjust the max buffers to account for the increase
334 		 * associated with per-CPU caching.
335 		 */
336 		if (skmem_allow_magazines() &&
337 		    magazine_max_objs < *(adj->adj_max_buffers)) {
338 			*(adj->adj_max_buffers) -= magazine_max_objs;
339 		}
340 	}
341 	if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
342 	    (*(adj->adj_max_frags) <= 1)) {
343 		*(adj->adj_large_buf_size) = 0;
344 	}
345 	return 0;
346 }
347 
348 static int
nx_fsw_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)349 nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
350     const uint32_t req, const struct nxprov_params *nxp0,
351     struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
352     uint32_t pp_region_config_flags)
353 {
354 	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
355 
356 	/* USD regions need to be writable to support user packet pool */
357 	srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
358 	srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
359 
360 	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
361 	           nxdom, nxdom, nxdom, pp_region_config_flags,
362 	           nx_fsw_prov_params_adjust);
363 }
364 
365 static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)366 nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
367     struct kern_nexus *nx, struct nexus_adapter *na)
368 {
369 #pragma unused(nxdom_prov)
370 	int err = 0;
371 	struct skmem_region_params *srp = NX_PROV(nx)->nxprov_region_params;
372 
373 	SK_DF(SK_VERB_FSW,
374 	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
375 	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
376 	    SK_KVA(na));
377 
378 	ASSERT(na->na_type == NA_FLOWSWITCH_VP);
379 	ASSERT(na->na_arena == NULL);
380 	ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
381 	/*
382 	 * Each port in the flow switch is isolated from one another;
383 	 * use NULL for the packet buffer pool references to indicate
384 	 * this, since otherwise we'd be sharing the same pp for the
385 	 * entire switch (maybe for a future, special use case?)
386 	 *
387 	 * This means that clients calling kern_nexus_get_pbufpool()
388 	 * will get NULL, but this is fine based on current design
389 	 * of providing port isolation, and also since we don't expose
390 	 * the flow switch to external kernel clients.
391 	 */
392 	uint32_t pp_flags = NX_USER_CHANNEL_PROV(nx) ?
393 	    0 : SKMEM_PP_FLAG_TRUNCATED_BUF;
394 	na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, pp_flags,
395 	    &nx->nx_adv, &err);
396 	ASSERT(na->na_arena != NULL || err != 0);
397 	return err;
398 }
399 
400 static int
nx_fsw_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)401 nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
402     struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
403     struct proc *p, kauth_cred_t cred)
404 {
405 #pragma unused(nxdom_prov)
406 	struct sockopt sopt;
407 	int err = 0;
408 
409 	SK_LOCK_ASSERT_HELD();
410 
411 	/* proceed only if the client possesses flow switch entitlement */
412 	if ((err = skywalk_priv_check_cred(p, cred,
413 	    PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
414 		goto done;
415 	}
416 
417 	if (ncr->nc_req == USER_ADDR_NULL) {
418 		err = EINVAL;
419 		goto done;
420 	}
421 
422 	/* to make life easier for handling copies */
423 	bzero(&sopt, sizeof(sopt));
424 	sopt.sopt_dir = sopt_dir;
425 	sopt.sopt_val = ncr->nc_req;
426 	sopt.sopt_valsize = ncr->nc_req_len;
427 	sopt.sopt_p = p;
428 
429 	/* avoid _MALLOCing at the cost of this ugly switch block */
430 	switch (ncr->nc_cmd) {
431 	case NXCFG_CMD_ATTACH:
432 	case NXCFG_CMD_DETACH: {
433 		struct nx_spec_req nsr;
434 
435 		bzero(&nsr, sizeof(nsr));
436 		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
437 		if (err != 0) {
438 			goto done;
439 		}
440 
441 		/*
442 		 * Null-terminate in case this has an interface name;
443 		 * the union is already large enough for uuid_t.
444 		 */
445 		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
446 		if (p != kernproc) {
447 			nsr.nsr_flags &= NXSPECREQ_MASK;
448 		}
449 
450 		err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
451 		if (err != 0) {
452 			goto done;
453 		}
454 
455 		/* XXX: [email protected] -- can this copyout fail? */
456 		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
457 		break;
458 	}
459 
460 	case NXCFG_CMD_FLOW_ADD:
461 	case NXCFG_CMD_FLOW_DEL: {
462 		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
463 		    offsetof(struct nx_flow_req, _nfr_common_field_end));
464 		struct nx_flow_req nfr;
465 
466 		bzero(&nfr, sizeof(nfr));
467 		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
468 		if (err != 0) {
469 			goto done;
470 		}
471 
472 		err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
473 		if (err != 0) {
474 			goto done;
475 		}
476 
477 		/* XXX: [email protected] -- can this copyout fail? */
478 		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
479 		break;
480 	}
481 
482 	case NXCFG_CMD_NETEM: {
483 		struct if_netem_params inp;
484 
485 		bzero(&inp, sizeof(inp));
486 		err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
487 		if (err != 0) {
488 			goto done;
489 		}
490 		err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
491 		if (err != 0) {
492 			goto done;
493 		}
494 		break;
495 	}
496 
497 	default:
498 		err = EINVAL;
499 		goto done;
500 	}
501 
502 done:
503 	SK_DF(err ? SK_VERB_ERROR: SK_VERB_FSW,
504 	    "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
505 	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
506 	return err;
507 }
508 
509 static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)510 nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
511 {
512 #pragma unused(nxdom_prov)
513 	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
514 }
515 
516 static int
nx_fsw_prov_nx_ctor(struct kern_nexus * nx)517 nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
518 {
519 	struct nx_flowswitch *fsw;
520 
521 	SK_LOCK_ASSERT_HELD();
522 
523 	ASSERT(nx->nx_arg == NULL);
524 
525 	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
526 
527 	fsw = fsw_alloc(Z_WAITOK);
528 	nx->nx_arg = fsw;
529 	fsw->fsw_nx = nx;
530 	fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
531 	fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;
532 
533 	FSW_WLOCK(fsw);
534 
535 	fsw_dp_ctor(fsw);
536 
537 	FSW_WUNLOCK(fsw);
538 
539 	SK_D("create new fsw 0x%llx for nexus 0x%llx",
540 	    SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));
541 
542 	return 0;
543 }
544 
545 static void
nx_fsw_prov_nx_dtor(struct kern_nexus * nx)546 nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
547 {
548 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
549 	int err;
550 
551 	SK_LOCK_ASSERT_HELD();
552 
553 	SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
554 	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));
555 
556 	err = fsw_ctl_detach(nx, current_proc(), NULL);
557 	ASSERT(err == 0);       /* this cannot fail */
558 	ASSERT(fsw->fsw_dev_ch == NULL);
559 	ASSERT(fsw->fsw_host_ch == NULL);
560 
561 	SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
562 	fsw_free(fsw);
563 	nx->nx_arg = NULL;
564 }
565 
566 static size_t
nx_fsw_prov_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)567 nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
568     void *out, size_t len, struct proc *p)
569 {
570 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
571 
572 	/* this check doesn't require holding fsw_lock */
573 	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
574 	    (uuid_compare(filter->nmf_nx_uuid,
575 	    fsw->fsw_nx->nx_uuid)) != 0) {
576 		return 0;
577 	}
578 
579 	/* intercept NXMIB_FSW_STATS here since it's for flowswitch */
580 	FSW_RLOCK(fsw);
581 	len = fsw_mib_get(fsw, filter, out, len, p);
582 	FSW_UNLOCK(fsw);
583 
584 	return len;
585 }
586 
587 boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus * nx,nexus_port_t nx_port)588 nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
589 {
590 #pragma unused(nx)
591 	return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
592 }
593 
594 static int
nx_fsw_dom_find_port(struct kern_nexus * nx,boolean_t rsvd,nexus_port_t * nx_port)595 nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
596     nexus_port_t *nx_port)
597 {
598 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
599 	nexus_port_t first, last, port;
600 	int error;
601 
602 	ASSERT(nx_port != NULL);
603 
604 	port = *nx_port;
605 	ASSERT(port == NEXUS_PORT_ANY);
606 
607 	if (rsvd) {
608 		first = 0;
609 		last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
610 	} else {
611 		first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
612 		ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
613 		last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
614 	}
615 	ASSERT(first <= last);
616 
617 	FSW_WLOCK(fsw);
618 	if (__improbable(first == last)) {
619 		error = ENOSPC;
620 	} else {
621 		error = nx_port_find(nx, first, last - 1, &port);
622 		ASSERT(error != 0 || (port >= first && port < last));
623 	}
624 	FSW_WUNLOCK(fsw);
625 
626 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
627 	    "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
628 	    nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
629 	    (int)port, first, (last - 1), error);
630 
631 	if (error == 0) {
632 		*nx_port = port;
633 	}
634 
635 	return error;
636 }
637 
638 static int
nx_fsw_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)639 nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
640     struct nxbind *nxb, void *info)
641 {
642 #pragma unused(info)
643 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
644 	nexus_port_t first, last, port;
645 	int error;
646 
647 	ASSERT(nx_port != NULL);
648 	ASSERT(nxb != NULL);
649 
650 	port = *nx_port;
651 
652 	/* can't bind reserved ports to client credentials */
653 	if (nx_fsw_dom_port_is_reserved(nx, port)) {
654 		return EDOM;
655 	}
656 
657 	/*
658 	 * Allow clients to bind to regular ports (non-reserved);
659 	 * reserved ports aren't subject to bind/unbind, since
660 	 * they are used for internal purposes.
661 	 */
662 	first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
663 	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
664 	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
665 	ASSERT(first <= last);
666 
667 	FSW_WLOCK(fsw);
668 	if (__improbable(first == last)) {
669 		error = ENOSPC;
670 	} else if (port != NEXUS_PORT_ANY) {
671 		error = nx_port_bind(nx, port, nxb);
672 	} else {
673 		error = nx_port_find(nx, first, last - 1, &port);
674 		ASSERT(error != 0 || (port >= first && port < last));
675 		if (error == 0) {
676 			error = nx_port_bind(nx, port, nxb);
677 		}
678 	}
679 	FSW_WUNLOCK(fsw);
680 
681 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
682 	    "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
683 	    nx->nx_prov->nxprov_params->nxp_name, (int)port,
684 	    first, (last - 1), error);
685 
686 	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
687 	if (error == 0) {
688 		*nx_port = port;
689 	}
690 
691 	return error;
692 }
693 
694 static int
nx_fsw_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)695 nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
696 {
697 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
698 	int error;
699 
700 	FSW_WLOCK(fsw);
701 	error = nx_port_unbind(nx, nx_port);
702 	FSW_WUNLOCK(fsw);
703 
704 	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
705 	    "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
706 	    nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);
707 
708 	return error;
709 }
710 
711 static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct kern_channel * ch0,struct nxbind * nxb,struct proc * p)712 nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
713     struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
714     struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
715 {
716 #pragma unused(nxdom_prov)
717 	nexus_port_t port = chr->cr_port;
718 	int err = 0;
719 
720 	SK_LOCK_ASSERT_HELD();
721 
722 	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
723 	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
724 	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
725 	ASSERT(!(ch->ch_flags & CHANF_HOST));
726 
727 	if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
728 		err = EDOM;
729 		goto done;
730 	}
731 
732 	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
733 	if (ch->ch_flags & CHANF_KERNEL) {
734 		uuid_string_t uuidstr;
735 		ASSERT(!uuid_is_null(chr->cr_spec_uuid));
736 		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
737 		    "%s_%llu:%s", NX_FSW_NAME, nx->nx_id,
738 		    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr));
739 		chr->cr_ring_set = RING_SET_DEFAULT;
740 		if (chr->cr_mode & CHMODE_HOST) {
741 			atomic_bitset_32(&ch->ch_flags, CHANF_HOST);
742 		}
743 		err = na_connect_spec(nx, ch, chr, p);
744 	} else {
745 		ASSERT(port != NEXUS_PORT_ANY);
746 		if (chr->cr_mode & CHMODE_HOST) {
747 			/* not allowed unless kernel (special) channel */
748 			err = EINVAL;
749 			goto done;
750 		}
751 		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
752 		    "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
753 		chr->cr_ring_set = RING_SET_DEFAULT;
754 		err = na_connect(nx, ch, chr, ch0, nxb, p);
755 	}
756 
757 done:
758 	return err;
759 }
760 
761 static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)762 nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
763     struct kern_nexus *nx, struct kern_channel *ch)
764 {
765 #pragma unused(nxdom_prov)
766 	SK_LOCK_ASSERT_HELD();
767 
768 	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
769 	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
770 	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
771 
772 	if (ch->ch_flags & CHANF_KERNEL) {
773 		na_disconnect_spec(nx, ch);
774 	} else {
775 		na_disconnect(nx, ch);
776 	}
777 }
778 
779 static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)780 nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
781     struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
782 {
783 #pragma unused(nxdom_prov)
784 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
785 
786 	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
787 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
788 	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
789 
790 	/*
791 	 * Hold the flowswitch lock as writer; this prevents all data path
792 	 * accesses to the flowswitch, and allows us to mark the rings with
793 	 * CKRF_DEFUNCT.  Unlike some other nexus types, the flowswitch
794 	 * doesn't utilize kr_{enter,exit} for serialization, at present.
795 	 */
796 	FSW_WLOCK(fsw);
797 	na_ch_rings_defunct(ch, p);
798 	FSW_WUNLOCK(fsw);
799 }
800 
801 static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)802 nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
803     struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
804 {
805 #pragma unused(nxdom_prov)
806 	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
807 	int err = 0;
808 
809 	if (!locked) {
810 		SK_LOCK_ASSERT_NOTHELD();
811 		SK_LOCK();
812 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
813 	} else {
814 		SK_LOCK_ASSERT_HELD();
815 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
816 	}
817 
818 	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
819 	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
820 	ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);
821 
822 	err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));
823 
824 	if (err == 0) {
825 		na_defunct(nx, ch, ch->ch_na, locked);
826 	}
827 
828 	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
829 	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
830 	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
831 	    ch->ch_info->cinfo_nx_port,
832 	    (int)ch->ch_info->cinfo_ch_ring_id, err);
833 
834 	if (!locked) {
835 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
836 		SK_UNLOCK();
837 	} else {
838 		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
839 		SK_LOCK_ASSERT_HELD();
840 	}
841 }
842 
#if SK_LOG
/*
 * Logs the channel request seen by nx_fsw_na_find(), appending
 * " (skipped)" when the request name lacks the NX_FSW_NAME prefix.
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;
	const char *suffix;

	/* same prefix test that nx_fsw_na_find() applies */
	if (strncmp(chr->cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
		suffix = " (skipped)";
	} else {
		suffix = "";
	}

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
	    chr->cr_endpoint, create, suffix);
}
#endif /* SK_LOG */
860 
861 /*
862  * Try to get a reference to a Nexus adapter attached to a flow switch.
863  * If the adapter is found (or is created), this function returns 0, a
864  * non NULL pointer is returned into *na, and the caller holds a
865  * reference to the adapter.
866  * If an adapter is not found, then no reference is grabbed and the
867  * function returns an error code, or 0 if there is just a flow switch prefix
868  * mismatch. Therefore the caller holds a reference when
869  * (*na != NULL && return == 0).
870  */
871 int
nx_fsw_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** na,boolean_t create)872 nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
873     struct chreq *chr, struct nxbind *nxb, struct proc *p,
874     struct nexus_adapter **na, boolean_t create)
875 {
876 #pragma unused(ch)
877 	struct nexus_vp_adapter *vpna = NULL;
878 	char *cr_name = chr->cr_name;
879 	struct nx_flowswitch *fsw;
880 	int error = 0;
881 
882 	SK_LOCK_ASSERT_HELD();
883 	*na = NULL;     /* default return value */
884 
885 #if SK_LOG
886 	if (__improbable(sk_verbose != 0)) {
887 		nx_fsw_na_find_log(chr, create);
888 	}
889 #endif /* SK_LOG */
890 
891 	/* first try to see if this is a flow switch port. */
892 	if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
893 		return 0;  /* no error, but no flow switch prefix */
894 	}
895 	ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
896 	fsw = NX_FSW_PRIVATE(nx);
897 	ASSERT(fsw != NULL);
898 
899 	if (!create) {
900 		return ENXIO;
901 	}
902 
903 	/*
904 	 * The flowswitch VP is only attachable from a user channel so none of
905 	 * these flags should be set.
906 	 */
907 	ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
908 	error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
909 	ASSERT(vpna == NULL || error == 0);
910 
911 	if (error == 0) {
912 		/* use reference held by nx_fsw_attach_vp above */
913 		*na = &vpna->vpna_up;
914 		SK_DF(SK_VERB_FSW,
915 		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
916 		    (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
917 		    cr_name, (int)vpna->vpna_nx_port);
918 	}
919 
920 	return error;
921 }
922 
923 int
nx_fsw_netagent_add(struct kern_nexus * nx)924 nx_fsw_netagent_add(struct kern_nexus *nx)
925 {
926 	return fsw_netagent_add_remove(nx, TRUE);
927 }
928 
929 int
nx_fsw_netagent_remove(struct kern_nexus * nx)930 nx_fsw_netagent_remove(struct kern_nexus *nx)
931 {
932 	return fsw_netagent_add_remove(nx, FALSE);
933 }
934 
/*
 * Thin wrapper that forwards to fsw_netagent_update().
 */
void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	/* nothing to add here; the fsw layer does the work */
	fsw_netagent_update(nx);
}
940