/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the flow switch for Skywalk.
 *
 * --- FLOW SWITCH ---
 *
 * For each switch, a lock protects deletion of ports. When configuring
 * or deleting a port, the lock is acquired in exclusive mode (after
 * holding SK_LOCK).  When forwarding, the lock is acquired in shared
 * mode (without SK_LOCK).  The lock is held throughout the entire
 * forwarding cycle, during which the thread may incur a page fault.
 * Hence it is important that sleepable shared locks are used.
 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
 * copied from source to destination, and then the lock is acquired again
 * and the receive ring is updated.  (A similar thing is done on the tx
 * ring for NIC and host stack ports attached to the switch.)
 *
 * When a netif is attached to a flowswitch, two kernel channels are opened:
 * the device and host channels. The device channel provides the device
 * datapath. The host channel is not used in the datapath; it is there
 * only to provide some callbacks for activating the hostna (e.g.
 * intercepting host packets).
 */
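/*
 * A minimal sketch (pseudocode only; the helper names are illustrative,
 * not actual functions in this file) of the rx-ring pattern described
 * above:
 *
 *	lock(per_port_lock);
 *	slots = reserve_rx_slots(ring, n);	// claim space in the ring
 *	unlock(per_port_lock);
 *	copy_packets(src, slots);		// copy may fault; no lock held
 *	lock(per_port_lock);
 *	publish_rx_slots(ring, slots);		// make the new slots visible
 *	unlock(per_port_lock);
 */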

#include <net/bpf.h>
#include <netinet/tcp_seq.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/nexus_var.h>
#include <sys/protosw.h>
#include <sys/domain.h>

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");

static void nx_fsw_dom_init(struct nxdom *);
static void nx_fsw_dom_terminate(struct nxdom *);
static void nx_fsw_dom_fini(struct nxdom *);
static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct nxbind *, struct proc *);
static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
    const struct nxprov_params *, struct nxprov_adjusted_params *);
static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
    const uint32_t, const struct nxprov_params *, struct nxprov_params *,
    struct skmem_region_params[SKMEM_REGIONS], uint32_t);
static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nexus_adapter *);
static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
    kauth_cred_t);
static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
    struct nexus_mib_filter *, void *__sized_by(len)out, size_t len,
    struct proc *);

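/*
 * Domain descriptor for the flowswitch nexus.  Each { nb_def, nb_min,
 * nb_max } triple below gives the default value and allowed range for a
 * parameter; requested provider parameters are validated and adjusted
 * against these bounds (see the nxprov_params_adjust() call in
 * nx_fsw_prov_params() below).
 */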
struct nxdom nx_flowswitch_dom_s = {
	.nxdom_prov_head =
    STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
	.nxdom_type =           NEXUS_TYPE_FLOW_SWITCH,
	.nxdom_md_type =        NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype =     NEXUS_META_SUBTYPE_RAW,
	.nxdom_name =           "flowswitch",
	.nxdom_ports = {
		.nb_def = NX_FSW_VP_MAX,
		.nb_min = NX_FSW_VP_MIN,
		.nb_max = NX_FSW_VP_MAX,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_FSW_TXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_FSW_RXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_FSW_BUFSIZE,
		.nb_min = NX_FSW_MINBUFSIZE,
		.nb_max = NX_FSW_MAXBUFSIZE,
	},
	.nxdom_large_buf_size = {
		.nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
		.nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
		.nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
	},
	.nxdom_meta_size = {
		.nb_def = NX_FSW_UMD_SIZE,
		.nb_min = NX_FSW_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
    NXPCAP_USER_CHANNEL),
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_INVALID,
		.nb_min = NEXUS_QMAP_TYPE_INVALID,
		.nb_max = NEXUS_QMAP_TYPE_INVALID,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	.nxdom_init =           nx_fsw_dom_init,
	.nxdom_terminate =      nx_fsw_dom_terminate,
	.nxdom_fini =           nx_fsw_dom_fini,
	.nxdom_connect =        nx_fsw_dom_connect,
	.nxdom_find_port =      nx_fsw_dom_find_port,
	.nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
	.nxdom_bind_port =      nx_fsw_dom_bind_port,
	.nxdom_unbind_port =    nx_fsw_dom_unbind_port,
	.nxdom_disconnect =     nx_fsw_dom_disconnect,
	.nxdom_defunct =        nx_fsw_dom_defunct,
	.nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
};

struct kern_nexus_domain_provider nx_fsw_prov_s = {
	.nxdom_prov_name =              NEXUS_PROVIDER_FLOW_SWITCH,
	.nxdom_prov_flags =             NXDOMPROVF_DEFAULT,
	.nxdom_prov_cb = {
		.dp_cb_init =           nx_fsw_prov_init,
		.dp_cb_fini =           nx_fsw_prov_fini,
		.dp_cb_params =         nx_fsw_prov_params,
		.dp_cb_mem_new =        nx_fsw_prov_mem_new,
		.dp_cb_config =         nx_fsw_prov_config,
		.dp_cb_nx_ctor =        nx_fsw_prov_nx_ctor,
		.dp_cb_nx_dtor =        nx_fsw_prov_nx_dtor,
		.dp_cb_nx_mem_info =    NULL,   /* not supported */
		.dp_cb_nx_mib_get =     nx_fsw_prov_mib_get,
		.dp_cb_nx_stop =        NULL,
	},
};

static void
nx_fsw_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	/* Generic initialization */
	fsw_init();
	fsw_dp_init();

	(void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
}

static void
nx_fsw_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}

	fsw_dp_uninit();

	/* Generic uninitialization */
	fsw_uninit();
}

static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}

static int
nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}

static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
#pragma unused(nxdom_prov, nxp)
	static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
	static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);

	*(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
	VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
	*(adj->adj_flowadv_max) = sk_max_flows;
	*(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
	*(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
	if (sk_cksum_tx != 0) {
		*(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
	}
	*(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
	    ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
	    2 : 1;
	*(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
	    NX_FSW_AFRINGSIZE;

	if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
	    (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
		*(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
	}

	if (*(adj->adj_max_frags) > 1) {
		uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
		    NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
		uint32_t magazine_max_objs;

		*(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
		    sk_fsw_max_bufs : fsw_maxbufs;

		/*
		 * Given that packet objects are the ones cached, use the
		 * metadata size to determine the extra number of objects
		 * at the magazine layer.
		 */
		magazine_max_objs = skmem_cache_magazine_max(
			NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
			METADATA_PREAMBLE_SZ);

		/*
		 * Adjust the max buffers to account for the increase
		 * associated with per-CPU caching.
		 */
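		/*
		 * Worked example (illustrative numbers only): with a cap
		 * of 8192 buffers and skmem_cache_magazine_max() reporting
		 * 512 cacheable packet objects, the pool limit below drops
		 * to 8192 - 512 = 7680, so that pool plus magazines stay
		 * near the configured cap.
		 */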
		if (skmem_allow_magazines() &&
		    magazine_max_objs < *(adj->adj_max_buffers)) {
			*(adj->adj_max_buffers) -= magazine_max_objs;
		}
	}
	if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
	    (*(adj->adj_max_frags) <= 1)) {
		*(adj->adj_large_buf_size) = 0;
	}
	return 0;
}

static int
nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

	/* USD regions need to be writable to support user packet pool */
	srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
	srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;

	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
	           nxdom, nxdom, nxdom, pp_region_config_flags,
	           nx_fsw_prov_params_adjust);
}

static void
fsw_vp_region_params_setup(struct nexus_adapter *na,
    struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp0,
    struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp)
{
	int i;
	uint32_t totalrings, nslots, afslots, evslots, lbaslots;

	/* copy default flowswitch parameters initialized in nxprov_params_adjust() */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		srp[i] = srp0[i];
	}
	/* customize parameters that could vary across NAs */
	totalrings = na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_RX) +
	    na_get_nrings(na, NR_A) + na_get_nrings(na, NR_F) +
	    na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);

	srp[SKMEM_REGION_SCHEMA].srp_r_obj_size =
	    (uint32_t)CHANNEL_SCHEMA_SIZE(totalrings);
	srp[SKMEM_REGION_SCHEMA].srp_r_obj_cnt = totalrings;
	skmem_region_params_config(&srp[SKMEM_REGION_SCHEMA]);

	srp[SKMEM_REGION_RING].srp_r_obj_size =
	    sizeof(struct __user_channel_ring);
	srp[SKMEM_REGION_RING].srp_r_obj_cnt = totalrings;
	skmem_region_params_config(&srp[SKMEM_REGION_RING]);

	nslots = na_get_nslots(na, NR_TX);
	afslots = na_get_nslots(na, NR_A);
	evslots = na_get_nslots(na, NR_EV);
	lbaslots = na_get_nslots(na, NR_LBA);
	srp[SKMEM_REGION_TXAKSD].srp_r_obj_size =
	    MAX(MAX(MAX(nslots, afslots), evslots), lbaslots) * SLOT_DESC_SZ;
	srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt =
	    na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_A) +
	    na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
	skmem_region_params_config(&srp[SKMEM_REGION_TXAKSD]);

	/* USD and KSD objects share the same size and count */
	srp[SKMEM_REGION_TXAUSD].srp_r_obj_size =
	    srp[SKMEM_REGION_TXAKSD].srp_r_obj_size;
	srp[SKMEM_REGION_TXAUSD].srp_r_obj_cnt =
	    srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt;
	skmem_region_params_config(&srp[SKMEM_REGION_TXAUSD]);
}

static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	struct skmem_region_params *srp0 = NX_PROV(nx)->nxprov_region_params;
	struct skmem_region_params srp[SKMEM_REGIONS];

	SK_DF(SK_VERB_FSW,
	    "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_type == NA_FLOWSWITCH_VP);
	ASSERT(na->na_arena == NULL);
	ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);

	fsw_vp_region_params_setup(na, srp0, srp);
	/*
	 * Ports in the flow switch are isolated from one another;
	 * use NULL for the packet buffer pool references to indicate
	 * this, since otherwise we'd be sharing the same pp for the
	 * entire switch (maybe for a future, special use case?)
	 *
	 * This means that clients calling kern_nexus_get_pbufpool()
	 * will get NULL, but this is fine based on the current design
	 * of providing port isolation, and also since we don't expose
	 * the flow switch to external kernel clients.
	 */
	na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
	    !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
	ASSERT(na->na_arena != NULL || err != 0);
	return err;
}

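/*
 * A minimal usage sketch (hypothetical kernel-side caller, using only the
 * request structures visible in the function below): attaching an
 * interface by name via the NXCFG_CMD_ATTACH path reduces to filling in
 * a struct nx_spec_req and invoking fsw_ctl(), which is what the copyin
 * path in nx_fsw_prov_config() does on behalf of a user client; the
 * interface name here is illustrative:
 *
 *	struct nx_spec_req nsr;
 *	bzero(&nsr, sizeof(nsr));
 *	(void) strlcpy(nsr.nsr_name, "en0", sizeof(nsr.nsr_name));
 *	err = fsw_ctl(nx, NXCFG_CMD_ATTACH, kernproc, &nsr);
 */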
static int
nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	/* avoid _MALLOCing at the cost of this ugly switch block */
	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		/* proceed only if the client possesses flow switch entitlement */
		if (cred == NULL || (err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
			SK_ERR("missing nxctl credential");
			err = EPERM;
			goto done;
		}

		struct nx_spec_req nsr;
		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
		if (err != 0) {
			goto done;
		}

		err = sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}

	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* need to have owner nxctl or kernnxctl */
		if (cred == NULL) {
			SK_ERR("missing nxctl credential");
			err = EPERM;
			goto done;
		}
	} /* fall through */
	case NXCFG_CMD_FLOW_CONFIG: {
		/* checks flow PID ownership instead of nxctl credential */
		struct nx_flow_req nfr;
		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
		if (err != 0) {
			goto done;
		}

		err = sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}

	case NXCFG_CMD_NETEM: {
		struct if_netem_params inp;

		bzero(&inp, sizeof(inp));
		err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
		if (err != 0) {
			goto done;
		}
		err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
		if (err != 0) {
			goto done;
		}
		break;
	}

	default:
		err = EINVAL;
		goto done;
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus %p (%s) cmd %d (err %d)", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}

static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}

static int
nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nx->nx_arg == NULL);

	SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);

	fsw = fsw_alloc(Z_WAITOK);
	nx->nx_arg = fsw;
	fsw->fsw_nx = nx;
	fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
	fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;

	FSW_WLOCK(fsw);

	fsw_dp_ctor(fsw);

	FSW_WUNLOCK(fsw);

	SK_D("create new fsw %p for nexus %p",
	    SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));

	return 0;
}

static void
nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err;

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus %p (%s) fsw %p", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));

	err = fsw_ctl_detach(nx, current_proc(), NULL);
	ASSERT(err == 0);       /* this cannot fail */
	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);

	SK_DF(SK_VERB_FSW, "marking fsw %p as free", SK_KVA(fsw));
	fsw_free(fsw);
	nx->nx_arg = NULL;
}

static size_t
nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
    void *__sized_by(len)out, size_t len, struct proc *p)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	size_t rlen;

	/* this check doesn't require holding fsw_lock */
	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
	    (uuid_compare(filter->nmf_nx_uuid,
	    fsw->fsw_nx->nx_uuid)) != 0) {
		return 0;
	}

	/* intercept NXMIB_FSW_STATS here since it's for flowswitch */
	FSW_RLOCK(fsw);
	rlen = fsw_mib_get(fsw, filter, out, len, p);
	FSW_UNLOCK(fsw);

	return rlen;
}

boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
{
#pragma unused(nx)
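	/*
	 * Ports below NEXUS_PORT_FLOW_SWITCH_CLIENT are reserved for
	 * internal attach points.  E.g. assuming NEXUS_PORT_FLOW_SWITCH_CLIENT
	 * is 2 (illustrative), ports 0 and 1 would be reserved and client
	 * channels would start at port 2.
	 */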
	return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
}

static int
nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
    nexus_port_t *nx_port)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);

	port = *nx_port;
	ASSERT(port == NEXUS_PORT_ANY);

	if (rsvd) {
		first = 0;
		last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
	} else {
		first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
		ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
		last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	}
	ASSERT(first <= last);

	FSW_WLOCK(fsw);
	if (__improbable(first == last)) {
		error = ENOMEM;
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
	}
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx %p \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
	    (int)port, first, (last - 1), error);

	if (error == 0) {
		*nx_port = port;
	}

	return error;
}

static int
nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
#pragma unused(info)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/* can't bind reserved ports to client credentials */
	if (nx_fsw_dom_port_is_reserved(nx, port)) {
		return EDOM;
	}

	/*
	 * Allow clients to bind to regular ports (non-reserved);
	 * reserved ports aren't subject to bind/unbind, since
	 * they are used for internal purposes.
	 */
	first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	FSW_WLOCK(fsw);
	if (__improbable(first == last)) {
		error = ENOMEM;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind(nx, port, nxb);
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind(nx, port, nxb);
		}
	}
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx %p \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (int)port,
	    first, (last - 1), error);

	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	return error;
}

static int
nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int error;

	FSW_WLOCK(fsw);
	error = nx_port_unbind(nx, nx_port);
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx %p \"%s\" nx_port %d (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);

	return error;
}

static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	nexus_port_t port = chr->cr_port;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
	ASSERT(!(ch->ch_flags & CHANF_HOST));
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));

	if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	}

	chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
	ASSERT(port != NEXUS_PORT_ANY);
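	/*
	 * Channel names take the form "<NX_FSW_NAME>_<nexus id>:<port>";
	 * e.g. "fsw_1:2", assuming NX_FSW_NAME is "fsw" (illustrative).
	 */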
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
	    "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
	chr->cr_ring_set = RING_SET_DEFAULT;
	err = na_connect(nx, ch, chr, nxb, p);

done:
	return err;
}

static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_DF(SK_VERB_FSW, "channel %p -!- nexus %p (%s:\"%s\":%u:%d)",
	    SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name,
	    ch->ch_na->na_name, ch->ch_info->cinfo_nx_port,
	    (int)ch->ch_info->cinfo_ch_ring_id);

	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}

static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);

	/*
	 * Hold the flowswitch lock as writer; this prevents all data path
	 * accesses to the flowswitch, and allows us to mark the rings with
	 * CKRF_DEFUNCT.  Unlike some other nexus types, the flowswitch
	 * doesn't utilize kr_{enter,exit} for serialization, at present.
	 */
	FSW_WLOCK(fsw);
	na_ch_rings_defunct(ch, p);
	FSW_WUNLOCK(fsw);
}

static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
	ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);

	err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));

	if (err == 0) {
		na_defunct(nx, ch, ch->ch_na, locked);
	}

	SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d) err %d",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port,
	    (int)ch->ch_info->cinfo_ch_ring_id, err);

	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%x pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create,
	    (strlcmp(chr->cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME)) != 0) ?
	    " (skipped)" : "");
}
#endif /* SK_LOG */

/*
 * Try to get a reference to a Nexus adapter attached to a flow switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned in *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, no reference is grabbed and the function
 * returns an error code, or 0 if there is just a flow switch prefix
 * mismatch.  Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
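/*
 * Caller-side sketch (illustrative; the release path is assumed to be
 * na_release_locked(), as with other adapters):
 *
 *	struct nexus_adapter *na = NULL;
 *	error = nx_fsw_na_find(nx, ch, chr, nxb, p, &na, TRUE);
 *	if (error == 0 && na != NULL) {
 *		... use the adapter; drop the reference when done ...
 *	}
 */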
int
nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **na, boolean_t create)
{
	struct nexus_vp_adapter *__single vpna = NULL;
	char *cr_name = chr->cr_name;
	struct nx_flowswitch *fsw;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	*na = NULL;     /* default return value */

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		nx_fsw_na_find_log(chr, create);
	}
#endif /* SK_LOG */

	/* first try to see if this is a flow switch port */
	if (strlcmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
		return 0;  /* no error, but no flow switch prefix */
	}
	ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
	fsw = NX_FSW_PRIVATE(nx);
	ASSERT(fsw != NULL);

	if (!create) {
		return ENXIO;
	}

	/*
	 * The flowswitch VP is only attachable from a user channel, so none
	 * of these flags should be set.
	 */
	ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
	error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
	ASSERT(vpna == NULL || error == 0);

	if (error == 0) {
		/* use the reference held by fsw_attach_vp above */
		*na = &vpna->vpna_up;
		SK_DF(SK_VERB_FSW,
		    "vpna \"%s\" (%p) refs %u to fsw \"%s\" nx_port %d",
		    (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
		    cr_name, (int)vpna->vpna_nx_port);
	}

	return error;
}

int
nx_fsw_netagent_add(struct kern_nexus *nx)
{
	return fsw_netagent_add_remove(nx, TRUE);
}

int
nx_fsw_netagent_remove(struct kern_nexus *nx)
{
	return fsw_netagent_add_remove(nx, FALSE);
}

void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
}
976