1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54
55 /*
56 * This module implements the flow switch for Skywalk
57 *
58 * --- FLOW SWITCH ---
59 *
60 * For each switch, a lock protects deletion of ports. When configuring
61 * or deleting a new port, the lock is acquired in exclusive mode (after
62 * holding SK_LOCK). When forwarding, the lock is acquired in shared
63 * mode (without SK_LOCK). The lock is held throughout the entire
 * forwarding cycle, during which the thread may incur a page fault.
65 * Hence it is important that sleepable shared locks are used.
66 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
69 * copied from source to destination, and then the lock is acquired again
70 * and the receive ring is updated. (A similar thing is done on the tx
71 * ring for NIC and host stack ports attached to the switch)
72 *
73 * When a netif is attached to a flowswitch, two kernel channels are opened:
74 * The device and host channels. The device channel provides the device
75 * datapath. The host channel is not used in the datapath. It is there
76 * only for providing some callbacks for activating the hostna (e.g.
77 * intercepting host packets).
78 */
79
80 #include <net/bpf.h>
81 #include <netinet/tcp_seq.h>
82 #include <skywalk/os_skywalk_private.h>
83 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
84 #include <skywalk/nexus/flowswitch/fsw_var.h>
85 #include <skywalk/nexus/upipe/nx_user_pipe.h>
86 #include <skywalk/nexus/netif/nx_netif.h>
87 #include <skywalk/nexus/nexus_var.h>
88 #include <sys/protosw.h>
89 #include <sys/domain.h>
90
91 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
92 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");
93
94 static void nx_fsw_dom_init(struct nxdom *);
95 static void nx_fsw_dom_terminate(struct nxdom *);
96 static void nx_fsw_dom_fini(struct nxdom *);
97 static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
98 static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
99 struct nxbind *, void *);
100 static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
101 static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
102 struct kern_nexus *, struct kern_channel *, struct chreq *,
103 struct nxbind *, struct proc *);
104 static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
105 struct kern_nexus *, struct kern_channel *);
106 static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
107 struct kern_nexus *, struct kern_channel *, struct proc *);
108 static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
109 struct kern_nexus *, struct kern_channel *, boolean_t);
110
111 static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
112 static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
113 const struct nxprov_params *, struct nxprov_adjusted_params *);
114 static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
115 const uint32_t, const struct nxprov_params *, struct nxprov_params *,
116 struct skmem_region_params[SKMEM_REGIONS], uint32_t);
117 static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
118 struct kern_nexus *, struct nexus_adapter *);
119 static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
120 struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
121 kauth_cred_t);
122 static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
123 static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
124 static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
125 static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
126 struct nexus_mib_filter *, void *__sized_by(len)out, size_t len,
127 struct proc *);
128
129 struct nxdom nx_flowswitch_dom_s = {
130 .nxdom_prov_head =
131 STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
132 .nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
133 .nxdom_md_type = NEXUS_META_TYPE_PACKET,
134 .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
135 .nxdom_name = "flowswitch",
136 .nxdom_ports = {
137 .nb_def = NX_FSW_VP_MAX,
138 .nb_min = NX_FSW_VP_MIN,
139 .nb_max = NX_FSW_VP_MAX,
140 },
141 .nxdom_tx_rings = {
142 .nb_def = 1,
143 .nb_min = 1,
144 .nb_max = NX_FSW_MAXRINGS,
145 },
146 .nxdom_rx_rings = {
147 .nb_def = 1,
148 .nb_min = 1,
149 .nb_max = NX_FSW_MAXRINGS,
150 },
151 .nxdom_tx_slots = {
152 .nb_def = NX_FSW_TXRINGSIZE,
153 .nb_min = NX_FSW_MINSLOTS,
154 .nb_max = NX_FSW_MAXSLOTS,
155 },
156 .nxdom_rx_slots = {
157 .nb_def = NX_FSW_RXRINGSIZE,
158 .nb_min = NX_FSW_MINSLOTS,
159 .nb_max = NX_FSW_MAXSLOTS,
160 },
161 .nxdom_buf_size = {
162 .nb_def = NX_FSW_BUFSIZE,
163 .nb_min = NX_FSW_MINBUFSIZE,
164 .nb_max = NX_FSW_MAXBUFSIZE,
165 },
166 .nxdom_large_buf_size = {
167 .nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
168 .nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
169 .nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
170 },
171 .nxdom_meta_size = {
172 .nb_def = NX_FSW_UMD_SIZE,
173 .nb_min = NX_FSW_UMD_SIZE,
174 .nb_max = NX_METADATA_USR_MAX_SZ,
175 },
176 .nxdom_stats_size = {
177 .nb_def = 0,
178 .nb_min = 0,
179 .nb_max = NX_STATS_MAX_SZ,
180 },
181 .nxdom_pipes = {
182 .nb_def = 0,
183 .nb_min = 0,
184 .nb_max = NX_UPIPE_MAXPIPES,
185 },
186 .nxdom_flowadv_max = {
187 .nb_def = 0,
188 .nb_min = 0,
189 .nb_max = NX_FLOWADV_MAX,
190 },
191 .nxdom_nexusadv_size = {
192 .nb_def = 0,
193 .nb_min = 0,
194 .nb_max = NX_NEXUSADV_MAX_SZ,
195 },
196 .nxdom_capabilities = {
197 .nb_def = NXPCAP_USER_CHANNEL,
198 .nb_min = 0,
199 .nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
200 NXPCAP_USER_CHANNEL),
201 },
202 .nxdom_qmap = {
203 .nb_def = NEXUS_QMAP_TYPE_INVALID,
204 .nb_min = NEXUS_QMAP_TYPE_INVALID,
205 .nb_max = NEXUS_QMAP_TYPE_INVALID,
206 },
207 .nxdom_max_frags = {
208 .nb_def = NX_PBUF_FRAGS_DEFAULT,
209 .nb_min = NX_PBUF_FRAGS_MIN,
210 .nb_max = NX_PBUF_FRAGS_MAX,
211 },
212 .nxdom_init = nx_fsw_dom_init,
213 .nxdom_terminate = nx_fsw_dom_terminate,
214 .nxdom_fini = nx_fsw_dom_fini,
215 .nxdom_connect = nx_fsw_dom_connect,
216 .nxdom_find_port = nx_fsw_dom_find_port,
217 .nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
218 .nxdom_bind_port = nx_fsw_dom_bind_port,
219 .nxdom_unbind_port = nx_fsw_dom_unbind_port,
220 .nxdom_disconnect = nx_fsw_dom_disconnect,
221 .nxdom_defunct = nx_fsw_dom_defunct,
222 .nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
223 };
224
225 struct kern_nexus_domain_provider nx_fsw_prov_s = {
226 .nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
227 .nxdom_prov_flags = NXDOMPROVF_DEFAULT,
228 .nxdom_prov_cb = {
229 .dp_cb_init = nx_fsw_prov_init,
230 .dp_cb_fini = nx_fsw_prov_fini,
231 .dp_cb_params = nx_fsw_prov_params,
232 .dp_cb_mem_new = nx_fsw_prov_mem_new,
233 .dp_cb_config = nx_fsw_prov_config,
234 .dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
235 .dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
236 .dp_cb_nx_mem_info = NULL, /* not supported */
237 .dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
238 .dp_cb_nx_stop = NULL,
239 },
240 };
241
242
243 static void
nx_fsw_dom_init(struct nxdom * nxdom)244 nx_fsw_dom_init(struct nxdom *nxdom)
245 {
246 SK_LOCK_ASSERT_HELD();
247 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
248
249 /* Generic initialization */
250 fsw_init();
251 fsw_dp_init();
252
253 (void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
254 }
255
256 static void
nx_fsw_dom_terminate(struct nxdom * nxdom)257 nx_fsw_dom_terminate(struct nxdom *nxdom)
258 {
259 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
260
261 SK_LOCK_ASSERT_HELD();
262
263 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
264 nxdom_prov_link, tnxdp) {
265 (void) nxdom_prov_del(nxdom_prov);
266 }
267
268 fsw_dp_uninit();
269
270 /* Generic uninitialization */
271 fsw_uninit();
272 }
273
/*
 * Domain fini callback; the flowswitch domain has nothing to do here.
 */
static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
	(void)nxdom;
}
279
280 static int
nx_fsw_prov_init(struct kern_nexus_domain_provider * nxdom_prov)281 nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
282 {
283 #pragma unused(nxdom_prov)
284 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
285 return 0;
286 }
287
288 static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)289 nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
290 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
291 {
292 #pragma unused(nxdom_prov, nxp)
293 static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
294 static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);
295
296 *(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
297 VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
298 *(adj->adj_flowadv_max) = sk_max_flows;
299 *(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
300 *(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
301 if (sk_cksum_tx != 0) {
302 *(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
303 }
304 *(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
305 ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
306 2 : 1;
307 *(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
308 NX_FSW_AFRINGSIZE;
309
310 if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
311 (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
312 *(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
313 }
314
315 if (*(adj->adj_max_frags) > 1) {
316 uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
317 NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
318 uint32_t magazine_max_objs;
319
320 *(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
321 sk_fsw_max_bufs : fsw_maxbufs;
322
323 /*
324 * Given that packet objects are the ones cached, use the
325 * metadata size to determine the extra amount of objects
326 * at magazine layer.
327 */
328 magazine_max_objs = skmem_cache_magazine_max(
329 NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
330 METADATA_PREAMBLE_SZ);
331
332 /*
333 * Adjust the max buffers to account for the increase
334 * associated with per-CPU caching.
335 */
336 if (skmem_allow_magazines() &&
337 magazine_max_objs < *(adj->adj_max_buffers)) {
338 *(adj->adj_max_buffers) -= magazine_max_objs;
339 }
340 }
341 if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
342 (*(adj->adj_max_frags) <= 1)) {
343 *(adj->adj_large_buf_size) = 0;
344 }
345 return 0;
346 }
347
348 static int
nx_fsw_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)349 nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
350 const uint32_t req, const struct nxprov_params *nxp0,
351 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
352 uint32_t pp_region_config_flags)
353 {
354 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
355
356 /* USD regions need to be writable to support user packet pool */
357 srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
358 srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
359
360 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
361 nxdom, nxdom, nxdom, pp_region_config_flags,
362 nx_fsw_prov_params_adjust);
363 }
364
365 static void
fsw_vp_region_params_setup(struct nexus_adapter * na,struct skmem_region_params * __counted_by (SKMEM_REGIONS)srp0,struct skmem_region_params * __counted_by (SKMEM_REGIONS)srp)366 fsw_vp_region_params_setup(struct nexus_adapter *na,
367 struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp0,
368 struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp)
369 {
370 int i;
371 uint32_t totalrings, nslots, afslots, evslots, lbaslots;
372
373 /* copy default flowswitch parameters initialized in nxprov_params_adjust() */
374 for (i = 0; i < SKMEM_REGIONS; i++) {
375 srp[i] = srp0[i];
376 }
377 /* customize parameters that could vary across NAs */
378 totalrings = na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_RX) +
379 na_get_nrings(na, NR_A) + na_get_nrings(na, NR_F) +
380 na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
381
382 srp[SKMEM_REGION_SCHEMA].srp_r_obj_size =
383 (uint32_t)CHANNEL_SCHEMA_SIZE(totalrings);
384 srp[SKMEM_REGION_SCHEMA].srp_r_obj_cnt = totalrings;
385 skmem_region_params_config(&srp[SKMEM_REGION_SCHEMA]);
386
387 srp[SKMEM_REGION_RING].srp_r_obj_size =
388 sizeof(struct __user_channel_ring);
389 srp[SKMEM_REGION_RING].srp_r_obj_cnt = totalrings;
390 skmem_region_params_config(&srp[SKMEM_REGION_RING]);
391
392 nslots = na_get_nslots(na, NR_TX);
393 afslots = na_get_nslots(na, NR_A);
394 evslots = na_get_nslots(na, NR_EV);
395 lbaslots = na_get_nslots(na, NR_LBA);
396 srp[SKMEM_REGION_TXAKSD].srp_r_obj_size =
397 MAX(MAX(MAX(nslots, afslots), evslots), lbaslots) * SLOT_DESC_SZ;
398 srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt =
399 na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_A) +
400 na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
401 skmem_region_params_config(&srp[SKMEM_REGION_TXAKSD]);
402
403 /* USD and KSD objects share the same size and count */
404 srp[SKMEM_REGION_TXAUSD].srp_r_obj_size =
405 srp[SKMEM_REGION_TXAKSD].srp_r_obj_size;
406 srp[SKMEM_REGION_TXAUSD].srp_r_obj_cnt =
407 srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt;
408 skmem_region_params_config(&srp[SKMEM_REGION_TXAUSD]);
409 }
410
411 static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)412 nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
413 struct kern_nexus *nx, struct nexus_adapter *na)
414 {
415 #pragma unused(nxdom_prov)
416 int err = 0;
417 struct skmem_region_params *srp0 = NX_PROV(nx)->nxprov_region_params;
418 struct skmem_region_params srp[SKMEM_REGIONS];
419
420 SK_DF(SK_VERB_FSW,
421 "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx),
422 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
423 SK_KVA(na));
424
425 ASSERT(na->na_type == NA_FLOWSWITCH_VP);
426 ASSERT(na->na_arena == NULL);
427 ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
428
429 fsw_vp_region_params_setup(na, srp0, srp);
430 /*
431 * Each port in the flow switch is isolated from one another;
432 * use NULL for the packet buffer pool references to indicate
433 * this, since otherwise we'd be sharing the same pp for the
434 * entire switch (maybe for a future, special use case?)
435 *
436 * This means that clients calling kern_nexus_get_pbufpool()
437 * will get NULL, but this is fine based on current design
438 * of providing port isolation, and also since we don't expose
439 * the flow switch to external kernel clients.
440 */
441 na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
442 !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
443 ASSERT(na->na_arena != NULL || err != 0);
444 return err;
445 }
446
447 static int
nx_fsw_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)448 nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
449 struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
450 struct proc *p, kauth_cred_t cred)
451 {
452 #pragma unused(nxdom_prov)
453 struct sockopt sopt;
454 int err = 0;
455
456 SK_LOCK_ASSERT_HELD();
457
458 if (ncr->nc_req == USER_ADDR_NULL) {
459 err = EINVAL;
460 goto done;
461 }
462
463 /* to make life easier for handling copies */
464 bzero(&sopt, sizeof(sopt));
465 sopt.sopt_dir = sopt_dir;
466 sopt.sopt_val = ncr->nc_req;
467 sopt.sopt_valsize = ncr->nc_req_len;
468 sopt.sopt_p = p;
469
470 /* avoid _MALLOCing at the cost of this ugly switch block */
471 switch (ncr->nc_cmd) {
472 case NXCFG_CMD_ATTACH:
473 case NXCFG_CMD_DETACH: {
474 /* proceed only if the client possesses flow switch entitlement */
475 if (cred == NULL || (err = skywalk_priv_check_cred(p, cred,
476 PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
477 SK_ERR("missing nxctl credential");
478 err = EPERM;
479 goto done;
480 }
481
482 struct nx_spec_req nsr;
483 bzero(&nsr, sizeof(nsr));
484 err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
485 if (err != 0) {
486 goto done;
487 }
488
489 /*
490 * Null-terminate in case this has an interface name;
491 * the union is already large enough for uuid_t.
492 */
493 nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
494 if (p != kernproc) {
495 nsr.nsr_flags &= NXSPECREQ_MASK;
496 }
497
498 err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
499 if (err != 0) {
500 goto done;
501 }
502
503 err = sooptcopyout(&sopt, &nsr, sizeof(nsr));
504 break;
505 }
506
507 case NXCFG_CMD_FLOW_ADD:
508 case NXCFG_CMD_FLOW_DEL: {
509 /* need to have owner nxctl or kernnxctl */
510 if (cred == NULL) {
511 SK_ERR("missing nxctl credential");
512 err = EPERM;
513 goto done;
514 }
515 } /* fall through */
516 case NXCFG_CMD_FLOW_CONFIG: {
517 /* checks flow PID ownership instead of nxctl creditial */
518 struct nx_flow_req nfr;
519 bzero(&nfr, sizeof(nfr));
520 err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
521 if (err != 0) {
522 goto done;
523 }
524
525 err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
526 if (err != 0) {
527 goto done;
528 }
529
530 err = sooptcopyout(&sopt, &nfr, sizeof(nfr));
531 break;
532 }
533
534 case NXCFG_CMD_NETEM: {
535 struct if_netem_params inp;
536
537 bzero(&inp, sizeof(inp));
538 err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
539 if (err != 0) {
540 goto done;
541 }
542 err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
543 if (err != 0) {
544 goto done;
545 }
546 break;
547 }
548
549 default:
550 err = EINVAL;
551 goto done;
552 }
553
554 done:
555 SK_DF(err ? SK_VERB_ERROR: SK_VERB_FSW,
556 "nexus %p (%s) cmd %d (err %d)", SK_KVA(nx),
557 NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
558 return err;
559 }
560
561 static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)562 nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
563 {
564 #pragma unused(nxdom_prov)
565 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
566 }
567
568 static int
nx_fsw_prov_nx_ctor(struct kern_nexus * nx)569 nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
570 {
571 struct nx_flowswitch *fsw;
572
573 SK_LOCK_ASSERT_HELD();
574
575 ASSERT(nx->nx_arg == NULL);
576
577 SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
578
579 fsw = fsw_alloc(Z_WAITOK);
580 nx->nx_arg = fsw;
581 fsw->fsw_nx = nx;
582 fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
583 fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;
584
585 FSW_WLOCK(fsw);
586
587 fsw_dp_ctor(fsw);
588
589 FSW_WUNLOCK(fsw);
590
591 SK_D("create new fsw %p for nexus %p",
592 SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));
593
594 return 0;
595 }
596
597 static void
nx_fsw_prov_nx_dtor(struct kern_nexus * nx)598 nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
599 {
600 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
601 int err;
602
603 SK_LOCK_ASSERT_HELD();
604
605 SK_D("nexus %p (%s) fsw %p", SK_KVA(nx),
606 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));
607
608 err = fsw_ctl_detach(nx, current_proc(), NULL);
609 ASSERT(err == 0); /* this cannot fail */
610 ASSERT(fsw->fsw_dev_ch == NULL);
611 ASSERT(fsw->fsw_host_ch == NULL);
612
613 SK_DF(SK_VERB_FSW, "marking fsw %p as free", SK_KVA(fsw));
614 fsw_free(fsw);
615 nx->nx_arg = NULL;
616 }
617
618 static size_t
nx_fsw_prov_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)619 nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
620 void *__sized_by(len)out, size_t len, struct proc *p)
621 {
622 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
623 size_t rlen;
624
625 /* this check doesn't require holding fsw_lock */
626 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
627 (uuid_compare(filter->nmf_nx_uuid,
628 fsw->fsw_nx->nx_uuid)) != 0) {
629 return 0;
630 }
631
632 /* intercept NXMIB_FSW_STATS here since it's for flowswitch */
633 FSW_RLOCK(fsw);
634 rlen = fsw_mib_get(fsw, filter, out, len, p);
635 FSW_UNLOCK(fsw);
636
637 return rlen;
638 }
639
640 boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus * nx,nexus_port_t nx_port)641 nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
642 {
643 #pragma unused(nx)
644 return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
645 }
646
647 static int
nx_fsw_dom_find_port(struct kern_nexus * nx,boolean_t rsvd,nexus_port_t * nx_port)648 nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
649 nexus_port_t *nx_port)
650 {
651 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
652 nexus_port_t first, last, port;
653 int error;
654
655 ASSERT(nx_port != NULL);
656
657 port = *nx_port;
658 ASSERT(port == NEXUS_PORT_ANY);
659
660 if (rsvd) {
661 first = 0;
662 last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
663 } else {
664 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
665 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
666 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
667 }
668 ASSERT(first <= last);
669
670 FSW_WLOCK(fsw);
671 if (__improbable(first == last)) {
672 error = ENOMEM;
673 } else {
674 error = nx_port_find(nx, first, last - 1, &port);
675 ASSERT(error != 0 || (port >= first && port < last));
676 }
677 FSW_WUNLOCK(fsw);
678
679 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
680 "nx %p \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
681 nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
682 (int)port, first, (last - 1), error);
683
684 if (error == 0) {
685 *nx_port = port;
686 }
687
688 return error;
689 }
690
691 static int
nx_fsw_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)692 nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
693 struct nxbind *nxb, void *info)
694 {
695 #pragma unused(info)
696 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
697 nexus_port_t first, last, port;
698 int error;
699
700 ASSERT(nx_port != NULL);
701 ASSERT(nxb != NULL);
702
703 port = *nx_port;
704
705 /* can't bind reserved ports to client credentials */
706 if (nx_fsw_dom_port_is_reserved(nx, port)) {
707 return EDOM;
708 }
709
710 /*
711 * Allow clients to bind to regular ports (non-reserved);
712 * reserved ports aren't subject to bind/unbind, since
713 * they are used for internal purposes.
714 */
715 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
716 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
717 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
718 ASSERT(first <= last);
719
720 FSW_WLOCK(fsw);
721 if (__improbable(first == last)) {
722 error = ENOMEM;
723 } else if (port != NEXUS_PORT_ANY) {
724 error = nx_port_bind(nx, port, nxb);
725 } else {
726 error = nx_port_find(nx, first, last - 1, &port);
727 ASSERT(error != 0 || (port >= first && port < last));
728 if (error == 0) {
729 error = nx_port_bind(nx, port, nxb);
730 }
731 }
732 FSW_WUNLOCK(fsw);
733
734 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
735 "nx %p \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
736 nx->nx_prov->nxprov_params->nxp_name, (int)port,
737 first, (last - 1), error);
738
739 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
740 if (error == 0) {
741 *nx_port = port;
742 }
743
744 return error;
745 }
746
747 static int
nx_fsw_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)748 nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
749 {
750 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
751 int error;
752
753 FSW_WLOCK(fsw);
754 error = nx_port_unbind(nx, nx_port);
755 FSW_WUNLOCK(fsw);
756
757 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
758 "nx %p \"%s\" nx_port %d (err %d)", SK_KVA(nx),
759 nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);
760
761 return error;
762 }
763
764 static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p)765 nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
766 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
767 struct nxbind *nxb, struct proc *p)
768 {
769 #pragma unused(nxdom_prov)
770 nexus_port_t port = chr->cr_port;
771 int err = 0;
772
773 SK_LOCK_ASSERT_HELD();
774
775 ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
776 nxdom_prov->nxdom_prov_dom->nxdom_type &&
777 nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
778 ASSERT(!(ch->ch_flags & CHANF_HOST));
779 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
780
781 if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
782 err = EDOM;
783 goto done;
784 }
785
786 chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
787 ASSERT(port != NEXUS_PORT_ANY);
788 (void) snprintf(chr->cr_name, sizeof(chr->cr_name),
789 "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
790 chr->cr_ring_set = RING_SET_DEFAULT;
791 err = na_connect(nx, ch, chr, nxb, p);
792
793 done:
794 return err;
795 }
796
797 static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)798 nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
799 struct kern_nexus *nx, struct kern_channel *ch)
800 {
801 #pragma unused(nxdom_prov)
802 SK_LOCK_ASSERT_HELD();
803
804 SK_DF(SK_VERB_FSW, "channel %p -!- nexus %p (%s:\"%s\":%u:%d)",
805 SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name,
806 ch->ch_na->na_name, ch->ch_info->cinfo_nx_port,
807 (int)ch->ch_info->cinfo_ch_ring_id);
808
809 if (ch->ch_flags & CHANF_KERNEL) {
810 na_disconnect_spec(nx, ch);
811 } else {
812 na_disconnect(nx, ch);
813 }
814 }
815
816 static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)817 nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
818 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
819 {
820 #pragma unused(nxdom_prov)
821 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
822
823 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
824 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
825 ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
826
827 /*
828 * Hold the flowswitch lock as writer; this prevents all data path
829 * accesses to the flowswitch, and allows us to mark the rings with
830 * CKRF_DEFUNCT. Unlike some other nexus types, the flowswitch
831 * doesn't utilize kr_{enter,exit} for serialization, at present.
832 */
833 FSW_WLOCK(fsw);
834 na_ch_rings_defunct(ch, p);
835 FSW_WUNLOCK(fsw);
836 }
837
838 static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)839 nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
840 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
841 {
842 #pragma unused(nxdom_prov)
843 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
844 int err = 0;
845
846 if (!locked) {
847 SK_LOCK_ASSERT_NOTHELD();
848 SK_LOCK();
849 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
850 } else {
851 SK_LOCK_ASSERT_HELD();
852 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
853 }
854
855 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
856 ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
857 ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);
858
859 err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));
860
861 if (err == 0) {
862 na_defunct(nx, ch, ch->ch_na, locked);
863 }
864
865 SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d) err %d",
866 ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
867 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
868 ch->ch_info->cinfo_nx_port,
869 (int)ch->ch_info->cinfo_ch_ring_id, err);
870
871 if (!locked) {
872 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
873 SK_UNLOCK();
874 } else {
875 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
876 SK_LOCK_ASSERT_HELD();
877 }
878 }
879
#if SK_LOG
/*
 * Debug-log the channel request seen by nx_fsw_na_find().
 *
 * The trailing " (skipped)" marker tells whether nx_fsw_na_find() is
 * going to ignore the request because its name lacks the flow switch
 * prefix.  It therefore has to apply the very same test: compare only
 * the sizeof(NX_FSW_NAME) - 1 characters of the prefix, leaving out the
 * terminating NUL.  Including the NUL in the length would flag names
 * such as "<NX_FSW_NAME>_<id>:<port>" as skipped even though they are
 * processed.
 *
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%x pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create,
	    (strlcmp(chr->cr_name, NX_FSW_NAME,
	    sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */
897
898 /*
899 * Try to get a reference to a Nexus adapter attached to a flow switch.
900 * If the adapter is found (or is created), this function returns 0, a
901 * non NULL pointer is returned into *na, and the caller holds a
902 * reference to the adapter.
903 * If an adapter is not found, then no reference is grabbed and the
904 * function returns an error code, or 0 if there is just a flow switch prefix
905 * mismatch. Therefore the caller holds a reference when
906 * (*na != NULL && return == 0).
907 */
908 int
nx_fsw_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** na,boolean_t create)909 nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
910 struct chreq *chr, struct nxbind *nxb, struct proc *p,
911 struct nexus_adapter **na, boolean_t create)
912 {
913 struct nexus_vp_adapter *__single vpna = NULL;
914 char *cr_name = chr->cr_name;
915 struct nx_flowswitch *fsw;
916 int error = 0;
917
918 SK_LOCK_ASSERT_HELD();
919 *na = NULL; /* default return value */
920
921 #if SK_LOG
922 if (__improbable(sk_verbose != 0)) {
923 nx_fsw_na_find_log(chr, create);
924 }
925 #endif /* SK_LOG */
926
927 /* first try to see if this is a flow switch port. */
928 if (strlcmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
929 return 0; /* no error, but no flow switch prefix */
930 }
931 ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
932 fsw = NX_FSW_PRIVATE(nx);
933 ASSERT(fsw != NULL);
934
935 if (!create) {
936 return ENXIO;
937 }
938
939 /*
940 * The flowswitch VP is only attachable from a user channel so none of
941 * these flags should be set.
942 */
943 ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
944 error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
945 ASSERT(vpna == NULL || error == 0);
946
947 if (error == 0) {
948 /* use reference held by nx_fsw_attach_vp above */
949 *na = &vpna->vpna_up;
950 SK_DF(SK_VERB_FSW,
951 "vpna \"%s\" (%p) refs %u to fsw \"%s\" nx_port %d",
952 (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
953 cr_name, (int)vpna->vpna_nx_port);
954 }
955
956 return error;
957 }
958
959 int
nx_fsw_netagent_add(struct kern_nexus * nx)960 nx_fsw_netagent_add(struct kern_nexus *nx)
961 {
962 return fsw_netagent_add_remove(nx, TRUE);
963 }
964
965 int
nx_fsw_netagent_remove(struct kern_nexus * nx)966 nx_fsw_netagent_remove(struct kern_nexus *nx)
967 {
968 return fsw_netagent_add_remove(nx, FALSE);
969 }
970
/*
 * Thin wrapper around fsw_netagent_update() for the given nexus.
 */
void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
	return;
}
976