1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54
55 /*
56 * This module implements the flow switch for Skywalk
57 *
58 * --- FLOW SWITCH ---
59 *
60 * For each switch, a lock protects deletion of ports. When configuring
61 * or deleting a new port, the lock is acquired in exclusive mode (after
62 * holding SK_LOCK). When forwarding, the lock is acquired in shared
63 * mode (without SK_LOCK). The lock is held throughout the entire
64 * forwarding cycle, during which the thread may incur in a page fault.
65 * Hence it is important that sleepable shared locks are used.
66 *
67 * On the rx ring, the per-port lock is grabbed initially to reserve
68 * a number of slot in the ring, then the lock is released, packets are
69 * copied from source to destination, and then the lock is acquired again
70 * and the receive ring is updated. (A similar thing is done on the tx
71 * ring for NIC and host stack ports attached to the switch)
72 *
73 * When a netif is attached to a flowswitch, two kernel channels are opened:
74 * The device and host channels. The device channel provides the device
75 * datapath. The host channel is not used in the datapath. It is there
76 * only for providing some callbacks for activating the hostna (e.g.
77 * intercepting host packets).
78 */
79
80 #include <net/bpf.h>
81 #include <netinet/tcp_seq.h>
82 #include <skywalk/os_skywalk_private.h>
83 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
84 #include <skywalk/nexus/flowswitch/fsw_var.h>
85 #include <skywalk/nexus/upipe/nx_user_pipe.h>
86 #include <skywalk/nexus/netif/nx_netif.h>
87 #include <skywalk/nexus/nexus_var.h>
88 #include <sys/protosw.h>
89 #include <sys/domain.h>
90
/* sysctl subtree: kern.skywalk.flowswitch */
SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");

/* Nexus domain callbacks; installed in nx_flowswitch_dom_s below. */
static void nx_fsw_dom_init(struct nxdom *);
static void nx_fsw_dom_terminate(struct nxdom *);
static void nx_fsw_dom_fini(struct nxdom *);
static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

/* Domain provider callbacks; installed in nx_fsw_prov_s below. */
static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
    const struct nxprov_params *, struct nxprov_adjusted_params *);
static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
    const uint32_t, const struct nxprov_params *, struct nxprov_params *,
    struct skmem_region_params[SKMEM_REGIONS]);
static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nexus_adapter *);
static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
    kauth_cred_t);
static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
    struct nexus_mib_filter *, void *, size_t, struct proc *);
127
/*
 * Flowswitch nexus domain descriptor: the {default, min, max} bounds
 * for every tunable provider parameter, plus the domain-level callbacks
 * used to manage ports and channel connections.
 */
struct nxdom nx_flowswitch_dom_s = {
	.nxdom_prov_head =
	    STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
	.nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
	.nxdom_md_type = NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
	.nxdom_name = "flowswitch",
	/* parameter bounds: each is a {default, min, max} triple */
	.nxdom_ports = {
		.nb_def = NX_FSW_VP_MAX,
		.nb_min = NX_FSW_VP_MIN,
		.nb_max = NX_FSW_VP_MAX,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_FSW_TXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_FSW_RXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_FSW_BUFSIZE,
		.nb_min = NX_FSW_MINBUFSIZE,
		.nb_max = NX_FSW_MAXBUFSIZE,
	},
	.nxdom_meta_size = {
		.nb_def = NX_FSW_UMD_SIZE,
		.nb_min = NX_FSW_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
	    NXPCAP_USER_CHANNEL),
	},
	/* flowswitch does not use queue mapping */
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_INVALID,
		.nb_min = NEXUS_QMAP_TYPE_INVALID,
		.nb_max = NEXUS_QMAP_TYPE_INVALID,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	/* domain lifecycle and port/channel management callbacks */
	.nxdom_init = nx_fsw_dom_init,
	.nxdom_terminate = nx_fsw_dom_terminate,
	.nxdom_fini = nx_fsw_dom_fini,
	.nxdom_connect = nx_fsw_dom_connect,
	.nxdom_find_port = nx_fsw_dom_find_port,
	.nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
	.nxdom_bind_port = nx_fsw_dom_bind_port,
	.nxdom_unbind_port = nx_fsw_dom_unbind_port,
	.nxdom_disconnect = nx_fsw_dom_disconnect,
	.nxdom_defunct = nx_fsw_dom_defunct,
	.nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
};
218
/*
 * Default domain provider for the flowswitch nexus type; registered
 * with the domain in nx_fsw_dom_init().
 */
struct kern_nexus_domain_provider nx_fsw_prov_s = {
	.nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
	.nxdom_prov_flags = NXDOMPROVF_DEFAULT,
	.nxdom_prov_cb = {
		.dp_cb_init = nx_fsw_prov_init,
		.dp_cb_fini = nx_fsw_prov_fini,
		.dp_cb_params = nx_fsw_prov_params,
		.dp_cb_mem_new = nx_fsw_prov_mem_new,
		.dp_cb_config = nx_fsw_prov_config,
		.dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
		.dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
		.dp_cb_nx_mem_info = NULL, /* not supported */
		.dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
		.dp_cb_nx_stop = NULL,
	},
};
235
236
237 static void
nx_fsw_dom_init(struct nxdom * nxdom)238 nx_fsw_dom_init(struct nxdom *nxdom)
239 {
240 SK_LOCK_ASSERT_HELD();
241 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
242
243 /* Generic initialization */
244 fsw_init();
245 fsw_dp_init();
246
247 (void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
248 }
249
250 static void
nx_fsw_dom_terminate(struct nxdom * nxdom)251 nx_fsw_dom_terminate(struct nxdom *nxdom)
252 {
253 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
254
255 SK_LOCK_ASSERT_HELD();
256
257 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
258 nxdom_prov_link, tnxdp) {
259 (void) nxdom_prov_del(nxdom_prov);
260 }
261
262 fsw_dp_uninit();
263
264 /* Generic uninitialization */
265 fsw_uninit();
266 }
267
/* Domain fini: nothing to tear down for the flowswitch domain. */
static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}
273
274 static int
nx_fsw_prov_init(struct kern_nexus_domain_provider * nxdom_prov)275 nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
276 {
277 #pragma unused(nxdom_prov)
278 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
279 return 0;
280 }
281
/*
 * Adjust the caller-supplied provider parameters to what the
 * flowswitch actually needs: payload metadata, stats/advisory region
 * sizes, capability bits, alloc/free ring geometry, buffer segment
 * size, and the buffer budget for multi-buflet configurations.
 */
static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
#pragma unused(nxdom_prov, nxp)
	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);

	/* flowswitch user ports see payload-style packet metadata */
	*(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
	*(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
	VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
	*(adj->adj_flowadv_max) = sk_max_flows;
	*(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
	/* user packet pool is mandatory; partial checksum is tunable */
	*(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
	if (sk_cksum_tx != 0) {
		*(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
	}
	/*
	 * Two alloc/free rings when multi-buflet packets are possible
	 * and user buflet allocation is enabled; one ring otherwise.
	 */
	*(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
	    ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
	    2 : 1;
	*(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
	    NX_FSW_AFRINGSIZE;

	/* pick a buffer segment size if the caller left it unset */
	if (adj->adj_buf_srp->srp_r_seg_size == 0) {
		adj->adj_buf_srp->srp_r_seg_size = skmem_usr_buf_seg_size;
	}
	/* on non-memory-constrained devices, enforce a larger floor */
	if (!SKMEM_MEM_CONSTRAINED_DEVICE &&
	    (adj->adj_buf_srp->srp_r_seg_size < NX_FSW_BUF_SEG_SIZE)) {
		adj->adj_buf_srp->srp_r_seg_size = NX_FSW_BUF_SEG_SIZE;
	}

	/* enable magazines layer for metadata */
	*(adj->adj_md_magazines) = TRUE;

	if (*(adj->adj_max_frags) > 1) {
		uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE ?
		    NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
		uint32_t magazine_max_objs;

		/* sk_fsw_max_bufs (tunable) overrides the built-in cap */
		*(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
		    sk_fsw_max_bufs : fsw_maxbufs;

		/*
		 * Given that packet objects are the ones cached, use the
		 * metadata size to determine the extra amount of objects
		 * at magazine layer.
		 */
		magazine_max_objs = skmem_cache_magazine_max(
			NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
			METADATA_PREAMBLE_SZ);

		/*
		 * Adjust the max buffers to account for the increase
		 * associated with per-CPU caching.
		 */
		if (skmem_allow_magazines() &&
		    magazine_max_objs < *(adj->adj_max_buffers)) {
			*(adj->adj_max_buffers) -= magazine_max_objs;
		}
	}
	return 0;
}
344
345 static int
nx_fsw_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS])346 nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
347 const uint32_t req, const struct nxprov_params *nxp0,
348 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS])
349 {
350 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
351
352 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
353 nxdom, nxdom, nxdom, nx_fsw_prov_params_adjust);
354 }
355
356 static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)357 nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
358 struct kern_nexus *nx, struct nexus_adapter *na)
359 {
360 #pragma unused(nxdom_prov)
361 int err = 0;
362 struct skmem_region_params srp_tmp[SKMEM_REGIONS];
363 struct skmem_region_params *srp = NX_PROV(nx)->nxprov_region_params;
364
365 SK_DF(SK_VERB_FSW,
366 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
367 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
368 SK_KVA(na));
369
370 ASSERT(na->na_type == NA_FLOWSWITCH_VP);
371 ASSERT(na->na_arena == NULL);
372 ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
373
374 srp = srp_tmp;
375 memcpy(srp, NX_PROV(nx)->nxprov_region_params, sizeof(srp_tmp));
376 srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
377 srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
378 if ((na->na_flags & NAF_LOW_LATENCY) != 0) {
379 srp[SKMEM_REGION_KMD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
380 srp[SKMEM_REGION_UMD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
381 srp[SKMEM_REGION_BUF].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
382 srp[SKMEM_REGION_RXFKSD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
383 srp[SKMEM_REGION_RXFUSD].srp_cflags |= SKMEM_REGION_CR_PERSISTENT;
384 }
385
386 /*
387 * Each port in the flow switch is isolated from one another;
388 * use NULL for the packet buffer pool references to indicate
389 * this, since otherwise we'd be sharing the same pp for the
390 * entire switch (maybe for a future, special use case?)
391 *
392 * This means that clients calling kern_nexus_get_pbufpool()
393 * will get NULL, but this is fine based on current design
394 * of providing port isolation, and also since we don't expose
395 * the flow switch to external kernel clients.
396 */
397 na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
398 !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
399 ASSERT(na->na_arena != NULL || err != 0);
400
401 return err;
402 }
403
/*
 * Handle a configuration request against a flowswitch nexus.
 *
 * Requires the flowswitch entitlement.  The request payload is copied
 * in from userspace via a stack sockopt, dispatched to fsw_ctl() by
 * command, and (for attach/detach and flow add/del) copied back out.
 * Returns 0 on success or an errno; always logs the outcome.
 */
static int
nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	/* proceed only if the client possesses flow switch entitlement */
	if ((err = skywalk_priv_check_cred(p, cred,
	    PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
		goto done;
	}

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	/* avoid _MALLOCing at the cost of this ugly switch block */
	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		/* attach/detach a netif to/from this flowswitch */
		struct nx_spec_req nsr;

		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		if (p != kernproc) {
			/* user requests may only carry public flag bits */
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}

	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* the user-visible request must not carry kernel-only fields */
		_CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
		    offsetof(struct nx_flow_req, _nfr_common_field_end));
		struct nx_flow_req nfr;

		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
		if (err != 0) {
			goto done;
		}

		/* XXX: [email protected] -- can this copyout fail? */
		(void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}

	case NXCFG_CMD_NETEM: {
		/* configure network emulation parameters; nothing copied out */
		struct if_netem_params inp;

		bzero(&inp, sizeof(inp));
		err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
		if (err != 0) {
			goto done;
		}
		err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
		if (err != 0) {
			goto done;
		}
		break;
	}

	default:
		err = EINVAL;
		goto done;
	}

done:
	SK_DF(err ? SK_VERB_ERROR: SK_VERB_FSW,
	    "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
512
513 static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)514 nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
515 {
516 #pragma unused(nxdom_prov)
517 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
518 }
519
520 static int
nx_fsw_prov_nx_ctor(struct kern_nexus * nx)521 nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
522 {
523 struct nx_flowswitch *fsw;
524
525 SK_LOCK_ASSERT_HELD();
526
527 ASSERT(nx->nx_arg == NULL);
528
529 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
530
531 fsw = fsw_alloc(Z_WAITOK);
532 nx->nx_arg = fsw;
533 fsw->fsw_nx = nx;
534 fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
535 fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;
536
537 FSW_WLOCK(fsw);
538
539 fsw_dp_ctor(fsw);
540
541 FSW_WUNLOCK(fsw);
542
543 SK_D("create new fsw 0x%llx for nexus 0x%llx",
544 SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));
545
546 return 0;
547 }
548
549 static void
nx_fsw_prov_nx_dtor(struct kern_nexus * nx)550 nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
551 {
552 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
553 int err;
554
555 SK_LOCK_ASSERT_HELD();
556
557 SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
558 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));
559
560 err = fsw_ctl_detach(nx, current_proc(), NULL);
561 ASSERT(err == 0); /* this cannot fail */
562 ASSERT(fsw->fsw_dev_ch == NULL);
563 ASSERT(fsw->fsw_host_ch == NULL);
564
565 SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
566 fsw_free(fsw);
567 nx->nx_arg = NULL;
568 }
569
570 static size_t
nx_fsw_prov_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * out,size_t len,struct proc * p)571 nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
572 void *out, size_t len, struct proc *p)
573 {
574 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
575
576 /* this check doesn't require holding fsw_lock */
577 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
578 (uuid_compare(filter->nmf_nx_uuid,
579 fsw->fsw_nx->nx_uuid)) != 0) {
580 return 0;
581 }
582
583 /* intercept NXMIB_FSW_STATS here since it's for flowswitch */
584 FSW_RLOCK(fsw);
585 len = fsw_mib_get(fsw, filter, out, len, p);
586 FSW_UNLOCK(fsw);
587
588 return len;
589 }
590
591 boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus * nx,nexus_port_t nx_port)592 nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
593 {
594 #pragma unused(nx)
595 return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
596 }
597
598 static int
nx_fsw_dom_find_port(struct kern_nexus * nx,boolean_t rsvd,nexus_port_t * nx_port)599 nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
600 nexus_port_t *nx_port)
601 {
602 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
603 nexus_port_t first, last, port;
604 int error;
605
606 ASSERT(nx_port != NULL);
607
608 port = *nx_port;
609 ASSERT(port == NEXUS_PORT_ANY);
610
611 if (rsvd) {
612 first = 0;
613 last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
614 } else {
615 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
616 last = NXDOM_MAX(NX_DOM(nx), ports);
617 }
618 ASSERT(first <= last);
619
620 FSW_WLOCK(fsw);
621 if (__improbable(first == last)) {
622 error = ENOSPC;
623 } else {
624 error = nx_port_find(nx, first, last - 1, &port);
625 ASSERT(error != 0 || (port >= first && port < last));
626 }
627 FSW_WUNLOCK(fsw);
628
629 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
630 "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
631 nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
632 (int)port, first, (last - 1), error);
633
634 if (error == 0) {
635 *nx_port = port;
636 }
637
638 return error;
639 }
640
641 static int
nx_fsw_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)642 nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
643 struct nxbind *nxb, void *info)
644 {
645 #pragma unused(info)
646 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
647 nexus_port_t first, last, port;
648 int error;
649
650 ASSERT(nx_port != NULL);
651 ASSERT(nxb != NULL);
652
653 port = *nx_port;
654
655 /* can't bind reserved ports to client credentials */
656 if (nx_fsw_dom_port_is_reserved(nx, port)) {
657 return EDOM;
658 }
659
660 /*
661 * Allow clients to bind to regular ports (non-reserved);
662 * reserved ports aren't subject to bind/unbind, since
663 * they are used for internal purposes.
664 */
665 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
666 last = NXDOM_MAX(NX_DOM(nx), ports);
667 ASSERT(first <= last);
668
669 FSW_WLOCK(fsw);
670 if (__improbable(first == last)) {
671 error = ENOSPC;
672 } else if (port != NEXUS_PORT_ANY) {
673 error = nx_port_bind(nx, port, nxb);
674 } else {
675 error = nx_port_find(nx, first, last - 1, &port);
676 ASSERT(error != 0 || (port >= first && port < last));
677 if (error == 0) {
678 error = nx_port_bind(nx, port, nxb);
679 }
680 }
681 FSW_WUNLOCK(fsw);
682
683 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
684 "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
685 nx->nx_prov->nxprov_params->nxp_name, (int)port,
686 first, (last - 1), error);
687
688 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
689 if (error == 0) {
690 *nx_port = port;
691 }
692
693 return error;
694 }
695
696 static int
nx_fsw_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)697 nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
698 {
699 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
700 int error;
701
702 FSW_WLOCK(fsw);
703 error = nx_port_unbind(nx, nx_port);
704 FSW_WUNLOCK(fsw);
705
706 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
707 "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
708 nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);
709
710 return error;
711 }
712
/*
 * Connect a channel to this flowswitch nexus.
 *
 * Kernel (special) channels are named by spec UUID and connected via
 * na_connect_spec(); user channels are named by nexus port and
 * connected via na_connect().  Event rings are unsupported, and
 * CHMODE_HOST is restricted to kernel channels.  Returns 0 or errno.
 */
static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	nexus_port_t port = chr->cr_port;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
	ASSERT(!(ch->ch_flags & CHANF_HOST));

	/* a specific port request must fall within the domain's range */
	if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	}

	if (chr->cr_mode & CHMODE_EVENT_RING) {
		SK_ERR("event ring is not supported for flowswitch");
		err = ENOTSUP;
		goto done;
	}

	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
	if (ch->ch_flags & CHANF_KERNEL) {
		/* kernel (special) channel: name keyed by spec UUID */
		uuid_string_t uuidstr;
		ASSERT(!uuid_is_null(chr->cr_spec_uuid));
		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
		    "%s_%llu:%s", NX_FSW_NAME, nx->nx_id,
		    sk_uuid_unparse(chr->cr_spec_uuid, uuidstr));
		chr->cr_ring_set = RING_SET_DEFAULT;
		if (chr->cr_mode & CHMODE_HOST) {
			atomic_bitset_32(&ch->ch_flags, CHANF_HOST);
		}
		err = na_connect_spec(nx, ch, chr, p);
	} else {
		/* user channel: name keyed by nexus port number */
		ASSERT(port != NEXUS_PORT_ANY);
		if (chr->cr_mode & CHMODE_HOST) {
			/* not allowed unless kernel (special) channel */
			err = EINVAL;
			goto done;
		}
		(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
		    "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
		chr->cr_ring_set = RING_SET_DEFAULT;
		err = na_connect(nx, ch, chr, ch0, nxb, p);
	}

done:
	return err;
}
768
769 static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)770 nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
771 struct kern_nexus *nx, struct kern_channel *ch)
772 {
773 #pragma unused(nxdom_prov)
774 SK_LOCK_ASSERT_HELD();
775
776 SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
777 SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
778 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
779
780 if (ch->ch_flags & CHANF_KERNEL) {
781 na_disconnect_spec(nx, ch);
782 } else {
783 na_disconnect(nx, ch);
784 }
785 }
786
787 static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)788 nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
789 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
790 {
791 #pragma unused(nxdom_prov)
792 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
793
794 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
795 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
796 ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
797
798 /*
799 * Hold the flowswitch lock as writer; this prevents all data path
800 * accesses to the flowswitch, and allows us to mark the rings with
801 * CKRF_DEFUNCT. Unlike some other nexus types, the flowswitch
802 * doesn't utilize kr_{enter,exit} for serialization, at present.
803 */
804 FSW_WLOCK(fsw);
805 na_ch_rings_defunct(ch, p);
806 FSW_WUNLOCK(fsw);
807 }
808
/*
 * Finalize the defunct of a user VP channel: detach the port's
 * flowswitch state and then defunct the adapter.  `locked' indicates
 * whether the caller already holds SK_LOCK (and the channel lock);
 * otherwise SK_LOCK is taken and dropped here.
 */
static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	/* acquire SK_LOCK ourselves unless the caller already holds it */
	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
	ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);

	/* tear down the port's flowswitch state first */
	err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));

	/* only defunct the adapter if the port teardown succeeded */
	if (err == 0) {
		na_defunct(nx, ch, ch->ch_na, locked);
	}

	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port,
	    (int)ch->ch_info->cinfo_ch_ring_id, err);

	/* restore the lock state the caller expects on return */
	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}
850
#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;

	/* dump the channel request; note when the name prefix won't match */
	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
	    chr->cr_endpoint, create, (strncmp(chr->cr_name, NX_FSW_NAME,
	    sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */
868
869 /*
870 * Try to get a reference to a Nexus adapter attached to a flow switch.
871 * If the adapter is found (or is created), this function returns 0, a
872 * non NULL pointer is returned into *na, and the caller holds a
873 * reference to the adapter.
874 * If an adapter is not found, then no reference is grabbed and the
875 * function returns an error code, or 0 if there is just a flow switch prefix
876 * mismatch. Therefore the caller holds a reference when
877 * (*na != NULL && return == 0).
878 */
879 int
nx_fsw_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** na,boolean_t create)880 nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
881 struct chreq *chr, struct nxbind *nxb, struct proc *p,
882 struct nexus_adapter **na, boolean_t create)
883 {
884 #pragma unused(ch)
885 struct nexus_vp_adapter *vpna = NULL;
886 char *cr_name = chr->cr_name;
887 struct nx_flowswitch *fsw;
888 int error = 0;
889
890 SK_LOCK_ASSERT_HELD();
891 *na = NULL; /* default return value */
892
893 #if SK_LOG
894 if (__improbable(sk_verbose != 0)) {
895 nx_fsw_na_find_log(chr, create);
896 }
897 #endif /* SK_LOG */
898
899 /* first try to see if this is a flow switch port. */
900 if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
901 return 0; /* no error, but no flow switch prefix */
902 }
903 ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
904 fsw = NX_FSW_PRIVATE(nx);
905 ASSERT(fsw != NULL);
906
907 if (!create) {
908 return ENXIO;
909 }
910
911 /*
912 * The flowswitch VP is only attachable from a user channel so none of
913 * these flags should be set.
914 */
915 ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
916 error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
917 ASSERT(vpna == NULL || error == 0);
918
919 if (error == 0) {
920 /* use reference held by nx_fsw_attach_vp above */
921 *na = &vpna->vpna_up;
922 SK_DF(SK_VERB_FSW,
923 "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
924 (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
925 cr_name, (int)vpna->vpna_nx_port);
926 }
927
928 return error;
929 }
930
931 int
nx_fsw_netagent_add(struct kern_nexus * nx)932 nx_fsw_netagent_add(struct kern_nexus *nx)
933 {
934 return fsw_netagent_add_remove(nx, TRUE);
935 }
936
937 int
nx_fsw_netagent_remove(struct kern_nexus * nx)938 nx_fsw_netagent_remove(struct kern_nexus *nx)
939 {
940 return fsw_netagent_add_remove(nx, FALSE);
941 }
942
/* Push updated state for this nexus to the netagent subsystem. */
void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
}
948