/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the flow switch for Skywalk
 *
 * --- FLOW SWITCH ---
 *
 * For each switch, a lock protects deletion of ports.  When configuring
 * a new port or deleting an existing one, the lock is acquired in
 * exclusive mode (after holding SK_LOCK).  When forwarding, the lock is
 * acquired in shared mode (without SK_LOCK).  The lock is held throughout
 * the entire forwarding cycle, during which the thread may incur a page
 * fault.  Hence it is important that sleepable shared locks are used.
 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
 * copied from source to destination, and then the lock is acquired again
 * and the receive ring is updated.  (A similar thing is done on the tx
 * ring for NIC and host stack ports attached to the switch.)
 *
 * When a netif is attached to a flowswitch, two kernel channels are
 * opened: the device and host channels.  The device channel provides the
 * device datapath.  The host channel is not used in the datapath; it is
 * there only to provide some callbacks for activating the hostna (e.g.
 * intercepting host packets).
 */
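
/*
 * A minimal sketch of the rx-ring pattern described above, using
 * hypothetical helpers ring_lock()/ring_unlock(), reserve_slots() and
 * publish_slots() (none of these names are the actual implementation):
 *
 *	ring_lock(rxring);
 *	n = reserve_slots(rxring, want);  // claim slots under the lock
 *	ring_unlock(rxring);
 *	copy_packets(src, rxring, n);     // copying may fault; lock dropped
 *	ring_lock(rxring);
 *	publish_slots(rxring, n);         // update the receive ring
 *	ring_unlock(rxring);
 */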

#include <net/bpf.h>
#include <netinet/tcp_seq.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/nexus_var.h>
#include <sys/protosw.h>
#include <sys/domain.h>

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");

static void nx_fsw_dom_init(struct nxdom *);
static void nx_fsw_dom_terminate(struct nxdom *);
static void nx_fsw_dom_fini(struct nxdom *);
static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
    const struct nxprov_params *, struct nxprov_adjusted_params *);
static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
    const uint32_t, const struct nxprov_params *, struct nxprov_params *,
    struct skmem_region_params[SKMEM_REGIONS], uint32_t);
static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nexus_adapter *);
static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
    kauth_cred_t);
static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
    struct nexus_mib_filter *, void *, size_t, struct proc *);

struct nxdom nx_flowswitch_dom_s = {
    .nxdom_prov_head =
        STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
    .nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
    .nxdom_md_type = NEXUS_META_TYPE_PACKET,
    .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
    .nxdom_name = "flowswitch",
    .nxdom_ports = {
        .nb_def = NX_FSW_VP_MAX,
        .nb_min = NX_FSW_VP_MIN,
        .nb_max = NX_FSW_VP_MAX,
    },
    .nxdom_tx_rings = {
        .nb_def = 1,
        .nb_min = 1,
        .nb_max = NX_FSW_MAXRINGS,
    },
    .nxdom_rx_rings = {
        .nb_def = 1,
        .nb_min = 1,
        .nb_max = NX_FSW_MAXRINGS,
    },
    .nxdom_tx_slots = {
        .nb_def = NX_FSW_TXRINGSIZE,
        .nb_min = NX_FSW_MINSLOTS,
        .nb_max = NX_FSW_MAXSLOTS,
    },
    .nxdom_rx_slots = {
        .nb_def = NX_FSW_RXRINGSIZE,
        .nb_min = NX_FSW_MINSLOTS,
        .nb_max = NX_FSW_MAXSLOTS,
    },
    .nxdom_buf_size = {
        .nb_def = NX_FSW_BUFSIZE,
        .nb_min = NX_FSW_MINBUFSIZE,
        .nb_max = NX_FSW_MAXBUFSIZE,
    },
    .nxdom_large_buf_size = {
        .nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
        .nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
        .nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
    },
    .nxdom_meta_size = {
        .nb_def = NX_FSW_UMD_SIZE,
        .nb_min = NX_FSW_UMD_SIZE,
        .nb_max = NX_METADATA_USR_MAX_SZ,
    },
    .nxdom_stats_size = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_STATS_MAX_SZ,
    },
    .nxdom_pipes = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_UPIPE_MAXPIPES,
    },
    .nxdom_flowadv_max = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_FLOWADV_MAX,
    },
    .nxdom_nexusadv_size = {
        .nb_def = 0,
        .nb_min = 0,
        .nb_max = NX_NEXUSADV_MAX_SZ,
    },
    .nxdom_capabilities = {
        .nb_def = NXPCAP_USER_CHANNEL,
        .nb_min = 0,
        .nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
            NXPCAP_USER_CHANNEL),
    },
    .nxdom_qmap = {
        .nb_def = NEXUS_QMAP_TYPE_INVALID,
        .nb_min = NEXUS_QMAP_TYPE_INVALID,
        .nb_max = NEXUS_QMAP_TYPE_INVALID,
    },
    .nxdom_max_frags = {
        .nb_def = NX_PBUF_FRAGS_DEFAULT,
        .nb_min = NX_PBUF_FRAGS_MIN,
        .nb_max = NX_PBUF_FRAGS_MAX,
    },
    .nxdom_init = nx_fsw_dom_init,
    .nxdom_terminate = nx_fsw_dom_terminate,
    .nxdom_fini = nx_fsw_dom_fini,
    .nxdom_connect = nx_fsw_dom_connect,
    .nxdom_find_port = nx_fsw_dom_find_port,
    .nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
    .nxdom_bind_port = nx_fsw_dom_bind_port,
    .nxdom_unbind_port = nx_fsw_dom_unbind_port,
    .nxdom_disconnect = nx_fsw_dom_disconnect,
    .nxdom_defunct = nx_fsw_dom_defunct,
    .nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
};
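
/*
 * nx_fsw_prov_s is the built-in domain provider for the flowswitch; it
 * is registered via nxdom_prov_add() from nx_fsw_dom_init() below and
 * removed again in nx_fsw_dom_terminate().
 */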

struct kern_nexus_domain_provider nx_fsw_prov_s = {
    .nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
    .nxdom_prov_flags = NXDOMPROVF_DEFAULT,
    .nxdom_prov_cb = {
        .dp_cb_init = nx_fsw_prov_init,
        .dp_cb_fini = nx_fsw_prov_fini,
        .dp_cb_params = nx_fsw_prov_params,
        .dp_cb_mem_new = nx_fsw_prov_mem_new,
        .dp_cb_config = nx_fsw_prov_config,
        .dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
        .dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
        .dp_cb_nx_mem_info = NULL,      /* not supported */
        .dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
        .dp_cb_nx_stop = NULL,
    },
};


static void
nx_fsw_dom_init(struct nxdom *nxdom)
{
    SK_LOCK_ASSERT_HELD();
    ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

    /* Generic initialization */
    fsw_init();
    fsw_dp_init();

    (void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
}

static void
nx_fsw_dom_terminate(struct nxdom *nxdom)
{
    struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

    SK_LOCK_ASSERT_HELD();

    STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
        nxdom_prov_link, tnxdp) {
        (void) nxdom_prov_del(nxdom_prov);
    }

    fsw_dp_uninit();

    /* Generic uninitialization */
    fsw_uninit();
}
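
/*
 * Note that teardown mirrors nx_fsw_dom_init() in reverse: the registered
 * domain providers (including nx_fsw_prov_s) are removed first, then the
 * data-path and generic state set up by fsw_dp_init()/fsw_init() are
 * undone by fsw_dp_uninit()/fsw_uninit().
 */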

static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}

static int
nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
    SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
    return 0;
}

static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
#pragma unused(nxdom_prov, nxp)
    _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
    _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);

    *(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
    *(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
    VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
    *(adj->adj_flowadv_max) = sk_max_flows;
    *(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
    *(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
    if (sk_cksum_tx != 0) {
        *(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
    }
    *(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
        ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
        2 : 1;
    *(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
        NX_FSW_AFRINGSIZE;

    if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
        (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
        *(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
    }

    if (*(adj->adj_max_frags) > 1) {
        uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
            NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
        uint32_t magazine_max_objs;

        *(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
            sk_fsw_max_bufs : fsw_maxbufs;

        /*
         * Given that packet objects are the ones cached, use the
         * metadata size to determine the extra amount of objects
         * at the magazine layer.
         */
        magazine_max_objs = skmem_cache_magazine_max(
            NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
            METADATA_PREAMBLE_SZ);

        /*
         * Adjust the max buffers to account for the increase
         * associated with per-CPU caching.
         */
        if (skmem_allow_magazines() &&
            magazine_max_objs < *(adj->adj_max_buffers)) {
            *(adj->adj_max_buffers) -= magazine_max_objs;
        }
    }
    if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
        (*(adj->adj_max_frags) <= 1)) {
        *(adj->adj_large_buf_size) = 0;
    }
    return 0;
}
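
/*
 * Worked example of the magazine adjustment above (hypothetical numbers,
 * for illustration only): suppose adj_max_frags is 4, the device is not
 * memory constrained, and sk_fsw_max_bufs is 0, so adj_max_buffers starts
 * out as NX_FSW_MAXBUFFERS.  If skmem_cache_magazine_max() reports that
 * the per-CPU magazine layer may cache up to M objects of size
 * NX_METADATA_PACKET_SZ(4) + METADATA_PREAMBLE_SZ, the cap is lowered to
 * NX_FSW_MAXBUFFERS - M, so that the magazine working set doesn't inflate
 * the effective number of buffers beyond the intended maximum.
 */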

static int
nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
    struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

    /* USD regions need to be writable to support user packet pool */
    srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
    srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;

    return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
        nxdom, nxdom, nxdom, pp_region_config_flags,
        nx_fsw_prov_params_adjust);
}

static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
    int err = 0;
    struct skmem_region_params *srp = NX_PROV(nx)->nxprov_region_params;

    SK_DF(SK_VERB_FSW,
        "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
        NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
        SK_KVA(na));

    ASSERT(na->na_type == NA_FLOWSWITCH_VP);
    ASSERT(na->na_arena == NULL);
    ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
    /*
     * Ports in the flow switch are isolated from one another;
     * use NULL for the packet buffer pool references to indicate
     * this, since otherwise we'd be sharing the same pp for the
     * entire switch (maybe for a future, special use case?)
     *
     * This means that clients calling kern_nexus_get_pbufpool()
     * will get NULL, but this is fine based on the current design
     * of providing port isolation, and also since we don't expose
     * the flow switch to external kernel clients.
     */
    uint32_t pp_flags = NX_USER_CHANNEL_PROV(nx) ?
        0 : SKMEM_PP_FLAG_TRUNCATED_BUF;
    na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, pp_flags,
        &nx->nx_adv, &err);
    ASSERT(na->na_arena != NULL || err != 0);
    return err;
}

static int
nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
    struct sockopt sopt;
    int err = 0;

    SK_LOCK_ASSERT_HELD();

    /* proceed only if the client possesses flow switch entitlement */
    if ((err = skywalk_priv_check_cred(p, cred,
        PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
        goto done;
    }

    if (ncr->nc_req == USER_ADDR_NULL) {
        err = EINVAL;
        goto done;
    }

    /* to make life easier for handling copies */
    bzero(&sopt, sizeof(sopt));
    sopt.sopt_dir = sopt_dir;
    sopt.sopt_val = ncr->nc_req;
    sopt.sopt_valsize = ncr->nc_req_len;
    sopt.sopt_p = p;

    /* avoid _MALLOCing at the cost of this ugly switch block */
    switch (ncr->nc_cmd) {
    case NXCFG_CMD_ATTACH:
    case NXCFG_CMD_DETACH: {
        struct nx_spec_req nsr;

        bzero(&nsr, sizeof(nsr));
        err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
        if (err != 0) {
            goto done;
        }

        /*
         * Null-terminate in case this has an interface name;
         * the union is already large enough for uuid_t.
         */
        nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
        if (p != kernproc) {
            nsr.nsr_flags &= NXSPECREQ_MASK;
        }

        err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
        if (err != 0) {
            goto done;
        }

        /* XXX: [email protected] -- can this copyout fail? */
        (void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
        break;
    }

    case NXCFG_CMD_FLOW_ADD:
    case NXCFG_CMD_FLOW_DEL: {
        _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
            offsetof(struct nx_flow_req, _nfr_common_field_end));
        struct nx_flow_req nfr;

        bzero(&nfr, sizeof(nfr));
        err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
        if (err != 0) {
            goto done;
        }

        err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
        if (err != 0) {
            goto done;
        }

        /* XXX: [email protected] -- can this copyout fail? */
        (void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
        break;
    }

    case NXCFG_CMD_NETEM: {
        struct if_netem_params inp;

        bzero(&inp, sizeof(inp));
        err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
        if (err != 0) {
            goto done;
        }
        err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
        if (err != 0) {
            goto done;
        }
        break;
    }

    default:
        err = EINVAL;
        goto done;
    }

done:
    SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
        "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
        NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
    return err;
}
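
/*
 * All three command groups above share the same shape: copy the request
 * in with sooptcopyin(), let fsw_ctl() do the real work, and, for
 * ATTACH/DETACH and FLOW_ADD/FLOW_DEL, copy the (possibly updated)
 * request back out with sooptcopyout().
 */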

static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
    SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}

static int
nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
{
    struct nx_flowswitch *fsw;

    SK_LOCK_ASSERT_HELD();

    ASSERT(nx->nx_arg == NULL);

    SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);

    fsw = fsw_alloc(Z_WAITOK);
    nx->nx_arg = fsw;
    fsw->fsw_nx = nx;
    fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
    fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;

    FSW_WLOCK(fsw);

    fsw_dp_ctor(fsw);

    FSW_WUNLOCK(fsw);

    SK_D("create new fsw 0x%llx for nexus 0x%llx",
        SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));

    return 0;
}

static void
nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int err;

    SK_LOCK_ASSERT_HELD();

    SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
        NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));

    err = fsw_ctl_detach(nx, current_proc(), NULL);
    ASSERT(err == 0);       /* this cannot fail */
    ASSERT(fsw->fsw_dev_ch == NULL);
    ASSERT(fsw->fsw_host_ch == NULL);

    SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
    fsw_free(fsw);
    nx->nx_arg = NULL;
}

static size_t
nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

    /* this check doesn't require holding fsw_lock */
    if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
        (uuid_compare(filter->nmf_nx_uuid,
        fsw->fsw_nx->nx_uuid)) != 0) {
        return 0;
    }

    /* intercept NXMIB_FSW_STATS here since it's for the flowswitch */
    FSW_RLOCK(fsw);
    len = fsw_mib_get(fsw, filter, out, len, p);
    FSW_UNLOCK(fsw);

    return len;
}

boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
{
#pragma unused(nx)
    return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
}
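
/*
 * Port layout, as encoded by the helpers here: ports in
 * [0, NEXUS_PORT_FLOW_SWITCH_CLIENT) are reserved for internal use,
 * while client (VP) ports occupy
 * [NEXUS_PORT_FLOW_SWITCH_CLIENT, NXDOM_MAX(NX_DOM(nx), ports)).
 * nx_fsw_dom_find_port() below searches one range or the other based
 * on its rsvd argument.
 */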

static int
nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
    nexus_port_t *nx_port)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    nexus_port_t first, last, port;
    int error;

    ASSERT(nx_port != NULL);

    port = *nx_port;
    ASSERT(port == NEXUS_PORT_ANY);

    if (rsvd) {
        first = 0;
        last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
    } else {
        first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
        ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
        last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
    }
    ASSERT(first <= last);

    FSW_WLOCK(fsw);
    if (__improbable(first == last)) {
        error = ENOSPC;
    } else {
        error = nx_port_find(nx, first, last - 1, &port);
        ASSERT(error != 0 || (port >= first && port < last));
    }
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
        (int)port, first, (last - 1), error);

    if (error == 0) {
        *nx_port = port;
    }

    return error;
}

static int
nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
#pragma unused(info)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    nexus_port_t first, last, port;
    int error;

    ASSERT(nx_port != NULL);
    ASSERT(nxb != NULL);

    port = *nx_port;

    /* can't bind reserved ports to client credentials */
    if (nx_fsw_dom_port_is_reserved(nx, port)) {
        return EDOM;
    }

    /*
     * Allow clients to bind to regular ports (non-reserved);
     * reserved ports aren't subject to bind/unbind, since
     * they are used for internal purposes.
     */
    first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
    ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
    last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
    ASSERT(first <= last);

    FSW_WLOCK(fsw);
    if (__improbable(first == last)) {
        error = ENOSPC;
    } else if (port != NEXUS_PORT_ANY) {
        error = nx_port_bind(nx, port, nxb);
    } else {
        error = nx_port_find(nx, first, last - 1, &port);
        ASSERT(error != 0 || (port >= first && port < last));
        if (error == 0) {
            error = nx_port_bind(nx, port, nxb);
        }
    }
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (int)port,
        first, (last - 1), error);

    ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
    if (error == 0) {
        *nx_port = port;
    }

    return error;
}

static int
nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
{
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int error;

    FSW_WLOCK(fsw);
    error = nx_port_unbind(nx, nx_port);
    FSW_WUNLOCK(fsw);

    SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
        "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
        nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);

    return error;
}

static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
    nexus_port_t port = chr->cr_port;
    int err = 0;

    SK_LOCK_ASSERT_HELD();

    ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
        nxdom_prov->nxdom_prov_dom->nxdom_type &&
        nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
    ASSERT(!(ch->ch_flags & CHANF_HOST));

    if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
        err = EDOM;
        goto done;
    }

    chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
    if (ch->ch_flags & CHANF_KERNEL) {
        uuid_string_t uuidstr;
        ASSERT(!uuid_is_null(chr->cr_spec_uuid));
        (void) snprintf(chr->cr_name, sizeof(chr->cr_name),
            "%s_%llu:%s", NX_FSW_NAME, nx->nx_id,
            sk_uuid_unparse(chr->cr_spec_uuid, uuidstr));
        chr->cr_ring_set = RING_SET_DEFAULT;
        if (chr->cr_mode & CHMODE_HOST) {
            atomic_bitset_32(&ch->ch_flags, CHANF_HOST);
        }
        err = na_connect_spec(nx, ch, chr, p);
    } else {
        ASSERT(port != NEXUS_PORT_ANY);
        if (chr->cr_mode & CHMODE_HOST) {
            /* not allowed unless kernel (special) channel */
            err = EINVAL;
            goto done;
        }
        (void) snprintf(chr->cr_name, sizeof(chr->cr_name),
            "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
        chr->cr_ring_set = RING_SET_DEFAULT;
        err = na_connect(nx, ch, chr, ch0, nxb, p);
    }

done:
    return err;
}
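
/*
 * Note the channel naming convention established above: kernel (special)
 * channels are named "<NX_FSW_NAME>_<nexus id>:<spec uuid>", while user
 * channels are named "<NX_FSW_NAME>_<nexus id>:<port>".  nx_fsw_na_find()
 * below relies on the NX_FSW_NAME prefix to recognize flow switch ports.
 */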

static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
    SK_LOCK_ASSERT_HELD();

    SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
        SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
        ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

    if (ch->ch_flags & CHANF_KERNEL) {
        na_disconnect_spec(nx, ch);
    } else {
        na_disconnect(nx, ch);
    }
}

static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

    LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
    ASSERT(!(ch->ch_flags & CHANF_KERNEL));
    ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);

    /*
     * Hold the flowswitch lock as writer; this prevents all data path
     * accesses to the flowswitch, and allows us to mark the rings with
     * CKRF_DEFUNCT.  Unlike some other nexus types, the flowswitch
     * doesn't utilize kr_{enter,exit} for serialization, at present.
     */
    FSW_WLOCK(fsw);
    na_ch_rings_defunct(ch, p);
    FSW_WUNLOCK(fsw);
}

static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
    struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
    int err = 0;

    if (!locked) {
        SK_LOCK_ASSERT_NOTHELD();
        SK_LOCK();
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
    } else {
        SK_LOCK_ASSERT_HELD();
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
    }

    ASSERT(!(ch->ch_flags & CHANF_KERNEL));
    ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
    ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);

    err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));

    if (err == 0) {
        na_defunct(nx, ch, ch->ch_na, locked);
    }

    SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
        ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
        nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
        ch->ch_info->cinfo_nx_port,
        (int)ch->ch_info->cinfo_ch_ring_id, err);

    if (!locked) {
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
        SK_UNLOCK();
    } else {
        LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
        SK_LOCK_ASSERT_HELD();
    }
}
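
/*
 * Defunct is thus two-phase: nx_fsw_dom_defunct() marks the channel's
 * rings defunct under the flowswitch write lock, while the finalize step
 * above detaches the VP adapter from its nexus port via
 * fsw_port_na_defunct() and then calls na_defunct(), taking SK_LOCK
 * itself when the caller hasn't already done so.
 */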

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
    uuid_string_t uuidstr;

    SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
        "ring_id %d ring_set %u ep_type %u:%u create %u%s",
        chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
        (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
        (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
        chr->cr_endpoint, create, (strncmp(chr->cr_name, NX_FSW_NAME,
        sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */

/*
 * Try to get a reference to a nexus adapter attached to a flow switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, no reference is grabbed and the function
 * returns an error code, or 0 if there is merely a flow switch prefix
 * mismatch.  Therefore the caller holds a reference only when
 * (*na != NULL && return == 0).
 */
int
nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **na, boolean_t create)
{
#pragma unused(ch)
    struct nexus_vp_adapter *vpna = NULL;
    char *cr_name = chr->cr_name;
    struct nx_flowswitch *fsw;
    int error = 0;

    SK_LOCK_ASSERT_HELD();
    *na = NULL;     /* default return value */

#if SK_LOG
    if (__improbable(sk_verbose != 0)) {
        nx_fsw_na_find_log(chr, create);
    }
#endif /* SK_LOG */

    /* first try to see if this is a flow switch port */
    if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
        return 0;       /* no error, but no flow switch prefix */
    }
    ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
    fsw = NX_FSW_PRIVATE(nx);
    ASSERT(fsw != NULL);

    if (!create) {
        return ENXIO;
    }

    /*
     * The flowswitch VP is only attachable from a user channel, so none
     * of these flags should be set.
     */
    ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
    error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
    ASSERT(vpna == NULL || error == 0);

    if (error == 0) {
        /* use the reference held by fsw_attach_vp() above */
        *na = &vpna->vpna_up;
        SK_DF(SK_VERB_FSW,
            "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
            (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
            cr_name, (int)vpna->vpna_nx_port);
    }

    return error;
}

int
nx_fsw_netagent_add(struct kern_nexus *nx)
{
    return fsw_netagent_add_remove(nx, TRUE);
}

int
nx_fsw_netagent_remove(struct kern_nexus *nx)
{
    return fsw_netagent_add_remove(nx, FALSE);
}

void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
    fsw_netagent_update(nx);
}
940