1 /*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54
55 /*
56 * This module implements the flow switch for Skywalk
57 *
58 * --- FLOW SWITCH ---
59 *
60 * For each switch, a lock protects deletion of ports. When configuring
61 * or deleting a new port, the lock is acquired in exclusive mode (after
62 * holding SK_LOCK). When forwarding, the lock is acquired in shared
63 * mode (without SK_LOCK). The lock is held throughout the entire
 * forwarding cycle, during which the thread may incur a page fault.
65 * Hence it is important that sleepable shared locks are used.
66 *
67 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
69 * copied from source to destination, and then the lock is acquired again
70 * and the receive ring is updated. (A similar thing is done on the tx
71 * ring for NIC and host stack ports attached to the switch)
72 *
73 * When a netif is attached to a flowswitch, two kernel channels are opened:
74 * The device and host channels. The device channel provides the device
75 * datapath. The host channel is not used in the datapath. It is there
76 * only for providing some callbacks for activating the hostna (e.g.
77 * intercepting host packets).
78 */
79
80 #include <net/bpf.h>
81 #include <netinet/tcp_seq.h>
82 #include <skywalk/os_skywalk_private.h>
83 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
84 #include <skywalk/nexus/flowswitch/fsw_var.h>
85 #include <skywalk/nexus/upipe/nx_user_pipe.h>
86 #include <skywalk/nexus/netif/nx_netif.h>
87 #include <skywalk/nexus/nexus_var.h>
88 #include <sys/protosw.h>
89 #include <sys/domain.h>
90
91 SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
92 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");
93
94 static void nx_fsw_dom_init(struct nxdom *);
95 static void nx_fsw_dom_terminate(struct nxdom *);
96 static void nx_fsw_dom_fini(struct nxdom *);
97 static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
98 static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
99 struct nxbind *, void *);
100 static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
101 static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
102 struct kern_nexus *, struct kern_channel *, struct chreq *,
103 struct kern_channel *, struct nxbind *, struct proc *);
104 static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
105 struct kern_nexus *, struct kern_channel *);
106 static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
107 struct kern_nexus *, struct kern_channel *, struct proc *);
108 static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
109 struct kern_nexus *, struct kern_channel *, boolean_t);
110
111 static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
112 static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
113 const struct nxprov_params *, struct nxprov_adjusted_params *);
114 static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
115 const uint32_t, const struct nxprov_params *, struct nxprov_params *,
116 struct skmem_region_params[SKMEM_REGIONS], uint32_t);
117 static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
118 struct kern_nexus *, struct nexus_adapter *);
119 static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
120 struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
121 kauth_cred_t);
122 static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
123 static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
124 static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
125 static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
126 struct nexus_mib_filter *, void *__sized_by(len)out, size_t len,
127 struct proc *);
128
129 struct nxdom nx_flowswitch_dom_s = {
130 .nxdom_prov_head =
131 STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
132 .nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
133 .nxdom_md_type = NEXUS_META_TYPE_PACKET,
134 .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
135 .nxdom_name = "flowswitch",
136 .nxdom_ports = {
137 .nb_def = NX_FSW_VP_MAX,
138 .nb_min = NX_FSW_VP_MIN,
139 .nb_max = NX_FSW_VP_MAX,
140 },
141 .nxdom_tx_rings = {
142 .nb_def = 1,
143 .nb_min = 1,
144 .nb_max = NX_FSW_MAXRINGS,
145 },
146 .nxdom_rx_rings = {
147 .nb_def = 1,
148 .nb_min = 1,
149 .nb_max = NX_FSW_MAXRINGS,
150 },
151 .nxdom_tx_slots = {
152 .nb_def = NX_FSW_TXRINGSIZE,
153 .nb_min = NX_FSW_MINSLOTS,
154 .nb_max = NX_FSW_MAXSLOTS,
155 },
156 .nxdom_rx_slots = {
157 .nb_def = NX_FSW_RXRINGSIZE,
158 .nb_min = NX_FSW_MINSLOTS,
159 .nb_max = NX_FSW_MAXSLOTS,
160 },
161 .nxdom_buf_size = {
162 .nb_def = NX_FSW_BUFSIZE,
163 .nb_min = NX_FSW_MINBUFSIZE,
164 .nb_max = NX_FSW_MAXBUFSIZE,
165 },
166 .nxdom_large_buf_size = {
167 .nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
168 .nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
169 .nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
170 },
171 .nxdom_meta_size = {
172 .nb_def = NX_FSW_UMD_SIZE,
173 .nb_min = NX_FSW_UMD_SIZE,
174 .nb_max = NX_METADATA_USR_MAX_SZ,
175 },
176 .nxdom_stats_size = {
177 .nb_def = 0,
178 .nb_min = 0,
179 .nb_max = NX_STATS_MAX_SZ,
180 },
181 .nxdom_pipes = {
182 .nb_def = 0,
183 .nb_min = 0,
184 .nb_max = NX_UPIPE_MAXPIPES,
185 },
186 .nxdom_flowadv_max = {
187 .nb_def = 0,
188 .nb_min = 0,
189 .nb_max = NX_FLOWADV_MAX,
190 },
191 .nxdom_nexusadv_size = {
192 .nb_def = 0,
193 .nb_min = 0,
194 .nb_max = NX_NEXUSADV_MAX_SZ,
195 },
196 .nxdom_capabilities = {
197 .nb_def = NXPCAP_USER_CHANNEL,
198 .nb_min = 0,
199 .nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
200 NXPCAP_USER_CHANNEL),
201 },
202 .nxdom_qmap = {
203 .nb_def = NEXUS_QMAP_TYPE_INVALID,
204 .nb_min = NEXUS_QMAP_TYPE_INVALID,
205 .nb_max = NEXUS_QMAP_TYPE_INVALID,
206 },
207 .nxdom_max_frags = {
208 .nb_def = NX_PBUF_FRAGS_DEFAULT,
209 .nb_min = NX_PBUF_FRAGS_MIN,
210 .nb_max = NX_PBUF_FRAGS_MAX,
211 },
212 .nxdom_init = nx_fsw_dom_init,
213 .nxdom_terminate = nx_fsw_dom_terminate,
214 .nxdom_fini = nx_fsw_dom_fini,
215 .nxdom_connect = nx_fsw_dom_connect,
216 .nxdom_find_port = nx_fsw_dom_find_port,
217 .nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
218 .nxdom_bind_port = nx_fsw_dom_bind_port,
219 .nxdom_unbind_port = nx_fsw_dom_unbind_port,
220 .nxdom_disconnect = nx_fsw_dom_disconnect,
221 .nxdom_defunct = nx_fsw_dom_defunct,
222 .nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
223 };
224
225 struct kern_nexus_domain_provider nx_fsw_prov_s = {
226 .nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
227 .nxdom_prov_flags = NXDOMPROVF_DEFAULT,
228 .nxdom_prov_cb = {
229 .dp_cb_init = nx_fsw_prov_init,
230 .dp_cb_fini = nx_fsw_prov_fini,
231 .dp_cb_params = nx_fsw_prov_params,
232 .dp_cb_mem_new = nx_fsw_prov_mem_new,
233 .dp_cb_config = nx_fsw_prov_config,
234 .dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
235 .dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
236 .dp_cb_nx_mem_info = NULL, /* not supported */
237 .dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
238 .dp_cb_nx_stop = NULL,
239 },
240 };
241
242
243 static void
nx_fsw_dom_init(struct nxdom * nxdom)244 nx_fsw_dom_init(struct nxdom *nxdom)
245 {
246 SK_LOCK_ASSERT_HELD();
247 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
248
249 /* Generic initialization */
250 fsw_init();
251 fsw_dp_init();
252
253 (void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
254 }
255
256 static void
nx_fsw_dom_terminate(struct nxdom * nxdom)257 nx_fsw_dom_terminate(struct nxdom *nxdom)
258 {
259 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
260
261 SK_LOCK_ASSERT_HELD();
262
263 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
264 nxdom_prov_link, tnxdp) {
265 (void) nxdom_prov_del(nxdom_prov);
266 }
267
268 fsw_dp_uninit();
269
270 /* Generic uninitialization */
271 fsw_uninit();
272 }
273
/*
 * Domain fini callback; the flowswitch has nothing to release here.
 */
static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
	(void)nxdom;
}
279
280 static int
nx_fsw_prov_init(struct kern_nexus_domain_provider * nxdom_prov)281 nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
282 {
283 #pragma unused(nxdom_prov)
284 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
285 return 0;
286 }
287
288 static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider * nxdom_prov,const struct nxprov_params * nxp,struct nxprov_adjusted_params * adj)289 nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
290 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
291 {
292 #pragma unused(nxdom_prov, nxp)
293 _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
294 _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);
295
296 *(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
297 *(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
298 VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
299 *(adj->adj_flowadv_max) = sk_max_flows;
300 *(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
301 *(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
302 if (sk_cksum_tx != 0) {
303 *(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
304 }
305 *(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
306 ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
307 2 : 1;
308 *(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
309 NX_FSW_AFRINGSIZE;
310
311 if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
312 (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
313 *(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
314 }
315
316 if (*(adj->adj_max_frags) > 1) {
317 uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
318 NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
319 uint32_t magazine_max_objs;
320
321 *(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
322 sk_fsw_max_bufs : fsw_maxbufs;
323
324 /*
325 * Given that packet objects are the ones cached, use the
326 * metadata size to determine the extra amount of objects
327 * at magazine layer.
328 */
329 magazine_max_objs = skmem_cache_magazine_max(
330 NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
331 METADATA_PREAMBLE_SZ);
332
333 /*
334 * Adjust the max buffers to account for the increase
335 * associated with per-CPU caching.
336 */
337 if (skmem_allow_magazines() &&
338 magazine_max_objs < *(adj->adj_max_buffers)) {
339 *(adj->adj_max_buffers) -= magazine_max_objs;
340 }
341 }
342 if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
343 (*(adj->adj_max_frags) <= 1)) {
344 *(adj->adj_large_buf_size) = 0;
345 }
346 return 0;
347 }
348
349 static int
nx_fsw_prov_params(struct kern_nexus_domain_provider * nxdom_prov,const uint32_t req,const struct nxprov_params * nxp0,struct nxprov_params * nxp,struct skmem_region_params srp[SKMEM_REGIONS],uint32_t pp_region_config_flags)350 nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
351 const uint32_t req, const struct nxprov_params *nxp0,
352 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
353 uint32_t pp_region_config_flags)
354 {
355 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
356
357 /* USD regions need to be writable to support user packet pool */
358 srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
359 srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
360
361 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
362 nxdom, nxdom, nxdom, pp_region_config_flags,
363 nx_fsw_prov_params_adjust);
364 }
365
366 static void
fsw_vp_region_params_setup(struct nexus_adapter * na,struct skmem_region_params * __counted_by (SKMEM_REGIONS)srp0,struct skmem_region_params * __counted_by (SKMEM_REGIONS)srp)367 fsw_vp_region_params_setup(struct nexus_adapter *na,
368 struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp0,
369 struct skmem_region_params *__counted_by(SKMEM_REGIONS)srp)
370 {
371 int i;
372 uint32_t totalrings, nslots, afslots, evslots, lbaslots;
373
374 /* copy default flowswitch parameters initialized in nxprov_params_adjust() */
375 for (i = 0; i < SKMEM_REGIONS; i++) {
376 srp[i] = srp0[i];
377 }
378 /* customize parameters that could vary across NAs */
379 totalrings = na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_RX) +
380 na_get_nrings(na, NR_A) + na_get_nrings(na, NR_F) +
381 na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
382
383 srp[SKMEM_REGION_SCHEMA].srp_r_obj_size =
384 (uint32_t)CHANNEL_SCHEMA_SIZE(totalrings);
385 srp[SKMEM_REGION_SCHEMA].srp_r_obj_cnt = totalrings;
386 skmem_region_params_config(&srp[SKMEM_REGION_SCHEMA]);
387
388 srp[SKMEM_REGION_RING].srp_r_obj_size =
389 sizeof(struct __user_channel_ring);
390 srp[SKMEM_REGION_RING].srp_r_obj_cnt = totalrings;
391 skmem_region_params_config(&srp[SKMEM_REGION_RING]);
392
393 nslots = na_get_nslots(na, NR_TX);
394 afslots = na_get_nslots(na, NR_A);
395 evslots = na_get_nslots(na, NR_EV);
396 lbaslots = na_get_nslots(na, NR_LBA);
397 srp[SKMEM_REGION_TXAKSD].srp_r_obj_size =
398 MAX(MAX(MAX(nslots, afslots), evslots), lbaslots) * SLOT_DESC_SZ;
399 srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt =
400 na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_A) +
401 na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
402 skmem_region_params_config(&srp[SKMEM_REGION_TXAKSD]);
403
404 /* USD and KSD objects share the same size and count */
405 srp[SKMEM_REGION_TXAUSD].srp_r_obj_size =
406 srp[SKMEM_REGION_TXAKSD].srp_r_obj_size;
407 srp[SKMEM_REGION_TXAUSD].srp_r_obj_cnt =
408 srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt;
409 skmem_region_params_config(&srp[SKMEM_REGION_TXAUSD]);
410 }
411
412 static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nexus_adapter * na)413 nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
414 struct kern_nexus *nx, struct nexus_adapter *na)
415 {
416 #pragma unused(nxdom_prov)
417 int err = 0;
418 struct skmem_region_params *srp0 = NX_PROV(nx)->nxprov_region_params;
419 struct skmem_region_params srp[SKMEM_REGIONS];
420
421 SK_DF(SK_VERB_FSW,
422 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
423 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
424 SK_KVA(na));
425
426 ASSERT(na->na_type == NA_FLOWSWITCH_VP);
427 ASSERT(na->na_arena == NULL);
428 ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);
429
430 fsw_vp_region_params_setup(na, srp0, srp);
431 /*
432 * Each port in the flow switch is isolated from one another;
433 * use NULL for the packet buffer pool references to indicate
434 * this, since otherwise we'd be sharing the same pp for the
435 * entire switch (maybe for a future, special use case?)
436 *
437 * This means that clients calling kern_nexus_get_pbufpool()
438 * will get NULL, but this is fine based on current design
439 * of providing port isolation, and also since we don't expose
440 * the flow switch to external kernel clients.
441 */
442 na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
443 !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
444 ASSERT(na->na_arena != NULL || err != 0);
445 return err;
446 }
447
448 static int
nx_fsw_prov_config(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct nx_cfg_req * ncr,int sopt_dir,struct proc * p,kauth_cred_t cred)449 nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
450 struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
451 struct proc *p, kauth_cred_t cred)
452 {
453 #pragma unused(nxdom_prov)
454 struct sockopt sopt;
455 int err = 0;
456
457 SK_LOCK_ASSERT_HELD();
458
459 if (ncr->nc_req == USER_ADDR_NULL) {
460 err = EINVAL;
461 goto done;
462 }
463
464 /* to make life easier for handling copies */
465 bzero(&sopt, sizeof(sopt));
466 sopt.sopt_dir = sopt_dir;
467 sopt.sopt_val = ncr->nc_req;
468 sopt.sopt_valsize = ncr->nc_req_len;
469 sopt.sopt_p = p;
470
471 /* avoid _MALLOCing at the cost of this ugly switch block */
472 switch (ncr->nc_cmd) {
473 case NXCFG_CMD_ATTACH:
474 case NXCFG_CMD_DETACH: {
475 /* proceed only if the client possesses flow switch entitlement */
476 if (cred == NULL || (err = skywalk_priv_check_cred(p, cred,
477 PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
478 SK_ERR("missing nxctl credential");
479 err = EPERM;
480 goto done;
481 }
482
483 struct nx_spec_req nsr;
484 bzero(&nsr, sizeof(nsr));
485 err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
486 if (err != 0) {
487 goto done;
488 }
489
490 /*
491 * Null-terminate in case this has an interface name;
492 * the union is already large enough for uuid_t.
493 */
494 nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
495 if (p != kernproc) {
496 nsr.nsr_flags &= NXSPECREQ_MASK;
497 }
498
499 err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
500 if (err != 0) {
501 goto done;
502 }
503
504 err = sooptcopyout(&sopt, &nsr, sizeof(nsr));
505 break;
506 }
507
508 case NXCFG_CMD_FLOW_ADD:
509 case NXCFG_CMD_FLOW_DEL: {
510 /* need to have owner nxctl or kernnxctl */
511 if (cred == NULL) {
512 SK_ERR("missing nxctl credential");
513 err = EPERM;
514 goto done;
515 }
516 } /* fall through */
517 case NXCFG_CMD_FLOW_CONFIG: {
518 /* checks flow PID ownership instead of nxctl creditial */
519 struct nx_flow_req nfr;
520 bzero(&nfr, sizeof(nfr));
521 err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
522 if (err != 0) {
523 goto done;
524 }
525
526 err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
527 if (err != 0) {
528 goto done;
529 }
530
531 err = sooptcopyout(&sopt, &nfr, sizeof(nfr));
532 break;
533 }
534
535 case NXCFG_CMD_NETEM: {
536 struct if_netem_params inp;
537
538 bzero(&inp, sizeof(inp));
539 err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
540 if (err != 0) {
541 goto done;
542 }
543 err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
544 if (err != 0) {
545 goto done;
546 }
547 break;
548 }
549
550 default:
551 err = EINVAL;
552 goto done;
553 }
554
555 done:
556 SK_DF(err ? SK_VERB_ERROR: SK_VERB_FSW,
557 "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
558 NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
559 return err;
560 }
561
562 static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider * nxdom_prov)563 nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
564 {
565 #pragma unused(nxdom_prov)
566 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
567 }
568
569 static int
nx_fsw_prov_nx_ctor(struct kern_nexus * nx)570 nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
571 {
572 struct nx_flowswitch *fsw;
573
574 SK_LOCK_ASSERT_HELD();
575
576 ASSERT(nx->nx_arg == NULL);
577
578 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
579
580 fsw = fsw_alloc(Z_WAITOK);
581 nx->nx_arg = fsw;
582 fsw->fsw_nx = nx;
583 fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
584 fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;
585
586 FSW_WLOCK(fsw);
587
588 fsw_dp_ctor(fsw);
589
590 FSW_WUNLOCK(fsw);
591
592 SK_D("create new fsw 0x%llx for nexus 0x%llx",
593 SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));
594
595 return 0;
596 }
597
598 static void
nx_fsw_prov_nx_dtor(struct kern_nexus * nx)599 nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
600 {
601 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
602 int err;
603
604 SK_LOCK_ASSERT_HELD();
605
606 SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
607 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));
608
609 err = fsw_ctl_detach(nx, current_proc(), NULL);
610 ASSERT(err == 0); /* this cannot fail */
611 ASSERT(fsw->fsw_dev_ch == NULL);
612 ASSERT(fsw->fsw_host_ch == NULL);
613
614 SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
615 fsw_free(fsw);
616 nx->nx_arg = NULL;
617 }
618
619 static size_t
nx_fsw_prov_mib_get(struct kern_nexus * nx,struct nexus_mib_filter * filter,void * __sized_by (len)out,size_t len,struct proc * p)620 nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
621 void *__sized_by(len)out, size_t len, struct proc *p)
622 {
623 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
624 size_t rlen;
625
626 /* this check doesn't require holding fsw_lock */
627 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
628 (uuid_compare(filter->nmf_nx_uuid,
629 fsw->fsw_nx->nx_uuid)) != 0) {
630 return 0;
631 }
632
633 /* intercept NXMIB_FSW_STATS here since it's for flowswitch */
634 FSW_RLOCK(fsw);
635 rlen = fsw_mib_get(fsw, filter, out, len, p);
636 FSW_UNLOCK(fsw);
637
638 return rlen;
639 }
640
641 boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus * nx,nexus_port_t nx_port)642 nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
643 {
644 #pragma unused(nx)
645 return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
646 }
647
648 static int
nx_fsw_dom_find_port(struct kern_nexus * nx,boolean_t rsvd,nexus_port_t * nx_port)649 nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
650 nexus_port_t *nx_port)
651 {
652 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
653 nexus_port_t first, last, port;
654 int error;
655
656 ASSERT(nx_port != NULL);
657
658 port = *nx_port;
659 ASSERT(port == NEXUS_PORT_ANY);
660
661 if (rsvd) {
662 first = 0;
663 last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
664 } else {
665 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
666 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
667 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
668 }
669 ASSERT(first <= last);
670
671 FSW_WLOCK(fsw);
672 if (__improbable(first == last)) {
673 error = ENOMEM;
674 } else {
675 error = nx_port_find(nx, first, last - 1, &port);
676 ASSERT(error != 0 || (port >= first && port < last));
677 }
678 FSW_WUNLOCK(fsw);
679
680 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
681 "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
682 nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
683 (int)port, first, (last - 1), error);
684
685 if (error == 0) {
686 *nx_port = port;
687 }
688
689 return error;
690 }
691
692 static int
nx_fsw_dom_bind_port(struct kern_nexus * nx,nexus_port_t * nx_port,struct nxbind * nxb,void * info)693 nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
694 struct nxbind *nxb, void *info)
695 {
696 #pragma unused(info)
697 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
698 nexus_port_t first, last, port;
699 int error;
700
701 ASSERT(nx_port != NULL);
702 ASSERT(nxb != NULL);
703
704 port = *nx_port;
705
706 /* can't bind reserved ports to client credentials */
707 if (nx_fsw_dom_port_is_reserved(nx, port)) {
708 return EDOM;
709 }
710
711 /*
712 * Allow clients to bind to regular ports (non-reserved);
713 * reserved ports aren't subject to bind/unbind, since
714 * they are used for internal purposes.
715 */
716 first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
717 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
718 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
719 ASSERT(first <= last);
720
721 FSW_WLOCK(fsw);
722 if (__improbable(first == last)) {
723 error = ENOMEM;
724 } else if (port != NEXUS_PORT_ANY) {
725 error = nx_port_bind(nx, port, nxb);
726 } else {
727 error = nx_port_find(nx, first, last - 1, &port);
728 ASSERT(error != 0 || (port >= first && port < last));
729 if (error == 0) {
730 error = nx_port_bind(nx, port, nxb);
731 }
732 }
733 FSW_WUNLOCK(fsw);
734
735 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
736 "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
737 nx->nx_prov->nxprov_params->nxp_name, (int)port,
738 first, (last - 1), error);
739
740 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
741 if (error == 0) {
742 *nx_port = port;
743 }
744
745 return error;
746 }
747
748 static int
nx_fsw_dom_unbind_port(struct kern_nexus * nx,nexus_port_t nx_port)749 nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
750 {
751 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
752 int error;
753
754 FSW_WLOCK(fsw);
755 error = nx_port_unbind(nx, nx_port);
756 FSW_WUNLOCK(fsw);
757
758 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
759 "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
760 nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);
761
762 return error;
763 }
764
765 static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct kern_channel * ch0,struct nxbind * nxb,struct proc * p)766 nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
767 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
768 struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
769 {
770 #pragma unused(nxdom_prov)
771 nexus_port_t port = chr->cr_port;
772 int err = 0;
773
774 SK_LOCK_ASSERT_HELD();
775
776 ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
777 nxdom_prov->nxdom_prov_dom->nxdom_type &&
778 nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
779 ASSERT(!(ch->ch_flags & CHANF_HOST));
780 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
781
782 if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
783 err = EDOM;
784 goto done;
785 }
786
787 chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
788 ASSERT(port != NEXUS_PORT_ANY);
789 (void) snprintf(chr->cr_name, sizeof(chr->cr_name),
790 "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
791 chr->cr_ring_set = RING_SET_DEFAULT;
792 err = na_connect(nx, ch, chr, ch0, nxb, p);
793
794 done:
795 return err;
796 }
797
798 static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch)799 nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
800 struct kern_nexus *nx, struct kern_channel *ch)
801 {
802 #pragma unused(nxdom_prov)
803 SK_LOCK_ASSERT_HELD();
804
805 SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
806 SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
807 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
808
809 if (ch->ch_flags & CHANF_KERNEL) {
810 na_disconnect_spec(nx, ch);
811 } else {
812 na_disconnect(nx, ch);
813 }
814 }
815
816 static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,struct proc * p)817 nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
818 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
819 {
820 #pragma unused(nxdom_prov)
821 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
822
823 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
824 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
825 ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
826
827 /*
828 * Hold the flowswitch lock as writer; this prevents all data path
829 * accesses to the flowswitch, and allows us to mark the rings with
830 * CKRF_DEFUNCT. Unlike some other nexus types, the flowswitch
831 * doesn't utilize kr_{enter,exit} for serialization, at present.
832 */
833 FSW_WLOCK(fsw);
834 na_ch_rings_defunct(ch, p);
835 FSW_WUNLOCK(fsw);
836 }
837
838 static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider * nxdom_prov,struct kern_nexus * nx,struct kern_channel * ch,boolean_t locked)839 nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
840 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
841 {
842 #pragma unused(nxdom_prov)
843 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
844 int err = 0;
845
846 if (!locked) {
847 SK_LOCK_ASSERT_NOTHELD();
848 SK_LOCK();
849 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
850 } else {
851 SK_LOCK_ASSERT_HELD();
852 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
853 }
854
855 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
856 ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
857 ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);
858
859 err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));
860
861 if (err == 0) {
862 na_defunct(nx, ch, ch->ch_na, locked);
863 }
864
865 SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
866 ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
867 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
868 ch->ch_info->cinfo_nx_port,
869 (int)ch->ch_info->cinfo_ch_ring_id, err);
870
871 if (!locked) {
872 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
873 SK_UNLOCK();
874 } else {
875 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
876 SK_LOCK_ASSERT_HELD();
877 }
878 }
879
#if SK_LOG
/*
 * Debug-log the channel request handed to nx_fsw_na_find().
 *
 * chr		channel request being looked up.
 * create	the caller's create flag, logged as-is.
 *
 * The trailing " (skipped)" marker is printed when the request name does
 * not carry the NX_FSW_NAME prefix.  The comparison length excludes the
 * terminating NUL (sizeof - 1) so the marker agrees with the prefix test
 * nx_fsw_na_find() itself performs.
 *
 * Hoisted out of line to reduce kernel stack footprint.
 */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
	    chr->cr_endpoint, create, (strlcmp(chr->cr_name,
	    NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */
897
898 /*
899 * Try to get a reference to a Nexus adapter attached to a flow switch.
900 * If the adapter is found (or is created), this function returns 0, a
901 * non NULL pointer is returned into *na, and the caller holds a
902 * reference to the adapter.
903 * If an adapter is not found, then no reference is grabbed and the
904 * function returns an error code, or 0 if there is just a flow switch prefix
905 * mismatch. Therefore the caller holds a reference when
906 * (*na != NULL && return == 0).
907 */
908 int
nx_fsw_na_find(struct kern_nexus * nx,struct kern_channel * ch,struct chreq * chr,struct nxbind * nxb,struct proc * p,struct nexus_adapter ** na,boolean_t create)909 nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
910 struct chreq *chr, struct nxbind *nxb, struct proc *p,
911 struct nexus_adapter **na, boolean_t create)
912 {
913 struct nexus_vp_adapter *__single vpna = NULL;
914 char *cr_name = chr->cr_name;
915 struct nx_flowswitch *fsw;
916 int error = 0;
917
918 SK_LOCK_ASSERT_HELD();
919 *na = NULL; /* default return value */
920
921 #if SK_LOG
922 if (__improbable(sk_verbose != 0)) {
923 nx_fsw_na_find_log(chr, create);
924 }
925 #endif /* SK_LOG */
926
927 /* first try to see if this is a flow switch port. */
928 if (strlcmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
929 return 0; /* no error, but no flow switch prefix */
930 }
931 ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
932 fsw = NX_FSW_PRIVATE(nx);
933 ASSERT(fsw != NULL);
934
935 if (!create) {
936 return ENXIO;
937 }
938
939 /*
940 * The flowswitch VP is only attachable from a user channel so none of
941 * these flags should be set.
942 */
943 ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
944 error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
945 ASSERT(vpna == NULL || error == 0);
946
947 if (error == 0) {
948 /* use reference held by nx_fsw_attach_vp above */
949 *na = &vpna->vpna_up;
950 SK_DF(SK_VERB_FSW,
951 "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
952 (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
953 cr_name, (int)vpna->vpna_nx_port);
954 }
955
956 return error;
957 }
958
959 int
nx_fsw_netagent_add(struct kern_nexus * nx)960 nx_fsw_netagent_add(struct kern_nexus *nx)
961 {
962 return fsw_netagent_add_remove(nx, TRUE);
963 }
964
965 int
nx_fsw_netagent_remove(struct kern_nexus * nx)966 nx_fsw_netagent_remove(struct kern_nexus *nx)
967 {
968 return fsw_netagent_add_remove(nx, FALSE);
969 }
970
/*
 * Thin wrapper around fsw_netagent_update() for this nexus.
 */
void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
	return;
}
976