1 /*
2 * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
/* This file contains useful utility routines which, unlike those in
 * skywalk_test_common, do not operate on a single set of static objects.
 */
32
33 /*
34 * Copyright (c) 1988, 1992, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
38 */
39
40
41 #include <err.h>
42 #include <assert.h>
43 #include <inttypes.h>
44 #include <stdbool.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <poll.h>
51 #include <sys/event.h>
52 #include <uuid/uuid.h>
53 #include <arpa/inet.h>
54 #include <stddef.h>
55 #include <sysexits.h>
56 #include <sys/types.h>
57 #include <sys/sysctl.h>
58 #include <net/if_utun.h>
59 #include <net/if_ipsec.h>
60 #include <netinet/ip6.h>
61 #include <sys/kern_control.h>
62 #include <sys/ioctl.h>
63 #include <sys/socket.h>
64 #include <sys/kern_control.h>
65 #include <sys/sys_domain.h>
66 #include <ifaddrs.h>
67 #include <sys/fcntl.h>
68 #include <sys/kern_control.h>
69 #include <sys/sys_domain.h>
70 #include <net/if_utun.h>
71 #include <os/log.h>
72
73 #include <net/pfkeyv2.h>
74 #include <netinet6/ipsec.h>
75 #include <darwintest.h>
76
77 #include "skywalk_test_driver.h"
78 #include "skywalk_test_common.h" // XXX remove this
79 #include "skywalk_test_utils.h"
80
81 #define SIN(s) ((struct sockaddr_in *)(void *)s)
82 #define SIN6(s) ((struct sockaddr_in6 *)(void *)s)
83
/*
 * Register a nexus provider (and optionally allocate one instance) on
 * controller `ncd' using the parameters in `sktc_attr'.  Any field left at
 * -1 is not applied, so the kernel default is used for it.  After
 * registration the attributes are read back from the provider and asserted
 * to match every explicitly requested value.
 *
 * On return *providerp holds the registered provider UUID; if `instancep'
 * is non-NULL, *instancep holds a freshly allocated instance UUID.
 */
void
sktc_build_nexus(nexus_controller_t ncd, struct sktc_nexus_attr *sktc_attr,
    uuid_t *providerp, uuid_t *instancep)
{
	nexus_attr_t attr;
	int error;
	uint64_t scratch;

	attr = os_nexus_attr_create();
	assert(attr);

	/* Apply only the attributes the caller explicitly requested (!= -1). */
	if (sktc_attr->anonymous != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS,
		    sktc_attr->anonymous);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->userchannel != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_USER_CHANNEL,
		    sktc_attr->userchannel);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->ntxrings != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS,
		    sktc_attr->ntxrings);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->nrxrings != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS,
		    sktc_attr->nrxrings);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->ntxslots != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS,
		    sktc_attr->ntxslots);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->nrxslots != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS,
		    sktc_attr->nrxslots);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->slotsize != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE,
		    sktc_attr->slotsize);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->metasize != -1) {
		/*
		 * Setting the slot metadata size is expected to be
		 * unsupported; this deliberately pins ENOTSUP.
		 */
		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE,
		    sktc_attr->metasize);
		SKTC_ASSERT_ERR(error == ENOTSUP);
	}
	if (sktc_attr->maxfrags != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
		    sktc_attr->maxfrags);
		SKTC_ASSERT_ERR(!error);
	}
	if (sktc_attr->rejectonclose != -1) {
		error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE,
		    sktc_attr->rejectonclose);
		SKTC_ASSERT_ERR(!error);
	}

	uuid_clear(*providerp);
	error = os_nexus_controller_register_provider(ncd,
	    sktc_attr->name, sktc_attr->type, attr, providerp);
	SKTC_ASSERT_ERR(!error);
	assert(!uuid_is_null(*providerp));

	/* Clear the parameters to make sure they are being read */
	error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE, -1);
	SKTC_ASSERT_ERR(error == ENOTSUP);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS, -1);
	SKTC_ASSERT_ERR(!error);
	error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE, -1);
	SKTC_ASSERT_ERR(!error);

	/* Read the provider's attributes back into `attr'. */
	error = os_nexus_controller_read_provider_attr(ncd,
	    *providerp, attr);
	SKTC_ASSERT_ERR(!error);

	/*
	 * For each attribute: the readback must have overwritten the -1
	 * sentinel, and must equal the requested value when one was given.
	 */
	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_ANONYMOUS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->anonymous == -1 || sktc_attr->anonymous == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_USER_CHANNEL, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->userchannel == -1 ||
	    sktc_attr->userchannel == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_RINGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->ntxrings == -1 || sktc_attr->ntxrings == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_RINGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->nrxrings == -1 || sktc_attr->nrxrings == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_SLOTS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->ntxslots == -1 || sktc_attr->ntxslots == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_SLOTS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->nrxslots == -1 || sktc_attr->nrxslots == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_BUF_SIZE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->slotsize == -1 || sktc_attr->slotsize == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_META_SIZE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->metasize == -1 || sktc_attr->metasize == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_MAX_FRAGS, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->maxfrags == -1 || sktc_attr->maxfrags == scratch);

	scratch = -1;
	error = os_nexus_attr_get(attr, NEXUS_ATTR_REJECT_ON_CLOSE, &scratch);
	SKTC_ASSERT_ERR(!error);
	assert(scratch != -1);
	assert(sktc_attr->rejectonclose == -1 ||
	    sktc_attr->rejectonclose == scratch);

	os_nexus_attr_destroy(attr);

	/* Optionally allocate one provider instance for the caller. */
	if (instancep) {
		uuid_clear(*instancep);
		error = os_nexus_controller_alloc_provider_instance(ncd,
		    *providerp, instancep);
		SKTC_ASSERT_ERR(!error);
		assert(!uuid_is_null(*instancep));
	}
}
250
251 /* up to 4 seconds of retries (250ms delay per retry) */
252 #define SKTU_CHANNEL_CREATE_NOMEM_RETRIES 16
253
254 channel_t
sktu_channel_create_extended(const uuid_t uuid,const nexus_port_t port,const ring_dir_t dir,const ring_id_t rid,const channel_attr_t attr,uint64_t exclusive,uint64_t txlowatunit,uint64_t txlowatval,uint64_t rxlowatunit,uint64_t rxlowatval,uint64_t userpacketpool,uint64_t defunctok,uint64_t event_ring,uint64_t low_latency)255 sktu_channel_create_extended(const uuid_t uuid, const nexus_port_t port,
256 const ring_dir_t dir, const ring_id_t rid, const channel_attr_t attr,
257 uint64_t exclusive, uint64_t txlowatunit, uint64_t txlowatval,
258 uint64_t rxlowatunit, uint64_t rxlowatval, uint64_t userpacketpool,
259 uint64_t defunctok, uint64_t event_ring, uint64_t low_latency)
260 {
261 channel_attr_t tmpattr;
262 int error;
263 uint64_t scratch;
264 static struct timespec delay250ms = { .tv_sec = 0, .tv_nsec = 250000000 };
265 uint32_t retries = 0;
266 channel_t ret = NULL;
267
268 if (!attr) {
269 tmpattr = os_channel_attr_create();
270 } else {
271 tmpattr = attr;
272 }
273
274 if (exclusive != -1) {
275 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EXCLUSIVE, exclusive);
276 SKTC_ASSERT_ERR(!error);
277 }
278
279 if (txlowatunit != -1) {
280 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, txlowatunit);
281 SKTC_ASSERT_ERR(!error);
282 }
283
284 if (txlowatval != -1) {
285 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, txlowatval);
286 SKTC_ASSERT_ERR(!error);
287 }
288
289 if (rxlowatunit != -1) {
290 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, rxlowatunit);
291 SKTC_ASSERT_ERR(!error);
292 }
293
294 if (rxlowatval != -1) {
295 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, rxlowatval);
296 SKTC_ASSERT_ERR(!error);
297 }
298
299 if (userpacketpool != -1) {
300 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, userpacketpool);
301 SKTC_ASSERT_ERR(!error);
302 }
303
304 if (defunctok != -1) {
305 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, defunctok);
306 SKTC_ASSERT_ERR(!error);
307 }
308
309 if (event_ring != -1) {
310 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EVENT_RING, event_ring);
311 SKTC_ASSERT_ERR(!error);
312 }
313
314 if (low_latency != -1) {
315 error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_LOW_LATENCY, low_latency);
316 SKTC_ASSERT_ERR(!error);
317 }
318
319 retry:
320 ret = os_channel_create_extended(uuid, port, dir, rid, tmpattr);
321 if (ret == NULL) {
322 if (errno == ENOMEM && ++retries < SKTU_CHANNEL_CREATE_NOMEM_RETRIES) {
323 nanosleep(&delay250ms, NULL);
324 goto retry;
325 }
326 goto out;
327 }
328
329 scratch = -1;
330 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EXCLUSIVE, &scratch);
331 SKTC_ASSERT_ERR(!error);
332 assert(scratch != 1);
333 assert(exclusive == -1 || exclusive == scratch);
334
335 scratch = -1;
336 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, &scratch);
337 SKTC_ASSERT_ERR(!error);
338 assert(scratch != -1);
339 assert(exclusive == -1 || txlowatunit == scratch);
340
341 scratch = -1;
342 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, &scratch);
343 SKTC_ASSERT_ERR(!error);
344 assert(scratch != -1);
345 assert(exclusive == -1 || txlowatval == scratch);
346
347 scratch = -1;
348 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, &scratch);
349 SKTC_ASSERT_ERR(!error);
350 assert(scratch != -1);
351 assert(exclusive == -1 || rxlowatunit == scratch);
352
353 scratch = -1;
354 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, &scratch);
355 SKTC_ASSERT_ERR(!error);
356 assert(scratch != -1);
357 assert(exclusive == -1 || rxlowatval == scratch);
358
359 scratch = -1;
360 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, &scratch);
361 SKTC_ASSERT_ERR(!error);
362 assert(scratch != -1);
363 assert(exclusive == -1 || userpacketpool == scratch);
364
365 scratch = -1;
366 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, &scratch);
367 SKTC_ASSERT_ERR(!error);
368 assert(scratch != -1);
369 assert(exclusive == -1 || defunctok == scratch);
370
371 scratch = -1;
372 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EVENT_RING, &scratch);
373 SKTC_ASSERT_ERR(!error);
374 assert(scratch != -1);
375 assert(exclusive == -1 || event_ring == scratch);
376
377 scratch = -1;
378 error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_LOW_LATENCY, &scratch);
379 SKTC_ASSERT_ERR(!error);
380 assert(scratch != -1);
381 assert(exclusive == -1 || low_latency == scratch);
382
383 out:
384 if (!attr) {
385 os_channel_attr_destroy(tmpattr);
386 }
387
388 return ret;
389 }
390
391 /****************************************************************/
392
/* Exchange the elements at indices `i' and `j' of `permute'. */
static inline void
swap(int *permute, int i, int j)
{
	int saved = permute[j];

	permute[j] = permute[i];
	permute[i] = saved;
}
400
401
402 /* Plain changes, see Knuth (7.2.1.2) "Algorithm P"
403 * has advantage of only swapping adjacent pairs
404 * This could be cleaned up to be more "C" like, but
405 * this literal translation works without fanfare.
406 */
/*
 * Visit every permutation of the n-element array `permute', calling
 * `func(n, permute)' once per arrangement.  The labels p2..p7 mirror the
 * steps of Knuth's Algorithm P (plain changes); only adjacent elements are
 * ever swapped between visits.
 */
void
permutefuncP(int n, int *permute, void (*func)(int, int *permute))
{
	int j, s, q;
	/* c[] = inversion counts, o[] = sweep direction (+1 / -1) per index */
	int c[n], o[n];
	/* P1 Initialize. */
	for (j = 0; j < n; j++) {
		c[j] = 0;
		o[j] = 1;
	}
p2:
	/* P2 Visit. */
	func(n, permute);
	/* P3 Prepare for change. */
	j = n;
	s = 0;
p4:
	/* P4 Ready to change? */
	q = c[j - 1] + o[j - 1];
	if (q < 0) {
		goto p7;
	}
	if (q == j) {
		goto p6;
	}
	/* P5 Change. */
	{
		//T_LOG("Swapping %d with %d\n", j-c[j-1]+s-1, j-q+s-1);
		swap(permute, j - c[j - 1] + s - 1, j - q + s - 1);
	}
	c[j - 1] = q;
	goto p2;
p6:	/* P6 Increase s */
	if (j == 1) {
		/* all n! permutations have been visited */
		return;
	}
	s++;
p7:	/* P7 Switch Direction */
	o[j - 1] = -o[j - 1];
	j--;
	goto p4;
}
449
450 /* Heap's algorithm */
/*
 * Visit every permutation of the n-element array `permute' using Heap's
 * algorithm (iterative form), calling `func(n, permute)' once per
 * arrangement.  Progress is logged at most once per second, including an
 * estimated time remaining; a summary line is logged on completion.
 */
void
permutefuncH(int n, int *permute, void (*func)(int, int *permute))
{
	time_t start = time(NULL);
	time_t now, then = start;
	int count = 0;
	int total = 1;		/* n! — total permutations to visit */
	int i = 0;
	int c[n];		/* per-level loop counters for Heap's algorithm */
	memset(c, 0, sizeof(c));
	for (int f = 2; f <= n; f++) {
		total *= f;
	}
	/* visit the initial arrangement first */
	count++;
	func(n, permute);
	while (i < n) {
		if (c[i] < i) {
			if (!(i & 1)) { /* Even */
				swap(permute, i, 0);
			} else { /* Odd */
				swap(permute, i, c[i]);
			}
			count++;
			{
				/* throttled progress log: at most one line per second */
				now = time(NULL);
				if (now > then) {
					T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
					    now - start, count, total,
					    (double)count * 100 / total,
					    (long)((double)(now - start) * total / count) - (now - start));
					then = now;
				}
			}
			func(n, permute);
			c[i] += 1;
			i = 0;
		} else {
			c[i] = 0;
			i++;
		}
	}
	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - start, total, (double)total / (now - start));
}
496
497 /* Random permutations, knuth's shuffle */
498
/*
 * Visit `total' pseudo-random permutations of the n-element array
 * `permute', seeding random(3) with `seed' for reproducibility, and call
 * `func(n, permute)' after each shuffle.  Progress is logged at most once
 * per second; a summary line is logged on completion.
 */
void
permutefuncR(int n, int *permute, void (*func)(int, int *permute), int total, unsigned seed)
{
	time_t start = time(NULL);
	time_t now, then = start;
	int count = 0;
	T_LOG("Starting %d random permutations with seed %u\n", total, seed);
	srandom(seed);
	while (count < total) {
		/*
		 * NOTE(review): `random() % i' never selects j == i, so this
		 * is Sattolo's cycle-producing variant rather than a uniform
		 * Fisher-Yates shuffle (which would use `% (i + 1)').
		 * Confirm whether uniform shuffling was intended.
		 */
		for (int i = n - 1; i > 0; i--) {
			int j = random() % i; // XXX modulo bias.
			swap(permute, i, j);
		}
		count++;
		{
			/* throttled progress log: at most one line per second */
			now = time(NULL);
			if (now > then) {
				T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
				    now - start, count, total,
				    (double)count * 100 / total,
				    (long)((double)(now - start) * total / count) - (now - start));
				then = now;
			}
		}
		func(n, permute);
	}
	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - start, total, (double)total / (now - start));
}
529
530
531 /*
532 * rakes each element across all other elements.
533 */
/*
 * Rake each element of `permute' across all other positions: for every
 * index, restore the original ordering, then bubble that element step by
 * step toward the front and then toward the back, invoking
 * `func(n, permute)' after each single-step move (and once for the
 * original arrangement).
 */
void
permutefuncZ(int n, int *permute, void (*func)(int, int *permute))
{
	int original[n];

	memcpy(original, permute, sizeof(original));
	func(n, permute);

	for (int elem = 0; elem < n; elem++) {
		/* drag element `elem' toward index 0, one swap at a time */
		memcpy(permute, original, sizeof(original));
		for (int pos = elem; pos > 0; pos--) {
			swap(permute, pos, pos - 1);
			func(n, permute);
		}

		/* now drag it toward index n-1 */
		memcpy(permute, original, sizeof(original));
		for (int pos = elem; pos < n - 1; pos++) {
			swap(permute, pos, pos + 1);
			/* The first right is the same as the last left, so skip it */
			if (pos != elem) {
				func(n, permute);
			}
		}
	}
}
558
559 /****************************************************************/
560
/*
 * Build a flowswitch nexus for the interface named in
 * handles->netif_ifname, creating and attaching a netif nexus for that
 * interface first if one does not already exist.  The ring/slot parameters
 * are forwarded to sktc_build_nexus(); pass -1 for any of them to take the
 * kernel default.  On failure a message is logged and the function returns
 * early, leaving `handles' partially initialized.
 */
void
sktc_create_flowswitch_no_address(struct sktc_nexus_handles *handles,
    uint64_t ntxslots, uint64_t nrxslots, uint64_t buf_size, uint64_t max_frags,
    uint64_t anonymous)
{
	char buf[256];
	int error;
	struct sktc_nexus_attr attr = SKTC_NEXUS_ATTR_INIT();

	attr.ntxslots = ntxslots;
	attr.nrxslots = nrxslots;
	attr.slotsize = buf_size;
	attr.anonymous = anonymous;
	attr.maxfrags = max_frags;

	/* sanity-check the interface name before touching the controller */
	if (handles->netif_ifname[0] == '\0') {
		T_LOG("%s: no interface name specified\n",
		    __func__);
		return;
	}
	if (strlen(handles->netif_ifname) >= IFNAMSIZ) {
		T_LOG("%s: invalid interface name specified %s\n",
		    __func__, handles->netif_ifname);
		return;
	}
	handles->controller = os_nexus_controller_create();
	if (handles->controller == NULL) {
		SKT_LOG(
			"%s: os_nexus_controller_create failed, %s (%d)\n",
			__func__, strerror(errno), errno);
		return;
	}

	/* register the flowswitch provider/instance: "ms_fsw_<ifname>" */
	snprintf(buf, sizeof(buf), "ms_fsw_%s", handles->netif_ifname);
	strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
	attr.type = NEXUS_TYPE_FLOW_SWITCH;
	sktc_build_nexus(handles->controller, &attr, &handles->fsw_prov_uuid,
	    &handles->fsw_nx_uuid);

	/* if the netif is already present, don't bother creating/attaching */
	if (!sktc_get_netif_nexus(handles->netif_ifname,
	    handles->netif_nx_uuid)) {
		/* register "netif_<ifname>" with default slot counts */
		snprintf(buf, sizeof(buf), "netif_%s", handles->netif_ifname);
		strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
		attr.type = NEXUS_TYPE_NET_IF;
		attr.ntxslots = -1;
		attr.nrxslots = -1;
		sktc_build_nexus(handles->controller, &attr,
		    &handles->netif_prov_uuid, &handles->netif_nx_uuid);
		/* bind the netif nexus to the actual interface */
		error = __os_nexus_ifattach(handles->controller,
		    handles->netif_nx_uuid,
		    handles->netif_ifname, NULL,
		    false,
		    &handles->netif_nx_attach_uuid);
		if (error != 0) {
			SKT_LOG(
				"__os_nexus_ifattach(%s) failed, %s (%d)\n",
				buf, strerror(errno), errno);
			return;
		}
	}
	/* finally, attach the flowswitch on top of the netif nexus */
	error = __os_nexus_ifattach(handles->controller, handles->fsw_nx_uuid,
	    NULL, handles->netif_nx_uuid, false, &handles->fsw_nx_dev_attach_uuid);
	if (error != 0) {
		SKT_LOG("__os_nexus_ifattach() failed, %s (%d)\n",
		    strerror(errno), errno);
		return;
	}
}
630
631
632 void
sktc_nexus_handles_assign_address(struct sktc_nexus_handles * handles)633 sktc_nexus_handles_assign_address(struct sktc_nexus_handles *handles)
634 {
635 int error;
636
637 error = sktc_ifnet_add_addr(handles->netif_ifname,
638 &handles->netif_addr,
639 &handles->netif_mask, NULL);
640 SKTC_ASSERT_ERR(!error);
641 }
642
/*
 * Convenience wrapper: name the feth interface for index `i', give it a
 * pseudo-random IPv4 link-local (169.254/16) address with a /24 mask, and
 * build an anonymous flowswitch over it with default ring/slot parameters.
 */
void
sktc_create_flowswitch(struct sktc_nexus_handles *handles, int i)
{
	uint16_t val;

	/* assign the name */
	snprintf(handles->netif_ifname, sizeof(handles->netif_ifname),
	    FETH_FORMAT, i);

	/* pick/assign a random IPv4LL address */
	val = random() % 0xffff;
	/* avoid subnet broadcast and host address 0 */
	/* (only the low octet is checked; the fixup forces host .2) */
	if (((val & 0xff) == 0) || ((val & 0xff) == 0xff)) {
		val = (val & 0xfff0) | 0x2;
	}
	handles->netif_addr = sktc_make_in_addr(IN_LINKLOCALNETNUM | val);
	handles->netif_mask = sktc_make_in_addr(IN_CLASSC_NET);
	sktc_nexus_handles_assign_address(handles);

	/* create the flowswitch (defaults; anonymous = 1) */
	sktc_create_flowswitch_no_address(handles, -1, -1, -1, -1, 1);
}
665
/*
 * Tear down a flowswitch created by sktc_create_flowswitch(): free the
 * nexus instance, deregister the provider, destroy the controller, then
 * remove the address that was assigned to the interface.  All steps are
 * asserted to succeed.
 */
void
sktc_cleanup_flowswitch(struct sktc_nexus_handles *handles)
{
	int error;

	/* caller must have a fully-built flowswitch */
	assert(handles->controller);
	assert(!uuid_is_null(handles->fsw_prov_uuid));
	assert(!uuid_is_null(handles->fsw_nx_uuid));

	error = os_nexus_controller_free_provider_instance(handles->controller,
	    handles->fsw_nx_uuid);
	SKTC_ASSERT_ERR(!error);

	error = os_nexus_controller_deregister_provider(handles->controller,
	    handles->fsw_prov_uuid);
	SKTC_ASSERT_ERR(!error);

	os_nexus_controller_destroy(handles->controller);

	/* drop the IPv4LL address added by sktc_nexus_handles_assign_address */
	error = sktc_ifnet_del_addr(handles->netif_ifname, &handles->netif_addr);
	SKTC_ASSERT_ERR(!error);
}
688
689 /****************************************************************/
690
/*
 * Bind a TCP/IPv4 flow (INADDR_ANY, port `in_port' — 0 requests an
 * ephemeral port) on nexus port `nx_port' of flowswitch `fsw', identified
 * by UUID `flow'.  When an ephemeral port was requested and the bind
 * succeeded, the assigned port is asserted to fall within the system's
 * net.inet.ip.portrange.{first,last} range (cached on first use).
 *
 * Returns the error from __os_nexus_flow_add() (0 on success).
 */
int
sktc_bind_tcp4_flow(nexus_controller_t ncd, const uuid_t fsw, in_port_t in_port, nexus_port_t nx_port, const uuid_t flow)
{
	struct nx_flow_req nfr;
	int error;

	memset(&nfr, 0, sizeof(nfr));
	nfr.nfr_ip_protocol = IPPROTO_TCP;
	nfr.nfr_nx_port = nx_port;
	nfr.nfr_saddr.sa.sa_len = sizeof(struct sockaddr_in);
	nfr.nfr_saddr.sa.sa_family = AF_INET;
	nfr.nfr_saddr.sin.sin_port = htons(in_port);
	nfr.nfr_saddr.sin.sin_addr.s_addr = htonl(INADDR_ANY);
	uuid_copy(nfr.nfr_flow_uuid, flow);

#if 0
	/* debug: dump the request before the flow add */
	char buf[31];
	uuid_string_t uuidstr;
	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
	T_LOG("before: nx_port %3d Flow %s %s addr %s port %d\n",
	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
#endif

	error = __os_nexus_flow_add(ncd, fsw, &nfr);
#if 0
	if (error) {
		T_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
	}
#endif

#if 0
	/* debug: dump the request as updated by the kernel */
	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
	T_LOG("after: nx_port %3d Flow %s %s addr %s port %d\n",
	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
#endif

	// XXX fails, see the fswbind25 for standalone test for this
	assert(nfr.nfr_nx_port == nx_port);
	SKT_LOG("got ephemeral port %d\n", ntohs(nfr.nfr_saddr.sin.sin_port));

	/* Validate the ephemeral ports */
	if (!error && !in_port) {
		/* cache the sysctl port range across calls */
		static int first, last;
		if (!first && !last) {
			size_t size;

			size = sizeof(first);
			error = sysctlbyname("net.inet.ip.portrange.first", &first, &size, NULL, 0);
			SKTC_ASSERT_ERR(!error);
			assert(size == sizeof(first));

			size = sizeof(last);
			error = sysctlbyname("net.inet.ip.portrange.last", &last, &size, NULL, 0);
			SKTC_ASSERT_ERR(!error);
			assert(size == sizeof(last));

			T_LOG("ephemeral port range first %d last %d\n", first, last);

			/* normalize: the sysctls may be set in either order */
			if (last < first) {
				int tmp = first;
				first = last;
				last = tmp;
			}
			assert(first <= last);
		}
		assert(ntohs(nfr.nfr_saddr.sin.sin_port) >= first);
		assert(ntohs(nfr.nfr_saddr.sin.sin_port) <= last);
	}

	return error;
}
766
767 int
sktc_unbind_flow(nexus_controller_t ncd,const uuid_t fsw,const uuid_t flow)768 sktc_unbind_flow(nexus_controller_t ncd, const uuid_t fsw, const uuid_t flow)
769 {
770 struct nx_flow_req nfr;
771 int error;
772
773 memset(&nfr, 0, sizeof(nfr));
774 uuid_copy(nfr.nfr_flow_uuid, flow);
775
776 error = __os_nexus_flow_del(ncd, fsw, &nfr);
777 if (error) {
778 SKT_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
779 }
780 return error;
781 }
782
783 /****************************************************************/
784
/*
 * Consume (and advance past) a random number of slots — uniform in
 * [0, nslots] — on `ring'.  For TX rings each slot is given a random
 * length bounded by its buflet's data limit.  If `dosync' is set, the
 * channel is synced afterward in direction `mode'; a sync failure is
 * tolerated (logged) only when running under the driver.
 *
 * Returns the number of slots actually chewed.
 */
uint32_t
sktc_chew_random(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint32_t nslots)
{
	uint64_t count = 0;
	int error;
	channel_slot_t slot;

	/* Chew a random number of slots */
	nslots = random() % (nslots + 1);

	slot = NULL;
	while (count < nslots) {
		slot_prop_t prop;

		/* caller guarantees at least nslots slots are available */
		slot = os_channel_get_next_slot(ring, slot, &prop);
		assert(slot);
		if (mode == CHANNEL_SYNC_TX) {
			/* give outbound slots a random payload length */
			packet_t pkt = os_channel_slot_get_packet(ring, slot);
			buflet_t buf = os_packet_get_next_buflet(pkt, NULL);
			assert(buf != NULL);
			uint16_t bdlim = os_buflet_get_data_limit(buf);
			assert(bdlim != 0);
			prop.sp_len = random() % bdlim;
			os_channel_set_slot_properties(ring, slot, &prop);
		}
		count++;
	}

	/* advance the ring head past everything we touched */
	if (slot) {
		error = os_channel_advance_slot(ring, slot);
		SKTC_ASSERT_ERR(!error);
	}

	if (dosync) {
		error = os_channel_sync(channel, mode);
		if (skywalk_in_driver && error) {
			SKT_LOG("%s: sync fail error %d errno %d: %s\n", __func__, error, errno, strerror(errno));
		} else {
			SKTC_ASSERT_ERR(!error);
		}
	}

	return count;
}
829
830 /* This pumps slots on a ring until count slots have been tranferred */
/*
 * Pump `nslots' slots through `ring' in direction `mode', blocking via
 * kqueue (EVFILT_WRITE for TX, EVFILT_READ for RX) whenever no slots are
 * available.  Work is done in random-sized chunks by sktc_chew_random().
 * With `verbose' set, progress is logged at most once per second plus a
 * final summary.
 */
void
sktc_pump_ring_nslots_kq(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	int kq;
	struct kevent kev;
	int error;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	/* register interest in the channel fd once, up front */
	kq = kqueue();
	assert(kq != -1);
	EV_SET(&kev, channelfd,
	    mode == CHANNEL_SYNC_TX ? EVFILT_WRITE : EVFILT_READ,
	    EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(!error);

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		/* throttled progress log: at most one line per second */
		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			/* block until the channel fd signals readiness */
			memset(&kev, 0, sizeof(kev));
			error = kevent(kq, NULL, 0, &kev, 1, NULL);
			SKTC_ASSERT_ERR(error != -1);
			SKTC_ASSERT_ERR(error == 1);

			/* the only registered event must be ours */
			assert(kev.ident == channelfd);
			if (mode == CHANNEL_SYNC_TX) {
				assert(kev.filter == EVFILT_WRITE);
			} else {
				assert(kev.filter == EVFILT_READ);
			}

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}

	error = close(kq);
	SKTC_ASSERT_ERR(!error);
}
903
/*
 * Pump `nslots' slots through `ring' in direction `mode', blocking via
 * select(2) (writefds for TX, readfds for RX) whenever no slots are
 * available.  Work is done in random-sized chunks by sktc_chew_random().
 * With `verbose' set, progress is logged at most once per second plus a
 * final summary.
 */
void
sktc_pump_ring_nslots_select(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	fd_set readfds, writefds, errorfds, zerofds;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	/* zerofds stays empty and is used to assert sets came back empty */
	FD_ZERO(&zerofds);
	FD_ZERO(&readfds);
	FD_ZERO(&writefds);
	FD_ZERO(&errorfds);
	if (mode == CHANNEL_SYNC_TX) {
		FD_SET(channelfd, &writefds);
	} else {
		FD_SET(channelfd, &readfds);
	}

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		/* throttled progress log: at most one line per second */
		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			/*
			 * select() rewrites the sets in place; re-arm errorfds
			 * each time, and verify exactly our fd became ready in
			 * the expected direction with no error/other activity.
			 */
			FD_SET(channelfd, &errorfds);
			error = select(channelfd + 1, &readfds, &writefds, &errorfds, NULL);
			SKTC_ASSERT_ERR(error != -1);
			assert(!memcmp(&zerofds, &errorfds, sizeof(zerofds)));
			if (mode == CHANNEL_SYNC_TX) {
				assert(FD_ISSET(channelfd, &writefds));
				assert(!memcmp(&zerofds, &readfds, sizeof(zerofds)));
			} else {
				assert(FD_ISSET(channelfd, &readfds));
				assert(!memcmp(&zerofds, &writefds, sizeof(zerofds)));
			}
			SKTC_ASSERT_ERR(error == 1);

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}
}
974
/*
 * Pump `nslots' slots through `ring' in direction `mode', blocking via
 * poll(2) (POLLWRNORM for TX, POLLRDNORM for RX) whenever no slots are
 * available.  Work is done in random-sized chunks by sktc_chew_random().
 * With `verbose' set, progress is logged at most once per second plus a
 * final summary.
 */
void
sktc_pump_ring_nslots_poll(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	struct pollfd fds;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	/* one pollfd for the channel, armed in the requested direction */
	fds.fd = channelfd;
	if (mode == CHANNEL_SYNC_TX) {
		fds.events = POLLWRNORM;
	} else {
		fds.events = POLLRDNORM;
	}

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		/* throttled progress log: at most one line per second */
		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			/* block until ready; exactly our event must fire */
			error = poll(&fds, 1, -1);
			SKTC_ASSERT_ERR(error != -1);
			SKTC_ASSERT_ERR(error == 1);
			assert(fds.fd == channelfd);
			if (mode == CHANNEL_SYNC_TX) {
				assert(fds.events == POLLWRNORM);
				assert(fds.revents == POLLWRNORM);
			} else {
				assert(fds.events == POLLRDNORM);
				assert(fds.revents == POLLRDNORM);
			}

			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}
}
1041
1042 /****************************************************************/
1043
/*
 * Raise the per-process open-file soft limit to at least `new`.
 * No-op if the current soft limit already meets the target.
 */
void
sktc_raise_file_limit(int new)
{
	int error;
	struct rlimit rl;

	error = getrlimit(RLIMIT_NOFILE, &rl);
	SKTC_ASSERT_ERR(!error);

	if (rl.rlim_cur < (rlim_t)new) {
		T_LOG("raising file open limit from %llu (max %llu) to %d\n",
		    rl.rlim_cur, rl.rlim_max, new);
		rl.rlim_cur = new;
		/*
		 * BUG FIX: only raise the hard limit when it is below the
		 * target.  The original unconditionally set rlim_max = new,
		 * which LOWERED the hard limit whenever it was already
		 * higher — an irreversible change for unprivileged
		 * processes.
		 */
		if (rl.rlim_max < (rlim_t)new) {
			rl.rlim_max = new;
		}
		error = setrlimit(RLIMIT_NOFILE, &rl);
		SKTC_ASSERT_ERR(!error);
	}
}
1062
1063
1064 /****************************************************************/
1065
/*
 * Create a utun or ipsec tunnel interface via a kernel-control socket.
 *
 * type:  SKTU_IFT_UTUN or SKTU_IFT_IPSEC (asserted).
 * flags: SKTU_IFF_ENABLE_NETIF / SKTU_IFF_NO_ATTACH_FSW /
 *        SKTU_IFF_ENABLE_CHANNEL select the corresponding sockopts.
 *
 * Returns the connected control-socket fd (owns the interface lifetime),
 * or -1 on failure.
 */
int
sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags)
{
	struct ctl_info kernctl_info;
	struct sockaddr_ctl kernctl_addr;
	int error;
	int tunsock;
	const char *CONTROL_NAME;
	int OPT_ENABLE_NETIF, OPT_ATTACH_FSW, OPT_ENABLE_CHANNEL;
	int enable_netif, attach_fsw, enable_channel;
	int scratch;        /* read-back buffer for sockopt verification */

	/* Select per-type control name and option constants. */
	assert(type == SKTU_IFT_UTUN || type == SKTU_IFT_IPSEC);
	if (type == SKTU_IFT_UTUN) {
		CONTROL_NAME = UTUN_CONTROL_NAME;
		OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF;
		OPT_ATTACH_FSW = UTUN_OPT_ATTACH_FLOWSWITCH;
		OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL;
	} else {
		CONTROL_NAME = IPSEC_CONTROL_NAME;
		OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF;
		OPT_ATTACH_FSW = 0;             /* ipsec has no fsw attach opt */
		OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL;
	}

	enable_netif = ((flags & SKTU_IFF_ENABLE_NETIF) != 0) ? 1 : 0;
	attach_fsw = ((flags & SKTU_IFF_NO_ATTACH_FSW) != 0) ? 0 : 1;
	enable_channel = ((flags & SKTU_IFF_ENABLE_CHANNEL) != 0) ? 1 : 0;

	/* XXX Remove this retry nonsense when this is fixed:
	 * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
	 */

	/* Up to 10 attempts, sleeping 1ms between retries on EBUSY. */
	for (int i = 0; i < 10; i++) {
		if (i > 0) {
			T_LOG("%s: sleeping 1ms before retrying\n", __func__);
			usleep(1000);
		}

		tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
		assert(tunsock != -1);

		/* Resolve the control name to a control id. */
		memset(&kernctl_info, 0, sizeof(kernctl_info));
		strlcpy(kernctl_info.ctl_name, CONTROL_NAME, sizeof(kernctl_info.ctl_name));
		error = ioctl(tunsock, CTLIOCGINFO, &kernctl_info);
		SKTC_ASSERT_ERR(error == 0);

		/* sc_unit == 0 asks the kernel to pick any free unit. */
		memset(&kernctl_addr, 0, sizeof(kernctl_addr));
		kernctl_addr.sc_len = sizeof(kernctl_addr);
		kernctl_addr.sc_family = AF_SYSTEM;
		kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
		kernctl_addr.sc_id = kernctl_info.ctl_id;
		kernctl_addr.sc_unit = 0;

		/* If this is being called to reinstantiate a device that was just detached,
		 * then this may return busy while the asynchronous detach completes.
		 * This only occurs when this is being called in a tight loop
		 * as per the utun27646755 test below
		 */

		error = bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
		if (error == -1 && errno == EBUSY) {
			close(tunsock);
			tunsock = -1;
			T_LOG("%s: i = %d bind returned EBUSY\n", __func__, i);
			continue;
		}

		/* can only be set before connecting */
		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &enable_netif, sizeof(enable_netif));
		SKTC_ASSERT_ERR(!error);
		/* Read back each option to verify the kernel accepted it. */
		socklen_t scratchlen = sizeof(scratch);
		error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &scratch, &scratchlen);
		SKTC_ASSERT_ERR(!error);
		assert(scratchlen == sizeof(scratch));
		assert(enable_netif == scratch);

		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &enable_channel, sizeof(enable_channel));
		SKTC_ASSERT_ERR(!error);
		scratchlen = sizeof(scratch);
		error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &scratch, &scratchlen);
		SKTC_ASSERT_ERR(!error);
		assert(scratchlen == sizeof(scratch));
		assert(enable_channel == scratch);

		/* only applicable for utun */
		if (type == SKTU_IFT_UTUN) {
			error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ATTACH_FSW, &attach_fsw, sizeof(attach_fsw));
			SKTC_ASSERT_ERR(!error);
		}

		/* connect() instantiates the interface; same EBUSY race as bind(). */
		error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
		if (error == -1 && errno == EBUSY) {
			T_LOG("%s: i = %d connect returned EBUSY\n", __func__, i);
			close(tunsock);
			tunsock = -1;
			continue;
		}
		SKTC_ASSERT_ERR(!error);

		/* Best effort only; a failure here is just warned about. */
		error = fcntl(tunsock, F_SETFD, FD_CLOEXEC);
		if (error != 0) {
			warn("FD_CLOEXEC");
		}

		break;
	}

	/*
	 * NOTE(review): after exhausting retries, `error` holds the last
	 * bind()/connect() failure (-1) and tunsock may already be -1, in
	 * which case close(-1) below is a harmless no-op — TODO confirm
	 * intended.
	 */
	if (error == -1) {
		warn("Failed to create utun errno %d", errno);
		close(tunsock);
		tunsock = -1;
	}

	return tunsock;
}
1182
/*
 * Open a kernel-pipe channel on an existing utun/ipsec control socket.
 * Queries the channel UUID through the per-type sockopt and opens the
 * client side of the kernel pipe on it. Asserts on any failure.
 */
channel_t
sktu_create_interface_channel(sktu_if_type_t type, int tunsock)
{
	uuid_t ch_uuid;
	socklen_t ch_uuid_len = sizeof(ch_uuid);
	channel_t ch;
	int err;
	int uuid_opt;

	/* The UUID-report sockopt differs per interface type. */
	if (type == SKTU_IFT_UTUN) {
		uuid_opt = UTUN_OPT_GET_CHANNEL_UUID;
	} else {
		assert(type == SKTU_IFT_IPSEC);
		uuid_opt = IPSEC_OPT_GET_CHANNEL_UUID;
	}

	err = getsockopt(tunsock, SYSPROTO_CONTROL, uuid_opt, ch_uuid,
	    &ch_uuid_len);
	SKTC_ASSERT_ERR(err == 0);
	assert(ch_uuid_len == sizeof(ch_uuid));

	/* Default attributes; user packet pool enabled (the lone `1`). */
	ch = sktu_channel_create_extended(ch_uuid,
	    NEXUS_PORT_KERNEL_PIPE_CLIENT,
	    CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL,
	    -1, -1, -1, -1, -1, -1, 1, -1, -1);
	assert(ch);

	return ch;
}
1214
1215 void
sktu_get_interface_name(sktu_if_type_t type,int s,char name[IFNAMSIZ])1216 sktu_get_interface_name(sktu_if_type_t type, int s, char name[IFNAMSIZ])
1217 {
1218 int error;
1219 socklen_t optlen = IFNAMSIZ;
1220 if (type == SKTU_IFT_UTUN) {
1221 error = getsockopt(s, SYSPROTO_CONTROL, UTUN_OPT_IFNAME, name, &optlen);
1222 } else {
1223 error = getsockopt(s, SYSPROTO_CONTROL, IPSEC_OPT_IFNAME, name, &optlen);
1224 }
1225 SKTC_ASSERT_ERR(!error);
1226 }
1227
/*
 * Hex+ASCII dump of `len` bytes at `buf` to stream `f`, 16 bytes per line:
 * offset column, hex bytes, then the printable-ASCII rendering ('.' for
 * non-printable bytes). `desc`, when non-NULL, is printed as a caption.
 */
void
sktu_dump_buffer(FILE *f, const char *desc, const void *buf, size_t len)
{
	size_t i;                       /* BUG FIX: was int — sign/width
	                                 * mismatch against size_t len */
	unsigned char ascii[17];        /* one 16-char line + NUL */
	const unsigned char *pc = (const unsigned char *)buf;

	if (desc != NULL) {
		fprintf(f, "%s:\n", desc);
	}

	if (len == 0) {
		fprintf(f, " ZERO LENGTH\n");
		return;
	}

	for (i = 0; i < len; i++) {
		if ((i % 16) == 0) {
			/* flush the previous line's ASCII column */
			if (i != 0) {
				fprintf(f, " %s\n", (char *)ascii);
			}

			fprintf(f, " %04zx ", i); // offset
		}

		fprintf(f, " %02x", pc[i]);

		// prepare ascii
		if ((pc[i] < 0x20) || (pc[i] > 0x7e)) {
			ascii[i % 16] = '.';
		} else {
			ascii[i % 16] = pc[i];
		}
		ascii[(i % 16) + 1] = '\0';
	}

	// pad last line to align the ascii column
	while ((i % 16) != 0) {
		fprintf(f, " ");
		i++;
	}

	fprintf(f, " %s\n", (char *)ascii);
}
1272
/*
 * Fetch a variable-length sysctl value into a freshly malloc'd buffer.
 *
 * oid_name: sysctl name passed to sysctlbyname(3).
 * buffer:   out; set to the malloc'd result (caller frees), or NULL.
 * len:      out; size of the returned data.
 * newp/newlen: optional new value to set (passed through).
 *
 * Returns 0 on success (including a legitimately empty value) or an
 * errno value on failure. Retries up to RETRY_COUNT times when the
 * value grows between the size probe and the fetch (ENOMEM).
 */
int
sysctl_buf(char *oid_name, void **buffer, size_t *len, void *newp,
    size_t newlen)
{
	int ret, err;
	int try = 0;

	*buffer = NULL;
#define RETRY_COUNT 10
try_again:
	/* First pass: probe the required buffer size. */
	ret = sysctlbyname(oid_name, NULL, len, newp, newlen);
	if (ret != 0) {
		/*
		 * BUG FIX: sysctlbyname() returns -1 and sets errno on
		 * failure; the original compared the return value (always
		 * -1 here) against ENOMEM, so this retry never fired.
		 */
		if (errno == ENOMEM) {
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		err = errno;
		SKT_LOG("sysctl for len failed, %s\n", strerror(err));
		return err;
	}
	if (*len == 0) {
		T_LOG("sysctl for len returned zero! No stats?\n");
		*buffer = NULL;
		return 0;
	}
	*buffer = malloc(*len);
	if (*buffer == NULL) {
		T_LOG("sysctl malloc for %ld bytes failed\n", *len);
		return ENOMEM;
	}

	/* Second pass: fetch; the value may have grown since the probe. */
	ret = sysctlbyname(oid_name, *buffer, len, newp, newlen);
	if (ret != 0) {
		err = errno;
		if (err == ENOMEM) {    /* BUG FIX: was `ret == ENOMEM` */
			free(*buffer);
			*buffer = NULL;
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		SKT_LOG("sysctl for buf failed, %s\n", strerror(err));
		free(*buffer);
		/* BUG FIX: clear the dangling pointer so a caller that
		 * frees on error cannot double-free. */
		*buffer = NULL;
		return err;
	}

	return 0;
}
1324
/*
 * Atomically swap the skywalk error-injection random mask.
 * Passing NULL leaves the mask unchanged and just reads it.
 * Returns the previous mask value; asserts on sysctl failure.
 */
uint32_t
sktu_set_inject_error_rmask(uint32_t *mask)
{
	uint32_t previous;
	size_t previous_len = sizeof(previous);
	int err;

	err = sysctlbyname("kern.skywalk.inject_error_rmask",
	    &previous, &previous_len, mask,
	    (mask != NULL) ? sizeof(*mask) : 0);

	SKTC_ASSERT_ERR(!err);
	return previous;
}
1338
1339 /* returns TRUE if a matching IPv4 address is found */
1340 boolean_t
sktu_check_interface_ipv4_address(char * ifname,uint32_t ipaddr)1341 sktu_check_interface_ipv4_address(char *ifname, uint32_t ipaddr)
1342 {
1343 struct ifaddrs *ifaddr, *ifa;
1344 boolean_t match = FALSE;
1345 int error;
1346
1347 error = getifaddrs(&ifaddr);
1348 SKTC_ASSERT_ERR(!error);
1349
1350 for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
1351 struct sockaddr_in *sin =
1352 (struct sockaddr_in *)(void *)ifa->ifa_addr;
1353 if (ifa->ifa_addr == NULL) {
1354 continue;
1355 }
1356 if ((strncmp(ifa->ifa_name, ifname, IFNAMSIZ) == 0) &&
1357 (ifa->ifa_addr->sa_family == AF_INET) &&
1358 (sin->sin_addr.s_addr == ipaddr)) {
1359 match = TRUE;
1360 }
1361 }
1362 freeifaddrs(ifaddr);
1363 return match;
1364 }
1365
1366 /****************************************************************/
1367
/*
 * Open a PF_KEY v2 socket for installing/managing SAs in the tests.
 * Asserts on failure; caller owns (and closes) the returned fd.
 */
int
sktu_create_pfkeysock(void)
{
	int fd = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);

	assert(fd != -1);
	return fd;
}
1375
/*
 * Install a transport-mode ESP security association (NULL encryption,
 * no auth) for src->dst over PF_KEY socket `keysock`, bound to the
 * ipsec interface `ifname`.  Builds a single SADB_ADD message with the
 * extensions laid out exactly in the struct order below and asserts
 * the whole message is written in one send(2).
 */
void
sktu_create_sa(int keysock, const char ifname[IFXNAMSIZ], uint32_t spi, struct in_addr *src, struct in_addr *dst)
{
	/*
	 * <base, SA, (lifetime(HS),) address(SD), (address(P),)
	 * key(AE), (identity(SD),) (sensitivity)>
	 */

	/* PF_KEY extensions are parsed as 64-bit units, hence the
	 * 8-byte alignment on every member of this wire-format struct. */
	struct {
		struct sadb_msg msg __attribute((aligned(sizeof(uint64_t))));
		struct sadb_key key __attribute((aligned(sizeof(uint64_t))));
		struct sadb_sa sa __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_sa2 sa2 __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_ipsecif ipsecif __attribute((aligned(sizeof(uint64_t))));
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} src;
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} dst;
	} addcmd;

	memset(&addcmd, 0, sizeof(addcmd));

	/* Base header: lengths are expressed in 64-bit units. */
	addcmd.msg.sadb_msg_version = PF_KEY_V2;
	addcmd.msg.sadb_msg_type = SADB_ADD;
	addcmd.msg.sadb_msg_errno = 0;
	addcmd.msg.sadb_msg_satype = SADB_SATYPE_ESP;
	addcmd.msg.sadb_msg_len = PFKEY_UNIT64(sizeof(addcmd));
	addcmd.msg.sadb_msg_reserved = 0;
	addcmd.msg.sadb_msg_seq = 0;
	addcmd.msg.sadb_msg_pid = (unsigned)getpid();

	/* Encryption key extension: zero bits => NULL-cipher key. */
	addcmd.key.sadb_key_len = PFKEY_UNIT64(sizeof(addcmd.key));
	addcmd.key.sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
	addcmd.key.sadb_key_bits = 0;
	addcmd.key.sadb_key_reserved = 0;

	/* SA extension: SPI on the wire is big-endian; no auth, NULL ESP. */
	addcmd.sa.sadb_sa_len = PFKEY_UNIT64(sizeof(addcmd.sa));
	addcmd.sa.sadb_sa_exttype = SADB_EXT_SA;
	addcmd.sa.sadb_sa_spi = htonl(spi);
	addcmd.sa.sadb_sa_replay = 0;
	addcmd.sa.sadb_sa_state = 0;
	addcmd.sa.sadb_sa_auth = SADB_AALG_NONE;
	addcmd.sa.sadb_sa_encrypt = SADB_EALG_NULL;
	addcmd.sa.sadb_sa_flags = 0;

	/* SA2 extension: transport mode, SA torn down on interface detach. */
	addcmd.sa2.sadb_x_sa2_len = PFKEY_UNIT64(sizeof(addcmd.sa2));
	addcmd.sa2.sadb_x_sa2_exttype = SADB_X_EXT_SA2;
	addcmd.sa2.sadb_x_sa2_mode = IPSEC_MODE_TRANSPORT;
	addcmd.sa2.sadb_x_sa2_alwaysexpire = 1;
	addcmd.sa2.sadb_x_sa2_flags = SADB_X_EXT_SA2_DELETE_ON_DETACH;
	addcmd.sa2.sadb_x_sa2_sequence = 0;
	addcmd.sa2.sadb_x_sa2_reqid = 0;

	/* ipsecif extension: pin the SA to the named ipsec interface. */
	addcmd.ipsecif.sadb_x_ipsecif_len = PFKEY_UNIT64(sizeof(addcmd.ipsecif));
	addcmd.ipsecif.sadb_x_ipsecif_exttype = SADB_X_EXT_IPSECIF;
	memset(addcmd.ipsecif.sadb_x_ipsecif_internal_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_internal_if));
	memset(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if));
	strlcpy(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if, ifname, sizeof(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if));
	addcmd.ipsecif.sadb_x_ipsecif_init_disabled = 0;
	addcmd.ipsecif.reserved = 0;

	/* Source address extension (host prefix = address width in bits). */
	addcmd.src.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.src));
	addcmd.src.addr.sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
	addcmd.src.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.src.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.src.addr.sadb_address_reserved = 0;
	addcmd.src.saddr.sin_len = sizeof(addcmd.src.saddr);
	addcmd.src.saddr.sin_family = AF_INET;
	addcmd.src.saddr.sin_port = htons(0);
	addcmd.src.saddr.sin_addr = *src;

	/* Destination address extension, mirroring the source. */
	addcmd.dst.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.dst));
	addcmd.dst.addr.sadb_address_exttype = SADB_EXT_ADDRESS_DST;
	addcmd.dst.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.dst.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.dst.addr.sadb_address_reserved = 0;
	addcmd.dst.saddr.sin_len = sizeof(addcmd.dst.saddr);
	addcmd.dst.saddr.sin_family = AF_INET;
	addcmd.dst.saddr.sin_port = htons(0);
	addcmd.dst.saddr.sin_addr = *dst;

	//log_hexdump(&addcmd, sizeof(addcmd));

	/* PF_KEY messages are datagrams: must go out in a single send. */
	ssize_t slen;
	slen = send(keysock, &addcmd, sizeof(addcmd), 0);
	assert(slen == sizeof(addcmd));
}
1467
/* View a 16-bit quantity as two bytes; used by in_cksum() to fold the
 * odd trailing byte of an odd-length buffer into the running sum. */
typedef union {
	char c[2];
	u_short s;
} short_union_t;

/* View a (sign-extended) checksum accumulator as two 16-bit halves;
 * used by reduce(). NOTE(review): assumes little-endian layout of the
 * low 32 bits of `l` — TODO confirm if ever built for big-endian. */
typedef union {
	u_short s[2];
	long l;
} long_union_t;
1477
1478 static __inline__ void
reduce(int * sum)1479 reduce(int * sum)
1480 {
1481 long_union_t l_util;
1482
1483 l_util.l = *sum;
1484 *sum = l_util.s[0] + l_util.s[1];
1485 if (*sum > 65535) {
1486 *sum -= 65535;
1487 }
1488 return;
1489 }
1490
/*
 * Compute the ones'-complement Internet checksum (RFC 1071) over `len`
 * bytes at `pkt`, seeded with `sum0`, returning the complemented result.
 *
 * Rewritten from the unrolled Berkeley loop: byte-wise memcpy word loads
 * avoid the original's potentially misaligned u_short dereferences, and
 * the accumulator is unsigned (the original's signed int could overflow,
 * which is UB). Results are identical on little-endian targets; the odd
 * trailing byte is treated as the low-order byte of a 16-bit word, which
 * matches the original short_union_t behavior on little-endian only.
 */
unsigned short
in_cksum(void * pkt, int len, int sum0)
{
	const unsigned char *p = (const unsigned char *)pkt;
	uint32_t sum = (uint32_t)sum0;
	int remaining = len;

	/* Sum all complete 16-bit words in host byte order. */
	while (remaining > 1) {
		uint16_t w;
		memcpy(&w, p, sizeof(w));   /* alignment-safe word load */
		sum += w;
		p += 2;
		remaining -= 2;
	}

	/* Odd-length buffer: fold in the final byte (low-order byte). */
	if (remaining == 1) {
		sum += *p;
	}

	/* Fold carries: a 32-bit sum reduces fully in two folds. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned short)(~sum & 0xffff);
}
1533
/*
 * Fold all carry bits back into the low 16 bits ("end-around carry"),
 * looping until the value fits in 16 bits — the core reduction step of
 * ones'-complement checksum arithmetic.
 */
#define ADDCARRY(_x) do { \
	while (((_x) >> 16) != 0) \
	        (_x) = ((_x) >> 16) + ((_x) & 0xffff); \
} while (0)
1538
1539 /*
1540 * Checksum routine for Internet Protocol family headers (Portable Version).
1541 *
1542 * This routine is very heavily used in the network
1543 * code and should be modified for each CPU to be as fast as possible.
1544 */
/*
 * REDUCE16: fold the 64-bit accumulator `sum` down to 16 bits via the
 * caller-declared scratch unions `q_util` (four 16-bit chunks of the
 * 64-bit value) and `l_util` (two 16-bit halves of the 32-bit partial),
 * finishing with ADDCARRY.
 */
#define REDUCE16 { \
	q_util.q = sum; \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1]; \
	ADDCARRY(sum); \
}

/* 32-bit value viewed as two 16-bit halves (scratch for REDUCE16). */
union l_util {
	uint16_t s[2];
	uint32_t l;
};

/* 64-bit value viewed as 16-bit chunks / 32-bit halves (scratch for REDUCE16). */
union q_util {
	uint16_t s[4];
	uint32_t l[2];
	uint64_t q;
};
1562
/*
 * Fold three 32-bit pseudo-header quantities (typically source address,
 * destination address, and htonl(proto + length)) into a 16-bit
 * ones'-complement partial sum.
 *
 * Rewritten with explicit shifts instead of the REDUCE16 union punning:
 * self-contained, endian-independent, and produces identical results
 * (the 64-bit sum of three 32-bit values is < 2^34, so one chunk fold
 * plus one half fold plus end-around carry fully reduces it).
 */
uint16_t
in_pseudo(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t sum;

	/* Cannot overflow: 3 * 0xffffffff < 2^34. */
	sum = (uint64_t)a + b + c;

	/* Fold the four 16-bit chunks, then the two 16-bit halves. */
	sum = (sum & 0xffff) + ((sum >> 16) & 0xffff) +
	    ((sum >> 32) & 0xffff) + (sum >> 48);
	sum = (sum & 0xffff) + (sum >> 16);

	/* End-around carry until the value fits in 16 bits. */
	while ((sum >> 16) != 0) {
		sum = (sum >> 16) + (sum & 0xffff);
	}

	return (uint16_t)sum;
}
1574
/*
 * Compute the IPv6 pseudo-header partial checksum: the 16-bit
 * ones'-complement sum of the source address, destination address, and
 * the caller-supplied value `x`, which could be one of:
 *
 *   htonl(proto + length), or
 *   htonl(proto + length + sum)
 *
 * Rewritten with loops and an inline end-around-carry fold so the helper
 * no longer depends on the file-local ADDCARRY macro; the summation
 * order change cannot alter the (mod 2^32) accumulator value.
 */
uint16_t
in6_pseudo(const struct in6_addr *src, const struct in6_addr *dst, uint32_t x)
{
	uint32_t sum = 0;
	const uint16_t *w;
	int i;

	/* IPv6 source address: eight 16-bit words in memory order. */
	w = (const uint16_t *)src;
	for (i = 0; i < 8; i++) {
		sum += w[i];
	}

	/* IPv6 destination address. */
	w = (const uint16_t *)dst;
	for (i = 0; i < 8; i++) {
		sum += w[i];
	}

	/* Caller-supplied value (see header comment). */
	sum += x;

	/* Fold in carry bits (end-around carry). */
	while ((sum >> 16) != 0) {
		sum = (sum >> 16) + (sum & 0xffff);
	}

	return (uint16_t)sum;
}
1610
/*
 * Hand out monotonically increasing IPv4 identification values,
 * starting at 0 and wrapping naturally at 16 bits.
 * Not thread-safe (test utility; single-threaded callers only).
 */
uint16_t
sktu_ip_id()
{
	static int next_id;

	return (uint16_t)next_id++;
}
1617
1618 void
sktu_channel_port_init(channel_port_t ch_port,uuid_t instance,nexus_port_t nx_port,bool enable_upp,bool enable_event_ring,bool low_latency)1619 sktu_channel_port_init(channel_port_t ch_port, uuid_t instance,
1620 nexus_port_t nx_port, bool enable_upp, bool enable_event_ring,
1621 bool low_latency)
1622 {
1623 channel_t chan;
1624 nexus_port_t port = nx_port;
1625 ring_id_t ringid;
1626
1627 bzero(ch_port, sizeof(*ch_port));
1628 chan = sktu_channel_create_extended(instance, port,
1629 CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL,
1630 -1, -1, -1, -1, -1, enable_upp ? 1 : -1, 1,
1631 enable_event_ring ? 1 : -1, low_latency ? 1 : -1);
1632 if (chan == NULL) {
1633 SKT_LOG("Can't open channel on port %d, %s\n", port,
1634 strerror(errno));
1635 return;
1636 }
1637
1638 T_LOG("Opened port %d\n", port);
1639
1640 ch_port->chan = chan;
1641 ch_port->fd = os_channel_get_fd(chan);
1642 ch_port->port = port;
1643 ch_port->user_packet_pool = enable_upp;
1644
1645 /* tx ring */
1646 ringid = os_channel_ring_id(chan, CHANNEL_FIRST_TX_RING);
1647 ch_port->tx_ring = os_channel_tx_ring(ch_port->chan, ringid);
1648 assert(ch_port->tx_ring != NULL);
1649 /* rx ring */
1650 ringid = os_channel_ring_id(chan, CHANNEL_FIRST_RX_RING);
1651 ch_port->rx_ring = os_channel_rx_ring(ch_port->chan, ringid);
1652 assert(ch_port->rx_ring != NULL);
1653 }
1654
/*
 * Finalize a 32-bit ones'-complement accumulator: fold carries down to
 * 16 bits (three passes cover every possible carry out of a 32-bit sum)
 * and return the complement, ready to be stored in a checksum field.
 */
static inline uint16_t
sktu_fold_sum_final(uint32_t sum)
{
	for (int pass = 0; pass < 3; pass++) {
		sum = (sum >> 16) + (sum & 0xffff);
	}
	return (uint16_t)(~sum & 0xffff);
}
1663
/*
 * Allocate a packet from the port's user packet pool and copy `frame`
 * into it, spanning as many buflets as needed. Also propagates the
 * frame's checksum offload flags and flow UUID, then finalizes the
 * packet. Requires the port to have the user packet pool enabled.
 *
 * Returns the finalized packet; asserts on any failure.
 */
packet_t
sktu_channel_port_frame_to_pkt(channel_port_t port, struct sktu_frame *frame)
{
	int error;
	packet_t pkt;
	void *baddr, *bytes = &frame->bytes[0];
	size_t len = frame->len;           /* bytes still to copy */
	buflet_t buf, pbuf = NULL;         /* current / previous buflet */
	uint16_t clen, bdlim, blen, bcnt;  /* copy len, buflet limit,
	                                    * space left, buflet count */

	assert(port->user_packet_pool);

	error = os_channel_packet_alloc(port->chan, &pkt);
	SKTC_ASSERT_ERR(error == 0);
	assert(pkt != 0);

	/* Prepare the first buflet: full capacity from offset 0. */
	buf = os_packet_get_next_buflet(pkt, NULL);
	assert(buf != NULL);
	error = os_buflet_set_data_offset(buf, 0);
	SKTC_ASSERT_ERR(error == 0);
	bdlim = blen = os_buflet_get_data_limit(buf);
	assert(bdlim != 0);
	/* The packet's buflets must collectively hold the whole frame.
	 * NOTE(review): assumes every buflet has the same data limit as
	 * the first — TODO confirm. */
	bcnt = os_packet_get_buflet_count(pkt);
	assert(blen * bcnt >= len);
	baddr = os_buflet_get_object_address(buf);
	assert(baddr != NULL);

	error = os_packet_set_link_header_length(pkt, 0);
	SKTC_ASSERT_ERR(error == 0);

	/* copy the frame bytes */
	while (len != 0) {
		/* Current buflet full: mark it complete and advance. */
		if (blen == 0) {
			error = os_buflet_set_data_length(buf, bdlim);
			SKTC_ASSERT_ERR(error == 0);
			pbuf = buf;
			buf = os_packet_get_next_buflet(pkt, pbuf);
			assert(buf != NULL);
			error = os_buflet_set_data_offset(buf, 0);
			SKTC_ASSERT_ERR(error == 0);
			baddr = os_buflet_get_object_address(buf);
			assert(baddr != NULL);
			bdlim = blen = os_buflet_get_data_limit(buf);
		}
		clen = MIN(blen, len);
		memcpy(baddr, bytes, clen);
		len -= clen;
		blen -= clen;
		bytes += clen;
		baddr += clen;
		assert(len == 0 || blen == 0);
	}
	/* Propagate partial checksum offload metadata, if requested. */
	if (frame->csum_flags != 0) {
		os_packet_set_inet_checksum(pkt, frame->csum_flags,
		    frame->csum_start, frame->csum_stuff);
	}
	/* Last buflet's data length: the whole frame if it fit in one
	 * buflet, otherwise the bytes placed in the final buflet. */
	if (pbuf == NULL) {
		error = os_buflet_set_data_length(buf, frame->len);
	} else {
		error = os_buflet_set_data_length(buf, clen);
	}
	SKTC_ASSERT_ERR(error == 0);

	os_packet_set_flow_uuid(pkt, frame->flow_uuid);
	error = os_packet_finalize(pkt);
	SKTC_ASSERT_ERR(error == 0);
	return pkt;
}
1732
1733 int
sktu_channel_port_tx(channel_port_t port,packet_t pkt)1734 sktu_channel_port_tx(channel_port_t port, packet_t pkt)
1735 {
1736 int error;
1737 slot_prop_t prop;
1738 channel_slot_t slot;
1739
1740 slot = os_channel_get_next_slot(port->tx_ring, NULL, &prop);
1741 if (slot == NULL) {
1742 return ENOENT;
1743 }
1744 error = os_channel_slot_attach_packet(port->tx_ring, slot, pkt);
1745 SKTC_ASSERT_ERR(error == 0);
1746 error = os_channel_advance_slot(port->tx_ring, slot);
1747 SKTC_ASSERT_ERR(error == 0);
1748 return 0;
1749 }
1750
1751 /*
1752 * Burst Tx tries to tx as many it can in one shot.
1753 *
1754 * Returns number of actually completed Tx.
1755 */
/*
 * Burst Tx tries to tx as many pre-built packets as it can in one shot:
 * waits (up to 10s) for the channel fd to become writable, attaches
 * packets to TX slots until the ring fills or all `n` are queued, then
 * issues one TX sync.
 *
 * Returns number of actually completed Tx.
 */
uint32_t
sktu_channel_port_tx_burst_pkt(channel_port_t port, packet_t *pkts,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};
	struct kevent evlist, kev;
	int kq;
	int error;
	uint32_t i = 0;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for Tx to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			goto done;
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		goto done;
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			goto done;
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_WRITE) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	/* queue as many packets as the ring will accept */
	for (i = 0; i < n; i++) {
		error = sktu_channel_port_tx(port, pkts[i]);
		if (error != 0) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
		SKTC_ASSERT_ERR(error == 0);
	}

done:
	/*
	 * BUG FIX: the original never closed kq on any path, leaking one
	 * fd per call (the RX counterpart already closed it).
	 */
	close(kq);
	return i;
}
1814
1815 /*
1816 * Burst Tx tries to tx as many it can in one shot.
1817 *
1818 * Returns number of actually completed Tx.
1819 */
/*
 * Burst Tx tries to tx as many frames as it can in one shot: waits
 * (up to 10s) for the channel fd to become writable, converts each
 * frame to a packet and queues it until the ring fills or all `n` are
 * sent, then issues one TX sync.
 *
 * Returns number of actually completed Tx.
 */
uint32_t
sktu_channel_port_tx_burst(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};
	struct kevent evlist, kev;
	int kq;
	int error;
	uint32_t i = 0;
	packet_t pkt;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for Tx to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			goto done;
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		goto done;
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			goto done;
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_WRITE) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	/* convert and queue frames until the ring refuses one */
	for (i = 0; i < n; i++) {
		pkt = sktu_channel_port_frame_to_pkt(port, frames[i]);
		error = sktu_channel_port_tx(port, pkt);
		if (error != 0) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
		SKTC_ASSERT_ERR(error == 0);
	}

done:
	/*
	 * BUG FIX: the original never closed kq on any path, leaking one
	 * fd per call (the RX counterpart already closed it).
	 */
	close(kq);
	return i;
}
1880
1881 /*
1882 * Bulk Tx makes sure all Tx operations are completed; otherwise fails the test.
1883 */
1884 void
sktu_channel_port_tx_bulk(channel_port_t port,struct sktu_frame ** frames,uint32_t n)1885 sktu_channel_port_tx_bulk(channel_port_t port, struct sktu_frame **frames,
1886 uint32_t n)
1887 {
1888 uint32_t ret = 0;
1889 ret = sktu_channel_port_tx_burst(port, frames, n);
1890 assert(ret < n);
1891 if (ret != n) {
1892 errx(EX_OSERR, "tx bulk failed %u/%u", n, ret);
1893 }
1894 }
1895
1896 int
sktu_parse_ipv4_frame(struct sktu_frame * frame,void * ip_payload,uint32_t * ip_payload_len)1897 sktu_parse_ipv4_frame(struct sktu_frame *frame, void *ip_payload,
1898 uint32_t *ip_payload_len)
1899 {
1900 size_t pkt_len, payload_len;
1901 void *buf;
1902 struct ip *ip;
1903 uint16_t csum;
1904
1905 buf = &frame->bytes[0];
1906 ip = (struct ip*)buf;
1907 pkt_len = frame->len;
1908 assert(pkt_len == ntohs(ip->ip_len));
1909 payload_len = pkt_len - sizeof(*ip);
1910 assert(payload_len <= SKTU_FRAME_BUF_SIZE);
1911
1912 /* verify ip header checksum */
1913 csum = in_cksum(ip, sizeof(*ip), 0);
1914 if (csum != 0) {
1915 sktu_dump_buffer(stderr, __func__, buf, pkt_len);
1916 errx(EX_PROTOCOL, "IP header checksum invalid");
1917 }
1918
1919 if (ip_payload != NULL) { /* copy the data */
1920 memcpy(ip_payload, buf + sizeof(*ip), pkt_len - sizeof(*ip));
1921 }
1922
1923 *ip_payload_len = payload_len;
1924 return 0;
1925 }
1926
/*
 * Validate a TCP/IPv4 frame and optionally extract the TCP payload.
 *
 * Checks, in order: IP protocol is TCP (EINVAL otherwise), the frame
 * length matches the IP total length, the IP header checksum is valid
 * (test aborts otherwise), and the TCP checksum (including the
 * pseudo-header) is valid (-1 otherwise). On success copies the payload
 * into `tcp_payload` when non-NULL, stores its length, and returns 0.
 */
int
sktu_parse_tcp4_frame(struct sktu_frame *frame, void *tcp_payload,
    uint32_t *tcp_payload_len)
{
	uint32_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_tcp_header_t *ip_tcp;     /* same bytes viewed as IP+TCP header */
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_tcp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_TCP) {
		sktu_dump_buffer(stderr, "non-TCP packet", buf, pkt_len);
		return EINVAL;
	}
	assert(pkt_len == ntohs(ip_tcp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_tcp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* A valid IPv4 header checksums to zero over itself. */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	/*
	 * TCP checksum: sum the TCP segment, then fold in the
	 * pseudo-header (src, dst, and htonl(length + proto)); a valid
	 * checksum yields 0xffff before the final XOR.
	 */
	csum = os_inet_checksum(&ip_tcp->tcp, pkt_len - sizeof(struct ip), 0);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    csum + htonl(payload_len + sizeof(struct tcphdr) + IPPROTO_TCP));
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, "invalid TCP csum", buf, pkt_len);
		return -1;
	}

	if (tcp_payload != NULL) { /* copy the data */
		memcpy(tcp_payload, buf + sizeof(*ip_tcp), payload_len);
	}

	*tcp_payload_len = payload_len;

	return 0;
}
1972
/*
 * Validate a UDP/IPv4 frame and optionally extract the UDP payload.
 *
 * Checks, in order: IP protocol is UDP (EINVAL otherwise), the frame
 * length matches the IP total length, the IP header checksum is valid
 * (test aborts otherwise), and — unless uh_sum is 0, meaning "no UDP
 * checksum" — the UDP checksum over segment plus pseudo-header is valid
 * (-1 otherwise). On success copies the payload into `udp_payload` when
 * non-NULL, stores its length, and returns 0.
 */
int
sktu_parse_udp4_frame(struct sktu_frame *frame, void *udp_payload,
    uint32_t *udp_payload_len)
{
	size_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_udp_header_t *ip_udp;     /* same bytes viewed as IP+UDP header */
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_udp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_UDP) {
		sktu_dump_buffer(stderr,
		    "sktu_parse_udp4_frame: non-UDP packet", buf, pkt_len);
		return EINVAL;
	}
	assert(pkt_len == ntohs(ip_udp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_udp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* A valid IPv4 header checksums to zero over itself. */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	/* uh_sum == 0 means the sender computed no UDP checksum (legal
	 * for UDP over IPv4); nothing to verify. */
	if (ip_udp->udp.uh_sum == 0) {
		goto skip_udp_checksum;
	}

	/*
	 * UDP checksum over the segment plus the pseudo-header.
	 * NOTE(review): the length+proto term here uses htons() and is
	 * added into the 16-bit csum (dropping any carry), whereas the
	 * TCP sibling above folds it through in_pseudo() via htonl() —
	 * TODO confirm the truncation here is intentional.
	 */
	csum = os_inet_checksum(&ip_udp->udp, pkt_len - sizeof(struct ip), 0);
	csum += htons(payload_len + sizeof(struct udphdr) + IPPROTO_UDP);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, csum);
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		return -1;
	}

skip_udp_checksum:
	if (udp_payload != NULL) {
		memcpy(udp_payload, buf + sizeof(*ip_udp), payload_len);
	}

	*udp_payload_len = payload_len;

	return 0;
}
2024
2025 /*
2026 * Rx once from an available ring;
2027 * Return 0, if successful; non-zero, otherwise.
2028 */
2029 struct sktu_frame *
sktu_channel_port_rx(channel_port_t port)2030 sktu_channel_port_rx(channel_port_t port)
2031 {
2032 int error;
2033 slot_prop_t prop;
2034 channel_slot_t slot;
2035 struct sktu_frame *frame;
2036 packet_t pkt;
2037 void *addr, *buf;
2038 size_t buf_len;
2039 size_t frame_length;
2040 buflet_t buflet;
2041
2042 slot = os_channel_get_next_slot(port->rx_ring, NULL, &prop);
2043 if (slot == NULL) {
2044 return NULL;
2045 }
2046 assert(prop.sp_buf_ptr != 0);
2047
2048 frame = sktu_frame_alloc();
2049
2050 pkt = os_channel_slot_get_packet(port->rx_ring, slot);
2051 assert(pkt != 0);
2052 if (port->user_packet_pool) {
2053 error = os_channel_slot_detach_packet(port->rx_ring,
2054 slot, pkt);
2055 SKTC_ASSERT_ERR(error == 0);
2056 }
2057
2058 buflet = os_packet_get_next_buflet(pkt, NULL);
2059 assert(buflet != NULL);
2060 buf = os_buflet_get_object_address(buflet) +
2061 os_buflet_get_data_offset(buflet);
2062 frame_length = os_packet_get_data_length(pkt);
2063
2064 buflet = os_packet_get_next_buflet(pkt, NULL);
2065 assert(buflet != NULL);
2066 buf = os_buflet_get_object_address(buflet) +
2067 os_buflet_get_data_offset(buflet);
2068 buf_len = os_buflet_get_data_length(buflet);
2069 assert(buf_len < SKTU_FRAME_BUF_SIZE);
2070
2071 frame->len = os_packet_get_data_length(pkt);
2072
2073 addr = &frame->bytes[0];
2074 memcpy(addr, buf, buf_len);
2075 frame_length -= buf_len;
2076
2077 while (frame_length != 0) {
2078 buflet = os_packet_get_next_buflet(pkt, buflet);
2079 assert(buflet != NULL);
2080 buf = os_buflet_get_object_address(buflet) +
2081 os_buflet_get_data_offset(buflet);
2082 assert(buf != 0);
2083 buf_len = os_buflet_get_data_length(buflet);
2084 assert(buf_len != 0);
2085 memcpy(addr, buf, buf_len);
2086 addr += buf_len;
2087 frame_length -= buf_len;
2088 }
2089
2090 os_packet_get_flow_uuid(pkt, &frame->flow_uuid);
2091 error = os_channel_packet_free(port->chan, pkt);
2092
2093 error = os_channel_advance_slot(port->rx_ring, slot);
2094 SKTC_ASSERT_ERR(error == 0);
2095
2096 return frame;
2097 }
2098
/*
 * Receive up to n frames from the channel port, waiting up to 10 seconds
 * (via kqueue) for RX to become readable.
 * Returns the number of frames actually received (0 on timeout/EAGAIN).
 */
uint32_t
sktu_channel_port_rx_burst(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};

	int error;
	struct kevent evlist, kev;
	int kq;
	uint32_t i = 0;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for RX to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error <= 0) {
		if (errno == EAGAIN) {
			goto out;	/* fix: was leaking kq on early return */
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		goto out;	/* fix: was leaking kq on early return */
	}
	if (evlist.flags & EV_ERROR) {
		int err = (int)evlist.data;
		if (err == EAGAIN) {
			goto out;	/* fix: was leaking kq on early return */
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_READ) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	for (i = 0; i < n; i++) {
		frames[i] = sktu_channel_port_rx(port);
		if (frames[i] == NULL) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_RX);
		SKTC_ASSERT_ERR(error == 0);
	}

out:
	close(kq);
	return i;
}
2160
2161 void
sktu_channel_port_rx_bulk(channel_port_t port,struct sktu_frame ** frames,uint32_t n)2162 sktu_channel_port_rx_bulk(channel_port_t port, struct sktu_frame **frames,
2163 uint32_t n)
2164 {
2165 uint32_t ret = 0;
2166 ret = sktu_channel_port_rx_burst(port, frames, n);
2167 assert(ret < n);
2168 if (ret != n) {
2169 errx(EX_OSERR, "rx bulk failed, %u/%u packets", n, ret);
2170 }
2171 }
2172
2173 /*
2174 * Received batch of frames from utun file descriptor.
2175 *
2176 * Returns number of frames actually received.
2177 */
2178 uint32_t
sktu_utun_fd_rx_burst(int utun_fd,struct sktu_frame ** frames,uint32_t n)2179 sktu_utun_fd_rx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
2180 {
2181 struct timeval timeout = {
2182 .tv_sec = 10,
2183 .tv_usec = 0,
2184 };
2185
2186 fd_set readfds, errorfds;
2187 int retval;
2188
2189 FD_ZERO(&readfds);
2190 FD_ZERO(&errorfds);
2191 FD_SET(utun_fd, &readfds);
2192 FD_SET(utun_fd, &errorfds);
2193
2194 retval = select(utun_fd + 1, &readfds, NULL, &errorfds, &timeout);
2195 if (retval == -1) {
2196 err(EX_OSERR, "select()");
2197 }
2198
2199 if (!FD_ISSET(utun_fd, &readfds) && retval == 0) { // timeout
2200 T_LOG("recv timeout\n");
2201 return 0;
2202 }
2203 assert(!FD_ISSET(utun_fd, &errorfds));
2204 assert(retval == 1);
2205
2206 if (!FD_ISSET(utun_fd, &readfds)) {
2207 errx(EX_OSERR, "fd selected but no read fd available");
2208 }
2209
2210 uint32_t i = 0;
2211 for (i = 0; i < n; i++) {
2212 struct {
2213 uint32_t af;
2214 char bytes[SKTU_FRAME_BUF_SIZE];
2215 } utun_packet;
2216 ssize_t len;
2217 len = read(utun_fd, &utun_packet, sizeof(utun_packet));
2218 if (len < 1) {
2219 errx(EX_OSERR, "utun read 0 len");
2220 }
2221 struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
2222 memcpy(frame->bytes, &utun_packet.bytes, len - sizeof(uint32_t));
2223 frame->len = len - sizeof(uint32_t);
2224 }
2225
2226 return i;
2227 }
2228
2229 void
sktu_utun_fd_tx_burst(int utun_fd,struct sktu_frame ** frames,uint32_t n)2230 sktu_utun_fd_tx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
2231 {
2232 struct timeval timeout = {
2233 .tv_sec = 10,
2234 .tv_usec = 0,
2235 };
2236 fd_set writefds, errorfds;
2237 int retval;
2238
2239 FD_ZERO(&writefds);
2240 FD_ZERO(&errorfds);
2241 FD_SET(utun_fd, &writefds);
2242 FD_SET(utun_fd, &errorfds);
2243
2244 retval = select(utun_fd + 1, NULL, &writefds, &errorfds, &timeout);
2245 if (retval == -1) {
2246 err(EX_OSERR, "select()");
2247 }
2248
2249 if (!FD_ISSET(utun_fd, &writefds) && retval == 0) { // timeout
2250 err(EX_OSERR, "recv timeout\n");
2251 }
2252
2253 assert(!FD_ISSET(utun_fd, &errorfds));
2254 assert(retval == 1);
2255
2256 if (!FD_ISSET(utun_fd, &writefds)) {
2257 errx(EX_OSERR, "fd selected but no write fd available");
2258 }
2259
2260 uint32_t i = 0;
2261 for (i = 0; i < n; i++) {
2262 struct sktu_frame *frame = frames[i];
2263 struct ip *ip = (void *)&frame->bytes[0];
2264 uint32_t af;
2265 switch (ip->ip_v) {
2266 case IPVERSION:
2267 af = htonl(AF_INET);
2268 break;
2269 case IPV6_VERSION:
2270 af = htonl(AF_INET6);
2271 break;
2272 default:
2273 assert("unrecoginzed IP version");
2274 __builtin_unreachable();
2275 break;
2276 }
2277 struct {
2278 uint32_t af;
2279 char bytes[SKTU_FRAME_BUF_SIZE];
2280 } utun_packet;
2281 memcpy(&utun_packet.af, &af, sizeof(af));
2282 memcpy(&utun_packet.bytes, &frame->bytes[0], frame->len);
2283 ssize_t write_len = frame->len + sizeof(uint32_t);
2284 T_LOG("%s writing frame len %zu\n", __func__, write_len);
2285 ssize_t len = write(utun_fd, &utun_packet, write_len);
2286 if (len != write_len) {
2287 err(EX_OSERR, "utun write error\n");
2288 }
2289 }
2290 }
2291
2292 struct sktu_frame *
sktu_frame_alloc()2293 sktu_frame_alloc()
2294 {
2295 return malloc(sizeof(struct sktu_frame));
2296 }
2297
/*
 * Free a frame and NULL the caller's pointer expression to guard against
 * use-after-free / double-free.  Macro (not a function) so the assignment
 * writes through to the caller's variable.
 */
#define sktu_frame_free(frame) \
    do { \
	free(frame); \
	frame = NULL; \
    } while (0)
2303
/*
 * Release every frame in the array, leaving each slot NULLed out.
 */
void
sktu_frames_free(struct sktu_frame **frames, size_t n)
{
	struct sktu_frame **end = frames + n;

	while (frames < end) {
		sktu_frame_free(*frames);
		*frames = NULL;
		frames++;
	}
}
2312
/*
 * Build one or more IPv4 frames carrying `sdu` (an upper-layer datagram of
 * sdu_len bytes) from src_ip to dst_ip, fragmenting at `mtu` as needed.
 * Newly allocated frames are stored into frames[0..], and the count used is
 * returned (asserts it never exceeds n).  csum_* describe partial-checksum
 * offload offsets relative to the IP payload; offload is only legal when the
 * datagram fits in a single frame.
 */
size_t
sktu_create_ip_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id();	/* shared IP ID across all fragments */
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);	/* caller must size frames[] for worst case */

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip *ip = (struct ip *)baddr;
		size_t dlen;
		bool more_frag = false;

		/* payload room after the (option-less) IPv4 header */
		dlen = mtu - sizeof(*ip);
		if (dlen >= remaining_sdu_len) {
			/* final (or only) fragment */
			dlen = remaining_sdu_len;
			needs_frag = false;
			more_frag = false;
		} else {
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		memset(ip, 0, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(struct ip) >> 2;	/* header in 32-bit words */
		ip->ip_ttl = MAXTTL;
		ip->ip_p = proto;
		memcpy(&ip->ip_src, src_ip, sizeof(struct in_addr));
		memcpy(&ip->ip_dst, dst_ip, sizeof(struct in_addr));
		ip->ip_len = htons(sizeof(*ip) + dlen);
		ip->ip_id = htons(ip_id);
		/* fragment offset field is in 8-byte units */
		ip->ip_off = ((off >> 3) & IP_OFFMASK);
		if (more_frag) {
			ip->ip_off |= IP_MF;
		}
		ip->ip_off = htons(ip->ip_off);

		/* compute the IP header checksum */
		ip->ip_sum = in_cksum(ip, sizeof(*ip), 0);
		baddr += sizeof(*ip);

		memcpy(baddr, sdu + off, dlen);

		/* offload offsets are frame-relative, so add the header size */
		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip) + csum_start;
		frame->csum_stuff = sizeof(*ip) + csum_stuff;

		frame->len = sizeof(*ip) + dlen;

		off += dlen;
		remaining_sdu_len -= dlen;
		i++;
	}

	return i;
}
2380
/*
 * Build one or more IPv6 frames carrying `sdu` from src_ip to dst_ip,
 * inserting an IPv6 fragment header when the datagram exceeds `mtu`.
 * Newly allocated frames are stored into frames[0..]; returns the count.
 * csum_* are offload offsets relative to the payload; offload is only
 * legal when no fragmentation occurs.
 */
size_t
sktu_create_ip6_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id();	/* shared fragment identification */
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);	/* caller must size frames[] for worst case */

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip6_hdr *ip6 = (struct ip6_hdr *)baddr;
		size_t hlen = sizeof(*ip6);
		size_t plen, dlen;
		bool more_frag = false;

		dlen = mtu - hlen;
		if (dlen >= remaining_sdu_len) {
			// fits in one packet
			/*
			 * NOTE(review): when this is the LAST fragment of a
			 * fragmented datagram (needs_frag already true), plen
			 * here excludes sizeof(struct ip6_frag) even though a
			 * fragment header is still emitted below — verify the
			 * intended on-wire ip6_plen for the final fragment.
			 */
			dlen = plen = remaining_sdu_len;
			remaining_sdu_len = 0;
			more_frag = false;
		} else {
			// need to fragment
			dlen -= sizeof(struct ip6_frag);
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			plen = sizeof(struct ip6_frag) + dlen;
			remaining_sdu_len -= dlen;
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		// insert ipv6 header
		memset(ip6, 0, sizeof(*ip6));
		ip6->ip6_vfc = (IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_plen = htons(plen);
		ip6->ip6_nxt = needs_frag ? IPPROTO_FRAGMENT : proto;
		ip6->ip6_hlim = IPV6_DEFHLIM;
		memcpy(&ip6->ip6_src, src_ip, sizeof(struct in6_addr));
		memcpy(&ip6->ip6_dst, dst_ip, sizeof(struct in6_addr));

		baddr += sizeof(*ip6);

		// insert ipv6 frag header
		if (needs_frag) {
			struct ip6_frag *ip6f = (struct ip6_frag *)baddr;
			ip6f->ip6f_nxt = proto;
			ip6f->ip6f_reserved = 0;
			/* off is a multiple of 8, so its low 3 bits are free
			 * for the flags packed into ip6f_offlg */
			ip6f->ip6f_offlg = htons(off);
			if (more_frag) {
				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
			}
			ip6f->ip6f_ident = htonl(ip_id);

			hlen += sizeof(*ip6f);
			baddr += sizeof(*ip6f);
		}

		memcpy(baddr, sdu + off, dlen);

		/* offload offsets are frame-relative; note they do not account
		 * for the fragment header (offload asserted off when fragmenting) */
		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip6) + csum_start;
		frame->csum_stuff = sizeof(*ip6) + csum_stuff;
		frame->len = hlen + dlen;

		off += dlen;
		i++;
	}

	return i;
}
2459
2460 size_t
sktu_create_tcp_frames(struct sktu_frame ** frames,size_t n,uint8_t ipver,void * src_ip,void * dst_ip,uint16_t sport,uint16_t dport,const void * data,size_t data_len,size_t mtu,bool csum_offload)2461 sktu_create_tcp_frames(struct sktu_frame **frames, size_t n,
2462 uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
2463 const void *data, size_t data_len, size_t mtu, bool csum_offload)
2464 {
2465 uint32_t n_frames;
2466 size_t sdu_len = data_len + sizeof(struct tcphdr);
2467 void *sdu = malloc(sdu_len);
2468
2469 // populate header
2470 struct tcphdr *tcp = (struct tcphdr *)sdu;
2471 tcp->th_sport = htons(sport);
2472 tcp->th_dport = htons(dport);
2473 tcp->th_flags |= 0; //FIXME (connect ? TH_SYN : TH_RST);
2474 tcp->th_off = (sizeof(struct tcphdr)) >> 2;
2475
2476 // copy payload
2477 memcpy(sdu + sizeof(*tcp), data, data_len);
2478
2479 // compute checksum
2480 uint16_t sum = 0;
2481
2482 if (ipver == IPVERSION) {
2483 sum = in_pseudo(*(uint32_t*)src_ip, *(uint32_t*)dst_ip,
2484 htons(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
2485 } else {
2486 sum = in6_pseudo(src_ip, dst_ip,
2487 htonl(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
2488 }
2489 tcp->th_sum = sum;
2490
2491 uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
2492 if (csum_offload) {
2493 csum_flags = PACKET_CSUM_PARTIAL;
2494 csum_start = 0;
2495 csum_stuff = offsetof(struct tcphdr, th_sum);
2496 } else {
2497 sum = os_inet_checksum(sdu, sdu_len, 0);
2498 tcp->th_sum = sktu_fold_sum_final(sum);
2499 }
2500
2501 // IP framing
2502 if (ipver == IPVERSION) {
2503 n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
2504 IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
2505 csum_stuff);
2506 } else {
2507 n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
2508 IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
2509 csum_stuff);
2510 }
2511
2512 free(sdu);
2513
2514 return n_frames;
2515 }
2516
/*
 * Build UDP/IP frames carrying `data` between the given endpoints.
 * Constructs the UDP header + payload SDU, sets either partial-checksum
 * offload metadata (pre-seeded with the pseudo-header sum) or a complete
 * checksum, then delegates IP framing to sktu_create_ip[6]_frames().
 * Returns the number of frames created.
 */
size_t
sktu_create_udp_frames(struct sktu_frame **frames, size_t n,
    uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
    const void *data, size_t data_len, size_t mtu, bool csum_offload)
{
	uint32_t n_frames;
	size_t sdu_len = data_len + sizeof(struct udphdr);
	void *sdu = malloc(sdu_len);	/* NOTE(review): result unchecked */

	// populate header
	struct udphdr *udp = (struct udphdr *)sdu;
	udp->uh_sport = htons(sport);
	udp->uh_dport = htons(dport);
	udp->uh_ulen = htons(sizeof(*udp) + data_len);

	// compute payload checksum
	uint32_t payload_sum = 0, pseudo_sum = 0;
	if (ipver == IPVERSION) {
		struct ipv4_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		/*
		 * NOTE(review): this checksums sizeof(udp_pseudo) +
		 * sizeof(struct udphdr) bytes starting at a local struct,
		 * which reads past udp_pseudo unless ipv4_udp_pseudo_hdr
		 * deliberately reserves trailing space for the UDP header —
		 * verify the struct's definition.
		 */
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	} else {
		struct ipv6_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in6_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in6_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		/* NOTE(review): same span concern as the IPv4 branch above */
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	}

	uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
	if (csum_offload) {
		/* hardware finishes the sum; seed uh_sum with the pseudo-sum */
		csum_flags = PACKET_CSUM_PARTIAL | PACKET_CSUM_ZERO_INVERT;
		csum_start = 0;
		csum_stuff = offsetof(struct udphdr, uh_sum);
		udp->uh_sum = sktu_fold_sum_final(pseudo_sum);
	} else {
		/* complete checksum: pseudo-header + payload, final complement */
		payload_sum = os_inet_checksum(data, data_len, 0);
		udp->uh_sum = ~sktu_fold_sum_final(pseudo_sum + payload_sum);
	}

	// copy payload
	memcpy(sdu + sizeof(*udp), data, data_len);

	// IP framing
	if (ipver == IPVERSION) {
		n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	} else {
		n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	}

	free(sdu);

	return n_frames;
}
2581
/*
 * Stamp each frame in the array with the flow's UUID so the flowswitch can
 * associate the frames with the registered flow.
 */
void
sktu_attach_flow_metadata_to_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n_frames)
{
	/* fix: index is size_t to match the size_t bound (was uint32_t) */
	for (size_t i = 0; i < n_frames; i++) {
		struct sktu_frame *frame = frames[i];
		uuid_copy(frame->flow_uuid, flow->uuid);
	}
}
2591
2592 static size_t
_sktu_create_udp_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2593 _sktu_create_udp_flow_input_frames(struct sktu_flow *flow,
2594 struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2595 {
2596 n = sktu_create_udp_frames(frames, n, flow->ipver, flow->dst_ip,
2597 flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
2598 NO_CSUM_OFFLOAD);
2599 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2600 return n;
2601 }
2602
2603 static size_t
_sktu_create_udp_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2604 _sktu_create_udp_flow_output_frames(struct sktu_flow *flow,
2605 struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
2606 bool csum_offload)
2607 {
2608 n = sktu_create_udp_frames(frames, n, flow->ipver, flow->src_ip,
2609 flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
2610 csum_offload);
2611 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2612 return n;
2613 }
2614
2615 static size_t
_sktu_create_tcp_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2616 _sktu_create_tcp_flow_input_frames(struct sktu_flow *flow,
2617 struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2618 {
2619 n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->dst_ip,
2620 flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
2621 NO_CSUM_OFFLOAD);
2622 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2623 return n;
2624 }
2625
2626 static size_t
_sktu_create_tcp_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2627 _sktu_create_tcp_flow_output_frames(struct sktu_flow *flow,
2628 struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
2629 bool csum_offload)
2630 {
2631 n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->src_ip,
2632 flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
2633 csum_offload);
2634 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2635 return n;
2636 }
2637
2638 static size_t
_sktu_create_ip_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2639 _sktu_create_ip_flow_input_frames(struct sktu_flow *flow,
2640 struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2641 {
2642 n = sktu_create_ip_frames(frames, n, flow->dst_ip, flow->src_ip,
2643 flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
2644 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2645 return n;
2646 }
2647
2648 static size_t
_sktu_create_ip_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2649 _sktu_create_ip_flow_output_frames(struct sktu_flow *flow,
2650 struct sktu_frame **frames, size_t n, const void *data,
2651 size_t data_len, bool csum_offload)
2652 {
2653 n = sktu_create_ip_frames(frames, n, flow->src_ip, flow->dst_ip,
2654 flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
2655 sktu_attach_flow_metadata_to_frames(flow, frames, n);
2656 return n;
2657 }
2658
#define SKTU_STRING_BUF_MAX 2048
/*
 * Format a flow request as a human-readable string.
 * Returns a pointer to a static buffer (not thread-safe; each call
 * overwrites the previous result).
 */
char *
sktu_nfr_to_string(struct nx_flow_req *nfr)
{
	static char buf[SKTU_STRING_BUF_MAX];
	uuid_string_t uuidstr;
	/*
	 * fix: address buffers were 31 bytes, smaller than INET6_ADDRSTRLEN
	 * (46); a long IPv6 address made inet_ntop fail with ENOSPC and left
	 * the buffer uninitialized for the snprintf below.
	 */
	char sa_buf[INET6_ADDRSTRLEN];
	char da_buf[INET6_ADDRSTRLEN];

	uuid_unparse(nfr->nfr_flow_uuid, uuidstr);
	if (nfr->nfr_saddr.sa.sa_family == AF_INET) {
		inet_ntop(AF_INET, &nfr->nfr_saddr.sin.sin_addr.s_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET, &nfr->nfr_daddr.sin.sin_addr.s_addr, da_buf,
		    sizeof(da_buf));
	} else {
		inet_ntop(AF_INET6, &nfr->nfr_saddr.sin6.sin6_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET6, &nfr->nfr_daddr.sin6.sin6_addr, da_buf,
		    sizeof(da_buf));
	}
	snprintf(buf, sizeof(buf),
	    "nx_port[%d] %s src=%s,dst=%s,proto=%d,sport=%d,dport=%d, flags=0x%x",
	    nfr->nfr_nx_port, uuidstr, sa_buf, da_buf, nfr->nfr_ip_protocol,
	    ntohs(nfr->nfr_saddr.sin.sin_port),
	    ntohs(nfr->nfr_daddr.sin.sin_port), nfr->nfr_flags);

	return buf;
}
2688
2689 char *
sktu_flow_to_string(struct sktu_flow * flow)2690 sktu_flow_to_string(struct sktu_flow *flow)
2691 {
2692 return sktu_nfr_to_string(&flow->nfr);
2693 }
2694
/*
 * Register a flow with the nexus flowswitch and return a wrapper describing
 * it.  Fills an nx_flow_req from the (af, src, dst, proto, sport, dport)
 * tuple, submits it via __os_nexus_flow_add(), and on success records the
 * resolved tuple plus per-protocol frame-generator callbacks on the flow.
 * Returns NULL (with the request logged) if the kernel rejects the flow.
 * Caller owns the result and releases it with _sktu_destroy_nexus_flow().
 */
struct sktu_flow *
_sktu_create_nexus_flow(sktu_nexus_t nexus, nexus_port_t nx_port,
    uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
    uint16_t dport, uint32_t flags)
{
	struct sktu_flow *flow = malloc(sizeof(*flow));	/* NOTE(review): unchecked */

	memset(flow, 0, sizeof(*flow));
	flow->nexus = nexus;
	flow->mtu = 1500;	/* default Ethernet MTU */

	flow->nx_port = nx_port;

	struct nx_flow_req *nfr = &flow->nfr;
	union sockaddr_in_4_6 *saddr = &nfr->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &nfr->nfr_daddr;
	nfr->nfr_nx_port = nx_port;
	if (af == AF_INET) {
		// initialize flow
		flow->ipver = IPVERSION;
		// fill in nfr (stuff in network order :)
		SIN(saddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(daddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(saddr)->sin_family = AF_INET;
		SIN(daddr)->sin_family = AF_INET;
		SIN(saddr)->sin_addr = *(struct in_addr *)src;
		SIN(daddr)->sin_addr = *(struct in_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN(saddr)->sin_port = htons(sport);
		SIN(daddr)->sin_port = htons(dport);
	} else {
		flow->ipver = IPV6_VERSION;
		SIN6(saddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(daddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(saddr)->sin6_family = AF_INET6;
		SIN6(daddr)->sin6_family = AF_INET6;
		SIN6(saddr)->sin6_addr = *(struct in6_addr *)src;
		SIN6(daddr)->sin6_addr = *(struct in6_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN6(saddr)->sin6_port = htons(sport);
		SIN6(daddr)->sin6_port = htons(dport);
	}

	uuid_generate_random(nfr->nfr_flow_uuid);
	nfr->nfr_flags = flags;

	errno = 0;
	int error = __os_nexus_flow_add(nexus->controller, nexus->fsw_nx_uuid, nfr);
	if (error) {
		T_LOG("Failed flow %s\n", sktu_nfr_to_string(nfr));
		free(flow);
		return NULL;
	}

	/* convenience aliases: src_ip/dst_ip point INTO flow->nfr, so they
	 * stay valid exactly as long as the flow object itself */
	if (af == AF_INET) {
		flow->src_ip = &SIN(saddr)->sin_addr;
		flow->dst_ip = &SIN(daddr)->sin_addr;
		flow->sport = ntohs(SIN(saddr)->sin_port);
		flow->dport = ntohs(SIN(daddr)->sin_port);
	} else {
		flow->src_ip = &SIN6(saddr)->sin6_addr;
		flow->dst_ip = &SIN6(daddr)->sin6_addr;
		flow->sport = ntohs(SIN6(saddr)->sin6_port);
		flow->dport = ntohs(SIN6(daddr)->sin6_port);
	}

	flow->ip_protocol = proto;
	uuid_copy(flow->uuid, nfr->nfr_flow_uuid);

	/* pick frame generators matching the flow's transport protocol */
	switch (proto) {
	case IPPROTO_UDP:
		flow->create_input_frames = _sktu_create_udp_flow_input_frames;
		flow->create_output_frames = _sktu_create_udp_flow_output_frames;
		break;
	case IPPROTO_TCP:
		flow->create_input_frames = _sktu_create_tcp_flow_input_frames;
		flow->create_output_frames = _sktu_create_tcp_flow_output_frames;
		break;
	default:
		/* raw IP fallback for all other protocols */
		flow->create_input_frames = _sktu_create_ip_flow_input_frames;
		flow->create_output_frames = _sktu_create_ip_flow_output_frames;
	}

	assert(nfr->nfr_nx_port != NEXUS_PORT_ANY);

	T_LOG("Created flow %s\n", sktu_nfr_to_string(nfr));

	return flow;
}
2784
2785 struct sktu_flow *
sktu_create_nexus_flow(sktu_nexus_t nexus,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2786 sktu_create_nexus_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
2787 uint8_t proto, uint16_t sport, uint16_t dport)
2788 {
2789 return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, 0);
2790 }
2791
2792 struct sktu_flow *
sktu_create_nexus_flow_with_nx_port(sktu_nexus_t nexus,nexus_port_t nx_port,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2793 sktu_create_nexus_flow_with_nx_port(sktu_nexus_t nexus, nexus_port_t nx_port,
2794 uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
2795 uint16_t dport)
2796 {
2797 return _sktu_create_nexus_flow(nexus, nx_port, af, src, dst, proto, sport, dport, 0);
2798 }
2799
2800 struct sktu_flow *
sktu_create_nexus_low_latency_flow(sktu_nexus_t nexus,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2801 sktu_create_nexus_low_latency_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
2802 uint8_t proto, uint16_t sport, uint16_t dport)
2803 {
2804 return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, NXFLOWREQF_LOW_LATENCY);
2805 }
2806
2807 void
_sktu_destroy_nexus_flow(struct sktu_flow * flow)2808 _sktu_destroy_nexus_flow(struct sktu_flow *flow)
2809 {
2810 sktu_nexus_t nexus = flow->nexus;
2811 struct nx_flow_req *nfr = &flow->nfr;
2812
2813 int error = __os_nexus_flow_del(nexus->controller, nexus->fsw_nx_uuid, nfr);
2814 SKTC_ASSERT_ERR(!error);
2815 if (error) {
2816 T_LOG("failed to deling flow %s", sktu_nfr_to_string(nfr));
2817 }
2818
2819 free(flow);
2820 }
2821
/*
 * Look up the sk_stats_flow record for a flow UUID via sysctl.
 * Copies the record into *sf and returns 0 if found, ENOENT otherwise.
 */
int
sktu_get_nexus_flow_stats(uuid_t flow_uuid, struct sk_stats_flow *sf)
{
	size_t length = 0;
	void *buffer = NULL;
	int ret = sysctl_buf(SK_STATS_FLOW, &buffer, &length, NULL, 0);
	int result = ENOENT;

	assert(ret == 0);
	assert(buffer != NULL && length != 0);

	assert((length % sizeof(*sf)) == 0);

	struct sk_stats_flow *iter;
	for (iter = buffer; (void *)iter < buffer + length; iter++) {
		if (uuid_compare(iter->sf_uuid, flow_uuid) == 0) {
			*sf = *iter;
			result = 0;
			break;
		}
	}
	/*
	 * fix: sysctl_buf hands ownership of the buffer to the caller (see
	 * sktu_get_nexus_flowswitch_stats, which returns it); both return
	 * paths previously leaked it.
	 */
	free(buffer);
	return result;
}
2842
2843 int
sktu_get_nexus_flowswitch_stats(struct sk_stats_flow_switch ** sfsw,size_t * len)2844 sktu_get_nexus_flowswitch_stats(struct sk_stats_flow_switch **sfsw, size_t *len)
2845 {
2846 int ret;
2847 void *buffer = NULL;
2848 size_t length = 0;
2849 size_t width = sizeof(struct sk_stats_flow_switch);
2850
2851 ret = sysctl_buf(SK_STATS_FLOW_SWITCH, &buffer, &length, NULL, 0);
2852 if (ret != 0 || buffer == NULL || length == 0) {
2853 return ret;
2854 }
2855 if ((length % width) != 0) {
2856 T_LOG("Error, mismatching sk_stats_flow_switch, quit\n");
2857 exit(EX_OSERR);
2858 }
2859
2860 *sfsw = (struct sk_stats_flow_switch *)buffer;
2861 *len = length;
2862
2863 return 0;
2864 }
2865
2866 void
__fsw_stats_print(struct fsw_stats * s)2867 __fsw_stats_print(struct fsw_stats *s)
2868 {
2869 int i;
2870
2871 for (i = 0; i < __FSW_STATS_MAX; i++) {
2872 if (STATS_VAL(s, i) == 0) {
2873 continue;
2874 }
2875 os_log(OS_LOG_DEFAULT, "\t%-24s: %llu\n",
2876 fsw_stats_str(i), STATS_VAL(s, i));
2877 }
2878 }
2879