xref: /xnu-11215.1.10/tests/skywalk/skywalk_test_utils.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585) !
1 /*
2  * Copyright (c) 2016-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /* This file contains useful utility routines that, unlike those in
30  * skywalk_test_common, do not operate on a single set of static objects.
31  */
32 
33 /*
34  * Copyright (c) 1988, 1992, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
38  */
39 
40 
41 #include <err.h>
42 #include <assert.h>
43 #include <inttypes.h>
44 #include <stdbool.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <time.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <poll.h>
51 #include <sys/event.h>
52 #include <uuid/uuid.h>
53 #include <arpa/inet.h>
54 #include <stddef.h>
55 #include <sysexits.h>
56 #include <sys/types.h>
57 #include <sys/sysctl.h>
58 #include <net/if_utun.h>
59 #include <net/if_ipsec.h>
60 #include <netinet/ip6.h>
61 #include <sys/kern_control.h>
62 #include <sys/ioctl.h>
63 #include <sys/socket.h>
64 #include <sys/kern_control.h>
65 #include <sys/sys_domain.h>
66 #include <ifaddrs.h>
67 #include <sys/fcntl.h>
68 #include <sys/kern_control.h>
69 #include <sys/sys_domain.h>
70 #include <net/if_utun.h>
71 #include <os/log.h>
72 
73 #include <net/pfkeyv2.h>
74 #include <netinet6/ipsec.h>
75 #include <darwintest.h>
76 
77 #include "skywalk_test_driver.h"
78 #include "skywalk_test_common.h" // XXX remove this
79 #include "skywalk_test_utils.h"
80 
81 #define SIN(s)          ((struct sockaddr_in *)(void *)s)
82 #define SIN6(s)          ((struct sockaddr_in6 *)(void *)s)
83 
84 void
sktc_build_nexus(nexus_controller_t ncd,struct sktc_nexus_attr * sktc_attr,uuid_t * providerp,uuid_t * instancep)85 sktc_build_nexus(nexus_controller_t ncd, struct sktc_nexus_attr *sktc_attr,
86     uuid_t *providerp, uuid_t *instancep)
87 {
88 	nexus_attr_t attr;
89 	int error;
90 	uint64_t scratch;
91 
92 	attr = os_nexus_attr_create();
93 	assert(attr);
94 
95 	if (sktc_attr->anonymous != -1) {
96 		error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS,
97 		    sktc_attr->anonymous);
98 		SKTC_ASSERT_ERR(!error);
99 	}
100 	if (sktc_attr->userchannel != -1) {
101 		error = os_nexus_attr_set(attr, NEXUS_ATTR_USER_CHANNEL,
102 		    sktc_attr->userchannel);
103 		SKTC_ASSERT_ERR(!error);
104 	}
105 	if (sktc_attr->ntxrings != -1) {
106 		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS,
107 		    sktc_attr->ntxrings);
108 		SKTC_ASSERT_ERR(!error);
109 	}
110 	if (sktc_attr->nrxrings != -1) {
111 		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS,
112 		    sktc_attr->nrxrings);
113 		SKTC_ASSERT_ERR(!error);
114 	}
115 	if (sktc_attr->ntxslots != -1) {
116 		error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS,
117 		    sktc_attr->ntxslots);
118 		SKTC_ASSERT_ERR(!error);
119 	}
120 	if (sktc_attr->nrxslots != -1) {
121 		error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS,
122 		    sktc_attr->nrxslots);
123 		SKTC_ASSERT_ERR(!error);
124 	}
125 	if (sktc_attr->slotsize != -1) {
126 		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE,
127 		    sktc_attr->slotsize);
128 		SKTC_ASSERT_ERR(!error);
129 	}
130 	if (sktc_attr->metasize != -1) {
131 		error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE,
132 		    sktc_attr->metasize);
133 		SKTC_ASSERT_ERR(error == ENOTSUP);
134 	}
135 	if (sktc_attr->maxfrags != -1) {
136 		error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
137 		    sktc_attr->maxfrags);
138 		SKTC_ASSERT_ERR(!error);
139 	}
140 	if (sktc_attr->rejectonclose != -1) {
141 		error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE,
142 		    sktc_attr->rejectonclose);
143 		SKTC_ASSERT_ERR(!error);
144 	}
145 
146 	uuid_clear(*providerp);
147 	error = os_nexus_controller_register_provider(ncd,
148 	    sktc_attr->name, sktc_attr->type, attr, providerp);
149 	SKTC_ASSERT_ERR(!error);
150 	assert(!uuid_is_null(*providerp));
151 
152 	/* Clear the parameters to make sure they are being read */
153 	error = os_nexus_attr_set(attr, NEXUS_ATTR_ANONYMOUS, -1);
154 	SKTC_ASSERT_ERR(!error);
155 	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_RINGS, -1);
156 	SKTC_ASSERT_ERR(!error);
157 	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_RINGS, -1);
158 	SKTC_ASSERT_ERR(!error);
159 	error = os_nexus_attr_set(attr, NEXUS_ATTR_TX_SLOTS, -1);
160 	SKTC_ASSERT_ERR(!error);
161 	error = os_nexus_attr_set(attr, NEXUS_ATTR_RX_SLOTS, -1);
162 	SKTC_ASSERT_ERR(!error);
163 	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, -1);
164 	SKTC_ASSERT_ERR(!error);
165 	error = os_nexus_attr_set(attr, NEXUS_ATTR_SLOT_META_SIZE, -1);
166 	SKTC_ASSERT_ERR(error == ENOTSUP);
167 	error = os_nexus_attr_set(attr, NEXUS_ATTR_EXTENSIONS, -1);
168 	SKTC_ASSERT_ERR(!error);
169 	error = os_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS, -1);
170 	SKTC_ASSERT_ERR(!error);
171 	error = os_nexus_attr_set(attr, NEXUS_ATTR_REJECT_ON_CLOSE, -1);
172 	SKTC_ASSERT_ERR(!error);
173 
174 	error = os_nexus_controller_read_provider_attr(ncd,
175 	    *providerp, attr);
176 	SKTC_ASSERT_ERR(!error);
177 
178 	scratch = -1;
179 	error = os_nexus_attr_get(attr, NEXUS_ATTR_ANONYMOUS, &scratch);
180 	SKTC_ASSERT_ERR(!error);
181 	assert(scratch != -1);
182 	assert(sktc_attr->anonymous == -1 || sktc_attr->anonymous == scratch);
183 
184 	scratch = -1;
185 	error = os_nexus_attr_get(attr, NEXUS_ATTR_USER_CHANNEL, &scratch);
186 	SKTC_ASSERT_ERR(!error);
187 	assert(scratch != -1);
188 	assert(sktc_attr->userchannel == -1 ||
189 	    sktc_attr->userchannel == scratch);
190 
191 	scratch = -1;
192 	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_RINGS, &scratch);
193 	SKTC_ASSERT_ERR(!error);
194 	assert(scratch != -1);
195 	assert(sktc_attr->ntxrings == -1 || sktc_attr->ntxrings == scratch);
196 
197 	scratch = -1;
198 	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_RINGS, &scratch);
199 	SKTC_ASSERT_ERR(!error);
200 	assert(scratch != -1);
201 	assert(sktc_attr->nrxrings == -1 || sktc_attr->nrxrings == scratch);
202 
203 	scratch = -1;
204 	error = os_nexus_attr_get(attr, NEXUS_ATTR_TX_SLOTS, &scratch);
205 	SKTC_ASSERT_ERR(!error);
206 	assert(scratch != -1);
207 	assert(sktc_attr->ntxslots == -1 || sktc_attr->ntxslots == scratch);
208 
209 	scratch = -1;
210 	error = os_nexus_attr_get(attr, NEXUS_ATTR_RX_SLOTS, &scratch);
211 	SKTC_ASSERT_ERR(!error);
212 	assert(scratch != -1);
213 	assert(sktc_attr->nrxslots == -1 || sktc_attr->nrxslots == scratch);
214 
215 	scratch = -1;
216 	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_BUF_SIZE, &scratch);
217 	SKTC_ASSERT_ERR(!error);
218 	assert(scratch != -1);
219 	assert(sktc_attr->slotsize == -1 || sktc_attr->slotsize == scratch);
220 
221 	scratch = -1;
222 	error = os_nexus_attr_get(attr, NEXUS_ATTR_SLOT_META_SIZE, &scratch);
223 	SKTC_ASSERT_ERR(!error);
224 	assert(scratch != -1);
225 	assert(sktc_attr->metasize == -1 || sktc_attr->metasize == scratch);
226 
227 	scratch = -1;
228 	error = os_nexus_attr_get(attr, NEXUS_ATTR_MAX_FRAGS, &scratch);
229 	SKTC_ASSERT_ERR(!error);
230 	assert(scratch != -1);
231 	assert(sktc_attr->maxfrags == -1 || sktc_attr->maxfrags == scratch);
232 
233 	scratch = -1;
234 	error = os_nexus_attr_get(attr, NEXUS_ATTR_REJECT_ON_CLOSE, &scratch);
235 	SKTC_ASSERT_ERR(!error);
236 	assert(scratch != -1);
237 	assert(sktc_attr->rejectonclose == -1 ||
238 	    sktc_attr->rejectonclose == scratch);
239 
240 	os_nexus_attr_destroy(attr);
241 
242 	if (instancep) {
243 		uuid_clear(*instancep);
244 		error = os_nexus_controller_alloc_provider_instance(ncd,
245 		    *providerp, instancep);
246 		SKTC_ASSERT_ERR(!error);
247 		assert(!uuid_is_null(*instancep));
248 	}
249 }
250 
251 /* up to 4 seconds of retries (250ms delay per retry) */
252 #define SKTU_CHANNEL_CREATE_NOMEM_RETRIES       16
253 
254 channel_t
sktu_channel_create_extended(const uuid_t uuid,const nexus_port_t port,const ring_dir_t dir,const ring_id_t rid,const channel_attr_t attr,uint64_t exclusive,uint64_t monitor,uint64_t txlowatunit,uint64_t txlowatval,uint64_t rxlowatunit,uint64_t rxlowatval,uint64_t userpacketpool,uint64_t defunctok,uint64_t event_ring,uint64_t low_latency)255 sktu_channel_create_extended(const uuid_t uuid,
256     const nexus_port_t port, const ring_dir_t dir,
257     const ring_id_t rid, const channel_attr_t attr,
258     uint64_t exclusive, uint64_t monitor,
259     uint64_t txlowatunit, uint64_t txlowatval,
260     uint64_t rxlowatunit, uint64_t rxlowatval,
261     uint64_t userpacketpool, uint64_t defunctok,
262     uint64_t event_ring, uint64_t low_latency)
263 {
264 	channel_attr_t tmpattr;
265 	int error;
266 	uint64_t scratch;
267 	static struct timespec delay250ms = { .tv_sec = 0, .tv_nsec = 250000000 };
268 	uint32_t retries = 0;
269 	channel_t ret = NULL;
270 
271 	if (!attr) {
272 		tmpattr = os_channel_attr_create();
273 	} else {
274 		tmpattr = attr;
275 	}
276 
277 	if (exclusive != -1) {
278 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EXCLUSIVE, exclusive);
279 		SKTC_ASSERT_ERR(!error);
280 	}
281 
282 	if (monitor != -1) {
283 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_MONITOR, monitor);
284 		SKTC_ASSERT_ERR(!error);
285 	}
286 
287 	if (txlowatunit != -1) {
288 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, txlowatunit);
289 		SKTC_ASSERT_ERR(!error);
290 	}
291 
292 	if (txlowatval != -1) {
293 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, txlowatval);
294 		SKTC_ASSERT_ERR(!error);
295 	}
296 
297 	if (rxlowatunit != -1) {
298 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, rxlowatunit);
299 		SKTC_ASSERT_ERR(!error);
300 	}
301 
302 	if (rxlowatval != -1) {
303 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, rxlowatval);
304 		SKTC_ASSERT_ERR(!error);
305 	}
306 
307 	if (userpacketpool != -1) {
308 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, userpacketpool);
309 		SKTC_ASSERT_ERR(!error);
310 	}
311 
312 	if (defunctok != -1) {
313 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, defunctok);
314 		SKTC_ASSERT_ERR(!error);
315 	}
316 
317 	if (event_ring != -1) {
318 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_EVENT_RING, event_ring);
319 		SKTC_ASSERT_ERR(!error);
320 	}
321 
322 	if (low_latency != -1) {
323 		error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_LOW_LATENCY, low_latency);
324 		SKTC_ASSERT_ERR(!error);
325 	}
326 
327 retry:
328 	ret = os_channel_create_extended(uuid, port, dir, rid, tmpattr);
329 	if (ret == NULL) {
330 		if (errno == ENOMEM && ++retries < SKTU_CHANNEL_CREATE_NOMEM_RETRIES) {
331 			nanosleep(&delay250ms, NULL);
332 			goto retry;
333 		}
334 		goto out;
335 	}
336 
337 	scratch = -1;
338 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EXCLUSIVE, &scratch);
339 	SKTC_ASSERT_ERR(!error);
340 	assert(scratch != 1);
341 	assert(exclusive == -1 || exclusive == scratch);
342 
343 	scratch = -1;
344 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_MONITOR, &scratch);
345 	SKTC_ASSERT_ERR(!error);
346 	assert(scratch != -1);
347 	assert(exclusive == -1 || monitor == scratch);
348 
349 	scratch = -1;
350 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, &scratch);
351 	SKTC_ASSERT_ERR(!error);
352 	assert(scratch != -1);
353 	assert(exclusive == -1 || txlowatunit == scratch);
354 
355 	scratch = -1;
356 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_VALUE, &scratch);
357 	SKTC_ASSERT_ERR(!error);
358 	assert(scratch != -1);
359 	assert(exclusive == -1 || txlowatval == scratch);
360 
361 	scratch = -1;
362 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_UNIT, &scratch);
363 	SKTC_ASSERT_ERR(!error);
364 	assert(scratch != -1);
365 	assert(exclusive == -1 || rxlowatunit == scratch);
366 
367 	scratch = -1;
368 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_RX_LOWAT_VALUE, &scratch);
369 	SKTC_ASSERT_ERR(!error);
370 	assert(scratch != -1);
371 	assert(exclusive == -1 || rxlowatval == scratch);
372 
373 	scratch = -1;
374 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_USER_PACKET_POOL, &scratch);
375 	SKTC_ASSERT_ERR(!error);
376 	assert(scratch != -1);
377 	assert(exclusive == -1 || userpacketpool == scratch);
378 
379 	scratch = -1;
380 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_NEXUS_DEFUNCT_OK, &scratch);
381 	SKTC_ASSERT_ERR(!error);
382 	assert(scratch != -1);
383 	assert(exclusive == -1 || defunctok == scratch);
384 
385 	scratch = -1;
386 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_EVENT_RING, &scratch);
387 	SKTC_ASSERT_ERR(!error);
388 	assert(scratch != -1);
389 	assert(exclusive == -1 || event_ring == scratch);
390 
391 	scratch = -1;
392 	error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_LOW_LATENCY, &scratch);
393 	SKTC_ASSERT_ERR(!error);
394 	assert(scratch != -1);
395 	assert(exclusive == -1 || low_latency == scratch);
396 
397 out:
398 	if (!attr) {
399 		os_channel_attr_destroy(tmpattr);
400 	}
401 
402 	return ret;
403 }
404 
405 /****************************************************************/
406 
/* Exchange the elements at positions i and j of the permutation array. */
static inline void
swap(int *permute, int i, int j)
{
	int held = permute[j];

	permute[j] = permute[i];
	permute[i] = held;
}
414 
415 
416 /* Plain changes, see Knuth (7.2.1.2) "Algorithm P"
417  * has advantage of only swapping adjacent pairs
418  * This could be cleaned up to be more "C" like, but
419  * this literal translation works without fanfare.
420  */
421 void
permutefuncP(int n,int * permute,void (* func)(int,int * permute))422 permutefuncP(int n, int *permute, void (*func)(int, int *permute))
423 {
	/*
	 * Visits all n! permutations of permute[0..n-1] by adjacent
	 * transpositions, calling func once per arrangement (the initial
	 * ordering included).  c[] holds the per-coordinate counts and o[]
	 * the scan directions (+1/-1) of Knuth's Algorithm P; both are VLAs
	 * sized by n.  The labels p2..p7 mirror the steps of the algorithm.
	 * NOTE(review): assumes n >= 1 — for n == 0, step P4 would read
	 * c[-1]; confirm callers never pass n == 0.
	 */
424 	int j, s, q;
425 	int c[n], o[n];
426 	/* P1 Initialize. */
427 	for (j = 0; j < n; j++) {
428 		c[j] = 0;
429 		o[j] = 1;
430 	}
431 p2:
432 	/* P2 Visit. */
433 	func(n, permute);
434 	/* P3 Prepare for change. */
435 	j = n;
436 	s = 0;
437 p4:
438 	/* P4 Ready to change? */
439 	q = c[j - 1] + o[j - 1];
440 	if (q < 0) {
441 		goto p7;
442 	}
443 	if (q == j) {
444 		goto p6;
445 	}
446 	/* P5 Change. */
447 	{
448 		//T_LOG("Swapping %d with %d\n", j-c[j-1]+s-1, j-q+s-1);
449 		swap(permute, j - c[j - 1] + s - 1, j - q + s - 1);
450 	}
451 	c[j - 1] = q;
452 	goto p2;
453 p6:     /* P6 Increase s */
	/* j == 1 means every coordinate is exhausted: all n! visited. */
454 	if (j == 1) {
455 		return;
456 	}
457 	s++;
458 p7:     /* P7 Switch Direction */
459 	o[j - 1] = -o[j - 1];
460 	j--;
461 	goto p4;
462 }
463 
464 /* Heap's algorithm */
/*
 * Heap's algorithm: visit all n! permutations of permute[0..n-1],
 * calling func once per arrangement (the initial ordering included).
 * Progress is logged roughly once per second, and a summary at the end.
 */
void
permutefuncH(int n, int *permute, void (*func)(int, int *permute))
{
	time_t t0 = time(NULL);
	time_t now, last = t0;
	int visited = 0;
	int nperms = 1;
	int k = 0;
	int ctr[n];

	memset(ctr, 0, sizeof(ctr));

	/* nperms = n! */
	for (int f = 2; f <= n; f++) {
		nperms *= f;
	}

	/* visit the initial ordering first */
	visited++;
	func(n, permute);

	while (k < n) {
		if (ctr[k] < k) {
			/* even index swaps with 0, odd with ctr[k] */
			swap(permute, k, (k & 1) ? ctr[k] : 0);
			visited++;
			now = time(NULL);
			if (now > last) {
				T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
				    now - t0, visited, nperms,
				    (double)visited * 100 / nperms,
				    (long)((double)(now - t0) * nperms / visited) - (now - t0));
				last = now;
			}
			func(n, permute);
			ctr[k] += 1;
			k = 0;
		} else {
			ctr[k] = 0;
			k++;
		}
	}

	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - t0, nperms, (double)nperms / (now - t0));
}
510 
511 /* Random permutations, knuth's shuffle */
512 
/*
 * Generate `total` random permutations of permute[0..n-1] with the Knuth
 * (Fisher-Yates) shuffle, seeded with `seed`, calling func on each result.
 * Progress is logged roughly once per second.
 *
 * Fix: the shuffle previously drew j = random() % i, which excludes j == i;
 * that is Sattolo's algorithm (it only produces n-cycles — the identity can
 * never appear), not a uniform shuffle.  Knuth's Algorithm P requires
 * 0 <= j <= i, i.e. random() % (i + 1).
 */
void
permutefuncR(int n, int *permute, void (*func)(int, int *permute), int total, unsigned seed)
{
	time_t start = time(NULL);
	time_t now, then = start;
	int count = 0;

	T_LOG("Starting %d random permutations with seed %u\n", total, seed);
	srandom(seed);
	while (count < total) {
		for (int i = n - 1; i > 0; i--) {
			int j = random() % (i + 1); // XXX modulo bias.
			swap(permute, i, j);
		}
		count++;
		now = time(NULL);
		if (now > then) {
			T_LOG("time %ld on %d of %d (%2.2f%%, est %ld secs left)\n",
			    now - start, count, total,
			    (double)count * 100 / total,
			    (long)((double)(now - start) * total / count) - (now - start));
			then = now;
		}
		func(n, permute);
	}
	now = time(NULL);
	T_LOG("total time %ld for %d permutations (rate %.2f)\n",
	    now - start, total, (double)total / (now - start));
}
543 
544 
545 /*
546  * rakes each element across all other elements.
547  */
/*
 * "Rake" each element across all the others: starting from the original
 * ordering, drag the element at each position one step at a time all the
 * way to the front and then all the way to the back, invoking func after
 * every adjacent exchange (the untouched initial ordering is visited first).
 */
void
permutefuncZ(int n, int *permute, void (*func)(int, int *permute))
{
	int original[n];

	memcpy(original, permute, sizeof(original));
	func(n, permute);

	for (int pos = 0; pos < n; pos++) {
		/* rake the element at `pos` leftward */
		memcpy(permute, original, sizeof(original));
		for (int k = pos; k > 0; k--) {
			swap(permute, k, k - 1);
			func(n, permute);
		}
		/* rake it rightward */
		memcpy(permute, original, sizeof(original));
		for (int k = pos; k < n - 1; k++) {
			swap(permute, k, k + 1);
			/* The first right is the same as the last left, so skip it */
			if (k != pos) {
				func(n, permute);
			}
		}
	}
}
572 
573 /****************************************************************/
574 
575 void
sktc_create_flowswitch_no_address(struct sktc_nexus_handles * handles,uint64_t ntxslots,uint64_t nrxslots,uint64_t buf_size,uint64_t max_frags,uint64_t anonymous)576 sktc_create_flowswitch_no_address(struct sktc_nexus_handles *handles,
577     uint64_t ntxslots, uint64_t nrxslots, uint64_t buf_size, uint64_t max_frags,
578     uint64_t anonymous)
579 {
	/*
	 * Build a flowswitch nexus on top of a netif nexus for the interface
	 * named in handles->netif_ifname (which must already be set), without
	 * assigning any address.  Each -1 argument leaves the corresponding
	 * nexus attribute at its default (see sktc_build_nexus).  On any
	 * failure the function logs and returns early, leaving `handles`
	 * partially populated — callers are expected to assert on the UUIDs.
	 */
580 	char buf[256];
581 	int error;
582 	struct sktc_nexus_attr attr = SKTC_NEXUS_ATTR_INIT();
583 
584 	attr.ntxslots = ntxslots;
585 	attr.nrxslots = nrxslots;
586 	attr.slotsize = buf_size;
587 	attr.anonymous = anonymous;
588 	attr.maxfrags = max_frags;
589 
	/* Validate the interface name before creating anything. */
590 	if (handles->netif_ifname[0] == '\0') {
591 		T_LOG("%s: no interface name specified\n",
592 		    __func__);
593 		return;
594 	}
595 	if (strlen(handles->netif_ifname) >= IFNAMSIZ) {
596 		T_LOG("%s: invalid interface name specified %s\n",
597 		    __func__, handles->netif_ifname);
598 		return;
599 	}
600 	handles->controller = os_nexus_controller_create();
601 	if (handles->controller == NULL) {
602 		SKT_LOG(
603 			"%s: os_nexus_controller_create failed, %s (%d)\n",
604 			__func__, strerror(errno), errno);
605 		return;
606 	}
607 
	/* Register the flowswitch provider/instance ("ms_fsw_<ifname>"). */
608 	snprintf(buf, sizeof(buf), "ms_fsw_%s", handles->netif_ifname);
609 	strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
610 	attr.type = NEXUS_TYPE_FLOW_SWITCH;
611 	sktc_build_nexus(handles->controller, &attr, &handles->fsw_prov_uuid,
612 	    &handles->fsw_nx_uuid);
613 
614 	/* if the netif is already present, don't bother creating/attaching */
615 	if (!sktc_get_netif_nexus(handles->netif_ifname,
616 	    handles->netif_nx_uuid)) {
	        /* Create a netif nexus with default ring/slot sizing and
	         * attach it to the interface. */
617 		snprintf(buf, sizeof(buf), "netif_%s", handles->netif_ifname);
618 		strncpy((char *)attr.name, buf, sizeof(nexus_name_t) - 1);
619 		attr.type = NEXUS_TYPE_NET_IF;
620 		attr.ntxslots = -1;
621 		attr.nrxslots = -1;
622 		sktc_build_nexus(handles->controller, &attr,
623 		    &handles->netif_prov_uuid, &handles->netif_nx_uuid);
624 		error = __os_nexus_ifattach(handles->controller,
625 		    handles->netif_nx_uuid,
626 		    handles->netif_ifname, NULL,
627 		    false,
628 		    &handles->netif_nx_attach_uuid);
629 		if (error != 0) {
630 			SKT_LOG(
631 				"__os_nexus_ifattach(%s) failed, %s (%d)\n",
632 				buf, strerror(errno), errno);
633 			return;
634 		}
635 	}
	/* Attach the flowswitch to the netif nexus (by UUID, not name). */
636 	error = __os_nexus_ifattach(handles->controller, handles->fsw_nx_uuid,
637 	    NULL, handles->netif_nx_uuid, false, &handles->fsw_nx_dev_attach_uuid);
638 	if (error != 0) {
639 		SKT_LOG("__os_nexus_ifattach() failed, %s (%d)\n",
640 		    strerror(errno), errno);
641 		return;
642 	}
643 }
644 
645 
646 void
sktc_nexus_handles_assign_address(struct sktc_nexus_handles * handles)647 sktc_nexus_handles_assign_address(struct sktc_nexus_handles *handles)
648 {
649 	int             error;
650 
651 	error = sktc_ifnet_add_addr(handles->netif_ifname,
652 	    &handles->netif_addr,
653 	    &handles->netif_mask, NULL);
654 	SKTC_ASSERT_ERR(!error);
655 }
656 
657 void
sktc_create_flowswitch(struct sktc_nexus_handles * handles,int i)658 sktc_create_flowswitch(struct sktc_nexus_handles *handles, int i)
659 {
660 	uint16_t        val;
661 
662 	/* assign the name */
663 	snprintf(handles->netif_ifname, sizeof(handles->netif_ifname),
664 	    FETH_FORMAT, i);
665 
666 	/* pick/assign a random IPv4LL address */
667 	val = random() % 0xffff;
668 	/* avoid subnet broadcast and host address 0 */
669 	if (((val & 0xff) == 0) || ((val & 0xff) == 0xff)) {
670 		val = (val & 0xfff0) | 0x2;
671 	}
672 	handles->netif_addr = sktc_make_in_addr(IN_LINKLOCALNETNUM | val);
673 	handles->netif_mask = sktc_make_in_addr(IN_CLASSC_NET);
674 	sktc_nexus_handles_assign_address(handles);
675 
676 	/* create the flowswitch */
677 	sktc_create_flowswitch_no_address(handles, -1, -1, -1, -1, 1);
678 }
679 
680 void
sktc_cleanup_flowswitch(struct sktc_nexus_handles * handles)681 sktc_cleanup_flowswitch(struct sktc_nexus_handles *handles)
682 {
	/*
	 * Tear down a flowswitch created by sktc_create_flowswitch():
	 * free the nexus instance, deregister the provider, destroy the
	 * controller, then remove the interface address — in that order.
	 * Asserts that handles was fully populated and every step succeeds.
	 */
683 	int error;
684 
685 	assert(handles->controller);
686 	assert(!uuid_is_null(handles->fsw_prov_uuid));
687 	assert(!uuid_is_null(handles->fsw_nx_uuid));
688 
	/* instance must go before the provider it was allocated from */
689 	error = os_nexus_controller_free_provider_instance(handles->controller,
690 	    handles->fsw_nx_uuid);
691 	SKTC_ASSERT_ERR(!error);
692 
693 	error = os_nexus_controller_deregister_provider(handles->controller,
694 	    handles->fsw_prov_uuid);
695 	SKTC_ASSERT_ERR(!error);
696 
697 	os_nexus_controller_destroy(handles->controller);
698 
	/* undo sktc_nexus_handles_assign_address() */
699 	error = sktc_ifnet_del_addr(handles->netif_ifname, &handles->netif_addr);
700 	SKTC_ASSERT_ERR(!error);
701 }
702 
703 /****************************************************************/
704 
705 int
sktc_bind_tcp4_flow(nexus_controller_t ncd,const uuid_t fsw,in_port_t in_port,nexus_port_t nx_port,const uuid_t flow)706 sktc_bind_tcp4_flow(nexus_controller_t ncd, const uuid_t fsw, in_port_t in_port, nexus_port_t nx_port, const uuid_t flow)
707 {
	/*
	 * Add a TCP/IPv4 flow (wildcard local address, local port `in_port`
	 * in host byte order, 0 = ephemeral) to flowswitch `fsw` on nexus
	 * port `nx_port`, identified by UUID `flow`.  Returns the error from
	 * __os_nexus_flow_add (0 on success).  When an ephemeral port was
	 * requested and the add succeeded, the assigned port is asserted to
	 * lie within the net.inet.ip.portrange sysctl range.
	 */
708 	struct nx_flow_req nfr;
709 	int error;
710 
711 	memset(&nfr, 0, sizeof(nfr));
712 	nfr.nfr_ip_protocol = IPPROTO_TCP;
713 	nfr.nfr_nx_port = nx_port;
714 	nfr.nfr_saddr.sa.sa_len = sizeof(struct sockaddr_in);
715 	nfr.nfr_saddr.sa.sa_family = AF_INET;
716 	nfr.nfr_saddr.sin.sin_port = htons(in_port);
717 	nfr.nfr_saddr.sin.sin_addr.s_addr = htonl(INADDR_ANY);
718 	uuid_copy(nfr.nfr_flow_uuid, flow);
719 
720 #if 0
721 	char buf[31];
722 	uuid_string_t uuidstr;
723 	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
724 	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
725 	T_LOG("before: nx_port %3d Flow %s %s addr %s port %d\n",
726 	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
727 	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
728 #endif
729 
	/* the kernel fills in nfr (e.g. the assigned ephemeral port) */
730 	error = __os_nexus_flow_add(ncd, fsw, &nfr);
731 #if 0
732 	if (error) {
733 		T_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
734 	}
735 #endif
736 
737 #if 0
	/* NOTE(review): relies on buf/uuidstr declared in the first #if 0
	 * block above — enable both blocks together or neither. */
738 	uuid_unparse(nfr.nfr_flow_uuid, uuidstr);
739 	inet_ntop(AF_INET, &nfr.nfr_saddr.sin.sin_addr.s_addr, buf, sizeof(buf));
740 	T_LOG("after:  nx_port %3d Flow %s %s addr %s port %d\n",
741 	    nfr.nfr_nx_port, uuidstr, (nfr.nfr_ip_protocol == IPPROTO_TCP) ? "tcp" : "udp",
742 	    buf, ntohs(nfr.nfr_saddr.sin.sin_port));
743 #endif
744 
745 	// XXX fails, see the fswbind25 for standalone test for this
746 	assert(nfr.nfr_nx_port == nx_port);
747 	T_LOG("got ephemeral port %d\n", ntohs(nfr.nfr_saddr.sin.sin_port));
748 
749 	/* Validate the ephemeral ports */
750 	if (!error && !in_port) {
	        /* cache the sysctl port range across calls (not thread-safe) */
751 		static int first, last;
752 		if (!first && !last) {
753 			size_t size;
754 
755 			size = sizeof(first);
756 			error = sysctlbyname("net.inet.ip.portrange.first", &first, &size, NULL, 0);
757 			SKTC_ASSERT_ERR(!error);
758 			assert(size == sizeof(first));
759 
760 			size = sizeof(last);
761 			error = sysctlbyname("net.inet.ip.portrange.last", &last, &size, NULL, 0);
762 			SKTC_ASSERT_ERR(!error);
763 			assert(size == sizeof(last));
764 
765 			T_LOG("ephemeral port range first %d last %d\n", first, last);
766 
	                /* the sysctls may be set in either order; normalize */
767 			if (last < first) {
768 				int tmp = first;
769 				first = last;
770 				last = tmp;
771 			}
772 			assert(first <= last);
773 		}
774 		assert(ntohs(nfr.nfr_saddr.sin.sin_port) >= first);
775 		assert(ntohs(nfr.nfr_saddr.sin.sin_port) <= last);
776 	}
777 
778 	return error;
779 }
780 
781 int
sktc_unbind_flow(nexus_controller_t ncd,const uuid_t fsw,const uuid_t flow)782 sktc_unbind_flow(nexus_controller_t ncd, const uuid_t fsw, const uuid_t flow)
783 {
784 	struct nx_flow_req nfr;
785 	int error;
786 
787 	memset(&nfr, 0, sizeof(nfr));
788 	uuid_copy(nfr.nfr_flow_uuid, flow);
789 
790 	error = __os_nexus_flow_del(ncd, fsw, &nfr);
791 	if (error) {
792 		SKT_LOG("__os_nexus_flow_add returned %d, errno %d\n", error, errno);
793 	}
794 	return error;
795 }
796 
797 /****************************************************************/
798 
799 uint32_t
sktc_chew_random(channel_t channel,channel_ring_t ring,sync_mode_t mode,bool dosync,uint32_t nslots)800 sktc_chew_random(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint32_t nslots)
801 {
	/*
	 * Consume a uniformly random number of slots (0..nslots inclusive)
	 * from `ring`.  For TX rings each consumed slot gets a random
	 * payload length within the buflet's data limit; RX slots are just
	 * advanced past.  If `dosync` is set, a channel sync in `mode` is
	 * issued afterwards.  Returns the number of slots consumed.
	 * The caller must guarantee at least `nslots` slots are available
	 * (os_channel_get_next_slot() is asserted non-NULL).
	 */
802 	uint64_t count = 0;
803 	int error;
804 	channel_slot_t slot;
805 
806 	/* Chew a random number of slots */
807 	nslots = random() % (nslots + 1);
808 
809 	slot = NULL;
810 	while (count < nslots) {
811 		slot_prop_t prop;
812 
813 		slot = os_channel_get_next_slot(ring, slot, &prop);
814 		assert(slot);
815 		if (mode == CHANNEL_SYNC_TX) {
	                /* fill TX slots with a random (possibly zero) length */
816 			packet_t pkt = os_channel_slot_get_packet(ring, slot);
817 			buflet_t buf = os_packet_get_next_buflet(pkt, NULL);
818 			assert(buf != NULL);
819 			uint16_t bdlim = os_buflet_get_data_limit(buf);
820 			assert(bdlim != 0);
821 			prop.sp_len = random() % bdlim;
822 			os_channel_set_slot_properties(ring, slot, &prop);
823 		}
824 		count++;
825 	}
826 
	/* advance past everything consumed (no-op when count == 0) */
827 	if (slot) {
828 		error = os_channel_advance_slot(ring, slot);
829 		SKTC_ASSERT_ERR(!error);
830 	}
831 
832 	if (dosync) {
833 		error = os_channel_sync(channel, mode);
	        /* in-driver runs tolerate (and log) sync failures */
834 		if (skywalk_in_driver && error) {
835 			SKT_LOG("%s: sync fail error %d errno %d: %s\n", __func__, error, errno, strerror(errno));
836 		} else {
837 			SKTC_ASSERT_ERR(!error);
838 		}
839 	}
840 
841 	return count;
842 }
843 
844 /* This pumps slots on a ring until count slots have been tranferred */
845 void
sktc_pump_ring_nslots_kq(channel_t channel,channel_ring_t ring,sync_mode_t mode,bool dosync,uint64_t nslots,bool verbose)846 sktc_pump_ring_nslots_kq(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
847 {
	/*
	 * Pump exactly `nslots` slots through `ring` via sktc_chew_random(),
	 * blocking in kevent() (EVFILT_WRITE for TX, EVFILT_READ for RX)
	 * whenever no slots are available.  With `verbose`, progress is
	 * logged roughly once per second plus a final summary.
	 */
848 	uint64_t count = 0;
849 	int channelfd;
850 	int kq;
851 	struct kevent kev;
852 	int error;
853 	time_t start, then;
854 
855 	channelfd = os_channel_get_fd(channel);
856 	assert(channelfd != -1);
857 
	/* one-shot setup: register the channel fd for the right filter */
858 	kq = kqueue();
859 	assert(kq != -1);
860 	EV_SET(&kev, channelfd,
861 	    mode == CHANNEL_SYNC_TX ? EVFILT_WRITE : EVFILT_READ,
862 	    EV_ADD | EV_ENABLE, 0, 0, NULL);
863 	error = kevent(kq, &kev, 1, NULL, 0, NULL);
864 	SKTC_ASSERT_ERR(!error);
865 
866 	if (verbose) {
867 		then = start = time(NULL);
868 	}
869 
870 	while (count < nslots) {
871 		uint32_t avail;
872 
873 		if (verbose) {
874 			time_t now = time(NULL);
875 			if (now > then) {
876 				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
877 				    now - start, count, nslots,
878 				    (double)count * 100 / nslots,
879 				    (long)((double)(now - start) * nslots / count) - (now - start),
880 				then = now;
881 			}
882 		}
883 
884 		avail = os_channel_available_slot_count(ring);
885 
886 		if (!avail) {
	                /* block until the channel fd signals readiness;
	                 * NOTE(review): this inner `error` shadows the outer one */
887 			int error;
888 
889 			memset(&kev, 0, sizeof(kev));
890 			error = kevent(kq, NULL, 0, &kev, 1, NULL);
891 			SKTC_ASSERT_ERR(error != -1);
892 			SKTC_ASSERT_ERR(error == 1);
893 
894 			assert(kev.ident == channelfd);
895 			if (mode == CHANNEL_SYNC_TX) {
896 				assert(kev.filter == EVFILT_WRITE);
897 			} else {
898 				assert(kev.filter == EVFILT_READ);
899 			}
900 
	                /* readiness implies slots are now available */
901 			avail = os_channel_available_slot_count(ring);
902 			assert(avail);
903 		}
904 
	        /* never chew past the remaining quota */
905 		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
906 	}
907 
908 	if (verbose) {
909 		time_t now = time(NULL);
910 		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
911 		    now - start, nslots, (double)nslots / (now - start));
912 	}
913 
914 	error = close(kq);
915 	SKTC_ASSERT_ERR(!error);
916 }
917 
918 void
sktc_pump_ring_nslots_select(channel_t channel,channel_ring_t ring,sync_mode_t mode,bool dosync,uint64_t nslots,bool verbose)919 sktc_pump_ring_nslots_select(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
920 {
	/*
	 * Same as sktc_pump_ring_nslots_kq() but blocks in select() instead
	 * of kevent(): pump exactly `nslots` slots through `ring`, waiting
	 * on writefds for TX or readfds for RX when no slots are available.
	 * `zerofds` stays empty and is used to assert that no unexpected
	 * fd_set bits come back from select().
	 */
921 	uint64_t count = 0;
922 	int channelfd;
923 	fd_set readfds, writefds, errorfds, zerofds;
924 	time_t start, then;
925 
926 	channelfd = os_channel_get_fd(channel);
927 	assert(channelfd != -1);
928 
929 	FD_ZERO(&zerofds);
930 	FD_ZERO(&readfds);
931 	FD_ZERO(&writefds);
932 	FD_ZERO(&errorfds);
933 	if (mode == CHANNEL_SYNC_TX) {
934 		FD_SET(channelfd, &writefds);
935 	} else {
936 		FD_SET(channelfd, &readfds);
937 	}
938 
939 	if (verbose) {
940 		then = start = time(NULL);
941 	}
942 
943 	while (count < nslots) {
944 		uint32_t avail;
945 
946 		if (verbose) {
947 			time_t now = time(NULL);
948 			if (now > then) {
949 				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
950 				    now - start, count, nslots,
951 				    (double)count * 100 / nslots,
952 				    (long)((double)(now - start) * nslots / count) - (now - start));
953 				then = now;
954 			}
955 		}
956 
957 		avail = os_channel_available_slot_count(ring);
958 
959 		if (!avail) {
960 			int error;
961 
	                /* re-arm errorfds each pass: select() rewrites the sets */
962 			FD_SET(channelfd, &errorfds);
963 			error = select(channelfd + 1, &readfds, &writefds, &errorfds, NULL);
964 			SKTC_ASSERT_ERR(error != -1);
	                /* no error condition may be reported on the fd */
965 			assert(!memcmp(&zerofds, &errorfds, sizeof(zerofds)));
966 			if (mode == CHANNEL_SYNC_TX) {
967 				assert(FD_ISSET(channelfd, &writefds));
968 				assert(!memcmp(&zerofds, &readfds, sizeof(zerofds)));
969 			} else {
970 				assert(FD_ISSET(channelfd, &readfds));
971 				assert(!memcmp(&zerofds, &writefds, sizeof(zerofds)));
972 			}
973 			SKTC_ASSERT_ERR(error == 1);
974 
	                /* readiness implies slots are now available */
975 			avail = os_channel_available_slot_count(ring);
976 			assert(avail);
977 		}
978 
979 		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
980 	}
981 
982 	if (verbose) {
983 		time_t now = time(NULL);
984 		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
985 		    now - start, nslots, (double)nslots / (now - start));
986 	}
987 }
988 
/*
 * Pump 'nslots' slots through 'ring', blocking in poll(2) on the
 * channel fd whenever the ring has no available slots.
 *
 * mode      - CHANNEL_SYNC_TX waits for POLLWRNORM, anything else
 *             waits for POLLRDNORM.
 * dosync    - forwarded to sktc_chew_random().
 * verbose   - log progress at most once per second, plus a summary.
 */
void
sktc_pump_ring_nslots_poll(channel_t channel, channel_ring_t ring, sync_mode_t mode, bool dosync, uint64_t nslots, bool verbose)
{
	uint64_t count = 0;
	int channelfd;
	struct pollfd fds;
	time_t start, then;

	channelfd = os_channel_get_fd(channel);
	assert(channelfd != -1);

	fds.fd = channelfd;
	if (mode == CHANNEL_SYNC_TX) {
		fds.events = POLLWRNORM;
	} else {
		fds.events = POLLRDNORM;
	}

	if (verbose) {
		then = start = time(NULL);
	}

	while (count < nslots) {
		uint32_t avail;

		if (verbose) {
			time_t now = time(NULL);
			if (now > then) {
				T_LOG("time %ld pump %"PRId64" of %"PRId64" (%2.2f%%, est %ld secs left)\n",
				    now - start, count, nslots,
				    (double)count * 100 / nslots,
				    (long)((double)(now - start) * nslots / count) - (now - start));
				then = now;
			}
		}

		avail = os_channel_available_slot_count(ring);

		if (!avail) {
			int error;

			/* block indefinitely until the ring becomes ready */
			error = poll(&fds, 1, -1);
			SKTC_ASSERT_ERR(error != -1);
			SKTC_ASSERT_ERR(error == 1);
			assert(fds.fd == channelfd);
			/* exactly the requested direction must have fired */
			if (mode == CHANNEL_SYNC_TX) {
				assert(fds.events == POLLWRNORM);
				assert(fds.revents == POLLWRNORM);
			} else {
				assert(fds.events == POLLRDNORM);
				assert(fds.revents == POLLRDNORM);
			}

			/* poll reported ready, so slots must be available */
			avail = os_channel_available_slot_count(ring);
			assert(avail);
		}

		count += sktc_chew_random(channel, ring, mode, dosync, MIN(nslots - count, avail));
	}

	if (verbose) {
		time_t now = time(NULL);
		T_LOG("total time %ld for %"PRId64" slots (rate %.2f)\n",
		    now - start, nslots, (double)nslots / (now - start));
	}
}
1055 
1056 /****************************************************************/
1057 
/*
 * Raise the process RLIMIT_NOFILE soft and hard limits to 'new' if the
 * current soft limit is below it; no-op otherwise.  Aborts on failure.
 */
void
sktc_raise_file_limit(int new)
{
	struct rlimit limits;
	int ret;

	ret = getrlimit(RLIMIT_NOFILE, &limits);
	SKTC_ASSERT_ERR(!ret);

	/* nothing to do unless the soft limit is below the request */
	if (limits.rlim_cur >= new) {
		return;
	}

	T_LOG("raising file open limit from %llu (max %llu) to %d\n",
	    limits.rlim_cur, limits.rlim_max, new);
	limits.rlim_cur = new;
	limits.rlim_max = new;
	ret = setrlimit(RLIMIT_NOFILE, &limits);
	SKTC_ASSERT_ERR(!ret);
}
1076 
1077 
1078 /****************************************************************/
1079 
1080 int
sktu_create_interface(sktu_if_type_t type,sktu_if_flag_t flags)1081 sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags)
1082 {
1083 	struct ctl_info kernctl_info;
1084 	struct sockaddr_ctl kernctl_addr;
1085 	int error;
1086 	int tunsock;
1087 	const char *CONTROL_NAME;
1088 	int OPT_ENABLE_NETIF, OPT_ATTACH_FSW;
1089 	int enable_netif, attach_fsw;
1090 	int scratch;
1091 
1092 	assert(type == SKTU_IFT_UTUN || type == SKTU_IFT_IPSEC);
1093 	if (type == SKTU_IFT_UTUN) {
1094 		CONTROL_NAME = UTUN_CONTROL_NAME;
1095 		OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF;
1096 		OPT_ATTACH_FSW = UTUN_OPT_ATTACH_FLOWSWITCH;
1097 	} else {
1098 		CONTROL_NAME = IPSEC_CONTROL_NAME;
1099 		OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF;
1100 		OPT_ATTACH_FSW = 0;
1101 	}
1102 
1103 	enable_netif = ((flags & SKTU_IFF_ENABLE_NETIF) != 0) ? 1 : 0;
1104 	attach_fsw = ((flags & SKTU_IFF_NO_ATTACH_FSW) != 0) ? 0 : 1;
1105 
1106 	/* XXX Remove this retry nonsense when this is fixed:
1107 	 * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
1108 	 */
1109 
1110 	for (int i = 0; i < 10; i++) {
1111 		if (i > 0) {
1112 			T_LOG("%s: sleeping 1ms before retrying\n", __func__);
1113 			usleep(1000);
1114 		}
1115 
1116 		tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
1117 		assert(tunsock != -1);
1118 
1119 		memset(&kernctl_info, 0, sizeof(kernctl_info));
1120 		strlcpy(kernctl_info.ctl_name, CONTROL_NAME, sizeof(kernctl_info.ctl_name));
1121 		error = ioctl(tunsock, CTLIOCGINFO, &kernctl_info);
1122 		SKTC_ASSERT_ERR(error == 0);
1123 
1124 		memset(&kernctl_addr, 0, sizeof(kernctl_addr));
1125 		kernctl_addr.sc_len = sizeof(kernctl_addr);
1126 		kernctl_addr.sc_family = AF_SYSTEM;
1127 		kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
1128 		kernctl_addr.sc_id = kernctl_info.ctl_id;
1129 		kernctl_addr.sc_unit = 0;
1130 
1131 		/* If this is being called to reinstantiate a device that was just detached,
1132 		 * then this may return busy while the asynchronous detach completes.
1133 		 * This only occurs when this is being called in a tight loop
1134 		 * as per the utun27646755 test below
1135 		 */
1136 
1137 		error = bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
1138 		if (error == -1 && errno == EBUSY) {
1139 			close(tunsock);
1140 			tunsock = -1;
1141 			T_LOG("%s: i = %d bind returned EBUSY\n", __func__, i);
1142 			continue;
1143 		}
1144 
1145 		/* can only be set before connecting */
1146 		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &enable_netif, sizeof(enable_netif));
1147 		SKTC_ASSERT_ERR(!error);
1148 		socklen_t scratchlen = sizeof(scratch);
1149 		error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_NETIF, &scratch, &scratchlen);
1150 		SKTC_ASSERT_ERR(!error);
1151 		assert(scratchlen == sizeof(scratch));
1152 		assert(enable_netif == scratch);
1153 
1154 		/* only applicable for utun */
1155 		if (type == SKTU_IFT_UTUN) {
1156 			error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ATTACH_FSW, &attach_fsw, sizeof(attach_fsw));
1157 			SKTC_ASSERT_ERR(!error);
1158 		}
1159 
1160 		error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
1161 		if (error == -1 && errno == EBUSY) {
1162 			T_LOG("%s: i = %d connect returned EBUSY\n", __func__, i);
1163 			close(tunsock);
1164 			tunsock = -1;
1165 			continue;
1166 		}
1167 
1168 		error = fcntl(tunsock, F_SETFD, FD_CLOEXEC);
1169 		if (error != 0) {
1170 			warn("FD_CLOEXEC");
1171 		}
1172 
1173 		break;
1174 	}
1175 
1176 	if (error == -1) {
1177 		warn("Failed to create utun errno %d", errno);
1178 		close(tunsock);
1179 		tunsock = -1;
1180 	}
1181 
1182 	return tunsock;
1183 }
1184 
/*
 * Enable the kernel-pipe channel on a utun/ipsec control socket,
 * fetch its nexus instance UUID, and open a TX/RX channel to it.
 * Returns the opened channel; asserts on any failure.
 */
channel_t
sktu_create_interface_channel(sktu_if_type_t type, int tunsock)
{
	uuid_t uuid;
	channel_attr_t attr;
	channel_t channel;
	socklen_t uuidlen;
	int error;
	int OPT_ENABLE_CHANNEL;
	int OPT_GET_CHANNEL_UUID;

	if (type == SKTU_IFT_UTUN) {
		OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL;
		OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID;
	} else {
		assert(type == SKTU_IFT_IPSEC);
		OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL;
		OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID;
	}

	/*
	 * Only utun needs the channel enabled explicitly here;
	 * NOTE(review): for ipsec the channel is presumably enabled by
	 * some other path — the getsockopt below asserts it is on for
	 * both types.
	 */
	if (type == SKTU_IFT_UTUN) {
		int enable = 1;
		error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &enable, sizeof(enable));
		if (error != 0) {
			SKT_LOG("setsockopt returned error %d, errno %d\n", error, errno);
		}
		SKTC_ASSERT_ERR(error == 0);
	}

	/* verify the channel is enabled regardless of type */
	int scratch;
	socklen_t scratchlen = sizeof(scratch);
	error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &scratch, &scratchlen);
	SKTC_ASSERT_ERR(!error);
	assert(scratchlen == sizeof(scratch));
	assert(1 == scratch);

	/* the kernel hands back the nexus instance UUID for the channel */
	uuidlen = sizeof(uuid);
	error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_GET_CHANNEL_UUID, uuid, &uuidlen);
	SKTC_ASSERT_ERR(error == 0);
	assert(uuidlen == sizeof(uuid));

	attr = NULL;
	channel = sktu_channel_create_extended(uuid,
	    NEXUS_PORT_KERNEL_PIPE_CLIENT,
	    CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr,
	    -1, -1, -1, -1, -1, -1, -1, 1, -1, -1);
	assert(channel);

	return channel;
}
1235 
1236 void
sktu_get_interface_name(sktu_if_type_t type,int s,char name[IFNAMSIZ])1237 sktu_get_interface_name(sktu_if_type_t type, int s, char name[IFNAMSIZ])
1238 {
1239 	int error;
1240 	socklen_t  optlen = IFNAMSIZ;
1241 	if (type == SKTU_IFT_UTUN) {
1242 		error = getsockopt(s, SYSPROTO_CONTROL, UTUN_OPT_IFNAME, name, &optlen);
1243 	} else {
1244 		error = getsockopt(s, SYSPROTO_CONTROL, IPSEC_OPT_IFNAME, name, &optlen);
1245 	}
1246 	SKTC_ASSERT_ERR(!error);
1247 }
1248 
/*
 * Hex-dump 'len' bytes of 'buf' to stream 'f', 16 bytes per line with
 * an offset column and an ASCII gutter (non-printable bytes shown as
 * '.').  'desc', if non-NULL, is printed as a heading first.
 *
 * Fix: the index was a signed 'int' compared against a 'size_t'
 * length (sign/width mismatch for large buffers); it is now size_t,
 * with a matching %zx format.  The const qualifier is no longer cast
 * away from 'buf'.
 */
void
sktu_dump_buffer(FILE *f, const char *desc, const void *buf, size_t len)
{
	size_t i;
	unsigned char buff[17];         /* ASCII gutter: 16 chars + NUL */
	const unsigned char *pc = (const unsigned char *)buf;

	if (desc != NULL) {
		fprintf(f, "%s:\n", desc);
	}

	if (len == 0) {
		fprintf(f, "  ZERO LENGTH\n");
		return;
	}

	for (i = 0; i < len; i++) {
		if ((i % 16) == 0) {
			/* flush the previous line's ASCII gutter */
			if (i != 0) {
				fprintf(f, "  %s\n", buff);
			}

			fprintf(f, "  %04zx ", i); // offset
		}

		fprintf(f, " %02x", pc[i]);

		// prepare ascii
		if ((pc[i] < 0x20) || (pc[i] > 0x7e)) {
			buff[i % 16] = '.';
		} else {
			buff[i % 16] = pc[i];
		}
		buff[(i % 16) + 1] = '\0';
	}

	// pad last line to align the ascii gutter
	while ((i % 16) != 0) {
		fprintf(f, "   ");
		i++;
	}

	fprintf(f, "  %s\n", buff);
}
1293 
/*
 * Fetch a variably-sized sysctl into a freshly malloc'd buffer.
 *
 * oid_name  - sysctl name passed to sysctlbyname(3).
 * buffer    - out: malloc'd result (caller frees); NULL on error or
 *             when the sysctl reports zero length.
 * len       - in/out: resulting byte count.
 * newp/newlen - optional new value, forwarded to sysctlbyname().
 *
 * Returns 0 on success, else the errno from the failing call.
 * Retries up to RETRY_COUNT times on ENOMEM (the value can grow
 * between the length probe and the fetch).
 *
 * Fix: sysctlbyname() returns -1 and sets errno on failure, but the
 * old code compared the *return value* against ENOMEM, so the retry
 * path could never trigger; we now test errno.
 */
int
sysctl_buf(char *oid_name, void **buffer, size_t *len, void *newp,
    size_t newlen)
{
	int ret, err;
	int try = 0;

	*buffer = NULL;
#define RETRY_COUNT 10
try_again:
	/* first call with a NULL buffer probes the required length */
	ret = sysctlbyname(oid_name, NULL, len, newp, newlen);
	if (ret != 0) {
		err = errno;    /* capture before anything can clobber it */
		if (err == ENOMEM) {
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		SKT_LOG("sysctl for len failed, %s\n", strerror(err));
		return err;
	}
	if (*len == 0) {
		T_LOG("sysctl for len returned zero! No stats?\n");
		*buffer = NULL;
		return 0;
	}
	*buffer = malloc(*len);
	if (*buffer == NULL) {
		T_LOG("sysctl malloc for %ld bytes failed\n", *len);
		return ENOMEM;
	}

	ret = sysctlbyname(oid_name, *buffer, len, newp, newlen);
	if (ret != 0) {
		err = errno;
		if (err == ENOMEM) {
			/* the data grew since the probe; retry from scratch */
			free(*buffer);
			*buffer = NULL;
			try++;
			if (try <= RETRY_COUNT) {
				goto try_again;
			}
		}
		SKT_LOG("sysctl for buf failed, %s\n", strerror(err));
		free(*buffer);
		*buffer = NULL;
		return err;
	}

	return 0;
}
1345 
/*
 * Read the kern.skywalk.inject_error_rmask sysctl and, when 'mask' is
 * non-NULL, install the new value in the same call.  Returns the
 * previous mask; aborts on failure.
 */
uint32_t
sktu_set_inject_error_rmask(uint32_t *mask)
{
	uint32_t previous;
	size_t previous_size = sizeof(previous);
	int ret;

	ret = sysctlbyname("kern.skywalk.inject_error_rmask",
	    &previous, &previous_size, mask,
	    (mask != NULL) ? sizeof(*mask) : 0);
	SKTC_ASSERT_ERR(!ret);

	return previous;
}
1359 
1360 /* returns TRUE if a matching IPv4 address is found */
1361 boolean_t
sktu_check_interface_ipv4_address(char * ifname,uint32_t ipaddr)1362 sktu_check_interface_ipv4_address(char *ifname, uint32_t ipaddr)
1363 {
1364 	struct ifaddrs *ifaddr, *ifa;
1365 	boolean_t match = FALSE;
1366 	int error;
1367 
1368 	error = getifaddrs(&ifaddr);
1369 	SKTC_ASSERT_ERR(!error);
1370 
1371 	for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
1372 		struct sockaddr_in *sin =
1373 		    (struct sockaddr_in *)(void *)ifa->ifa_addr;
1374 		if (ifa->ifa_addr == NULL) {
1375 			continue;
1376 		}
1377 		if ((strncmp(ifa->ifa_name, ifname, IFNAMSIZ) == 0) &&
1378 		    (ifa->ifa_addr->sa_family == AF_INET) &&
1379 		    (sin->sin_addr.s_addr == ipaddr)) {
1380 			match = TRUE;
1381 		}
1382 	}
1383 	freeifaddrs(ifaddr);
1384 	return match;
1385 }
1386 
1387 /****************************************************************/
1388 
1389 int
sktu_create_pfkeysock(void)1390 sktu_create_pfkeysock(void)
1391 {
1392 	int keysock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
1393 	assert(keysock != -1);
1394 	return keysock;
1395 }
1396 
/*
 * Add a transport-mode ESP security association for src -> dst,
 * scoped to ipsec interface 'ifname', by sending a PF_KEY SADB_ADD
 * message on 'keysock'.  The SA uses null encryption and no auth
 * (SADB_EALG_NULL / SADB_AALG_NONE) and is deleted when the
 * interface detaches.  Asserts that the full message is sent.
 */
void
sktu_create_sa(int keysock, const char ifname[IFXNAMSIZ], uint32_t spi, struct in_addr *src, struct in_addr *dst)
{
	/*
	 *       <base, SA, (lifetime(HS),) address(SD), (address(P),)
	 *       key(AE), (identity(SD),) (sensitivity)>
	 */

	/* every PF_KEY extension must be 64-bit aligned within the message */
	struct {
		struct sadb_msg msg __attribute((aligned(sizeof(uint64_t))));
		struct sadb_key key      __attribute((aligned(sizeof(uint64_t))));
		struct sadb_sa sa        __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_sa2 sa2    __attribute((aligned(sizeof(uint64_t))));
		struct sadb_x_ipsecif ipsecif __attribute((aligned(sizeof(uint64_t))));
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} src;
		struct {
			struct sadb_address addr __attribute((aligned(sizeof(uint64_t))));
			struct sockaddr_in saddr __attribute((aligned(sizeof(uint64_t))));
		} dst;
	} addcmd;

	memset(&addcmd, 0, sizeof(addcmd));

	/* base header; lengths are in 8-byte units (PFKEY_UNIT64) */
	addcmd.msg.sadb_msg_version = PF_KEY_V2;
	addcmd.msg.sadb_msg_type = SADB_ADD;
	addcmd.msg.sadb_msg_errno = 0;
	addcmd.msg.sadb_msg_satype = SADB_SATYPE_ESP;
	addcmd.msg.sadb_msg_len = PFKEY_UNIT64(sizeof(addcmd));
	addcmd.msg.sadb_msg_reserved = 0;
	addcmd.msg.sadb_msg_seq = 0;
	addcmd.msg.sadb_msg_pid = (unsigned)getpid();

	/* zero-bit encryption key (null cipher needs no key material) */
	addcmd.key.sadb_key_len = PFKEY_UNIT64(sizeof(addcmd.key));
	addcmd.key.sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
	addcmd.key.sadb_key_bits = 0;
	addcmd.key.sadb_key_reserved = 0;

	addcmd.sa.sadb_sa_len = PFKEY_UNIT64(sizeof(addcmd.sa));
	addcmd.sa.sadb_sa_exttype = SADB_EXT_SA;
	addcmd.sa.sadb_sa_spi = htonl(spi);
	addcmd.sa.sadb_sa_replay = 0;
	addcmd.sa.sadb_sa_state = 0;
	addcmd.sa.sadb_sa_auth = SADB_AALG_NONE;
	addcmd.sa.sadb_sa_encrypt = SADB_EALG_NULL;
	addcmd.sa.sadb_sa_flags = 0;

	/* transport mode; auto-expire and delete when the ifnet detaches */
	addcmd.sa2.sadb_x_sa2_len = PFKEY_UNIT64(sizeof(addcmd.sa2));
	addcmd.sa2.sadb_x_sa2_exttype = SADB_X_EXT_SA2;
	addcmd.sa2.sadb_x_sa2_mode = IPSEC_MODE_TRANSPORT;
	addcmd.sa2.sadb_x_sa2_alwaysexpire = 1;
	addcmd.sa2.sadb_x_sa2_flags = SADB_X_EXT_SA2_DELETE_ON_DETACH;
	addcmd.sa2.sadb_x_sa2_sequence = 0;
	addcmd.sa2.sadb_x_sa2_reqid = 0;

	/* bind the SA to the named ipsec interface */
	addcmd.ipsecif.sadb_x_ipsecif_len = PFKEY_UNIT64(sizeof(addcmd.ipsecif));
	addcmd.ipsecif.sadb_x_ipsecif_exttype = SADB_X_EXT_IPSECIF;
	memset(addcmd.ipsecif.sadb_x_ipsecif_internal_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_internal_if));
	memset(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if));
	strlcpy(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if, ifname, sizeof(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if));
	addcmd.ipsecif.sadb_x_ipsecif_init_disabled = 0;
	addcmd.ipsecif.reserved = 0;

	addcmd.src.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.src));
	addcmd.src.addr.sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
	addcmd.src.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.src.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.src.addr.sadb_address_reserved = 0;
	addcmd.src.saddr.sin_len = sizeof(addcmd.src.saddr);
	addcmd.src.saddr.sin_family = AF_INET;
	addcmd.src.saddr.sin_port = htons(0);
	addcmd.src.saddr.sin_addr = *src;

	addcmd.dst.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.dst));
	addcmd.dst.addr.sadb_address_exttype = SADB_EXT_ADDRESS_DST;
	addcmd.dst.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
	addcmd.dst.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
	addcmd.dst.addr.sadb_address_reserved = 0;
	addcmd.dst.saddr.sin_len = sizeof(addcmd.dst.saddr);
	addcmd.dst.saddr.sin_family = AF_INET;
	addcmd.dst.saddr.sin_port = htons(0);
	addcmd.dst.saddr.sin_addr = *dst;

	//log_hexdump(&addcmd, sizeof(addcmd));

	ssize_t slen;
	slen = send(keysock, &addcmd, sizeof(addcmd), 0);
	assert(slen == sizeof(addcmd));
}
1488 
/* 16-bit view used by in_cksum() for the odd trailing byte */
typedef union {
	char        c[2];
	u_short     s;
} short_union_t;

/*
 * Accumulator view used by reduce(): reads the checksum sum back as
 * two 16-bit halves.  NOTE(review): on LP64 'long' is 8 bytes while
 * s[2] only aliases the low 4; this matches the original 4.4BSD code
 * and relies on little-endian layout (low-order halves in s[0]/s[1])
 * — confirm if this is ever built for a big-endian target.
 */
typedef union {
	u_short     s[2];
	long        l;
} long_union_t;
1498 
/*
 * Fold the 32-bit ones'-complement accumulator '*sum' down to 16 bits
 * in place, wrapping the carry back in (classic BSD in_cksum helper).
 */
static __inline__ void
reduce(int * sum)
{
	long_union_t l_util;

	l_util.l = *sum;
	/* add the two 16-bit halves, then wrap a single carry */
	*sum = l_util.s[0] + l_util.s[1];
	if (*sum > 65535) {
		*sum -= 65535;
	}
	return;
}
1511 
/*
 * Compute the 16-bit ones'-complement Internet checksum over 'len'
 * bytes of 'pkt', seeded with the partial sum 'sum0'.  Returns the
 * complemented sum (so a verification pass over data that includes a
 * correct checksum yields 0).  Unrolled 4.4BSD implementation;
 * assumes 'pkt' is at least 16-bit aligned.
 */
unsigned short
in_cksum(void * pkt, int len, int sum0)
{
	u_short * w;
	int sum = sum0;

	w = (u_short *)pkt;
	/* 32 bytes (16 words) at a time */
	while ((len -= 32) >= 0) {
		sum += w[0]; sum += w[1];
		sum += w[2]; sum += w[3];
		sum += w[4]; sum += w[5];
		sum += w[6]; sum += w[7];
		sum += w[8]; sum += w[9];
		sum += w[10]; sum += w[11];
		sum += w[12]; sum += w[13];
		sum += w[14]; sum += w[15];
		w += 16;
	}
	len += 32;
	/* then 8 bytes (4 words) at a time */
	while ((len -= 8) >= 0) {
		sum += w[0]; sum += w[1];
		sum += w[2]; sum += w[3];
		w += 4;
	}
	len += 8;
	/* remaining whole words */
	if (len) {
		reduce(&sum);
		while ((len -= 2) >= 0) {
			sum += *w++;
		}
	}
	if (len == -1) { /* odd-length packet */
		short_union_t s_util;

		s_util.s = 0;
		s_util.c[0] = *((char *)w);
		s_util.c[1] = 0;
		sum += s_util.s;
	}
	reduce(&sum);
	return ~sum & 0xffff;
}
1554 
/* fold any carry out of the low 16 bits back in (ones'-complement add) */
#define ADDCARRY(_x)  do {                                              \
	while (((_x) >> 16) != 0)                                       \
	        (_x) = ((_x) >> 16) + ((_x) & 0xffff);                  \
} while (0)

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 */
/* fold the 64-bit 'sum' to 16 bits via q_util/l_util (in local scope) */
#define REDUCE16 {                                                        \
	q_util.q = sum;                                                   \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1];                                  \
	ADDCARRY(sum);                                                    \
}

/* 32-bit value viewed as two 16-bit halves (for REDUCE16) */
union l_util {
	uint16_t s[2];
	uint32_t l;
};

/* 64-bit value viewed as four 16-bit halves (for REDUCE16) */
union q_util {
	uint16_t s[4];
	uint32_t l[2];
	uint64_t q;
};
1583 
/*
 * Sum three 32-bit words and fold the result to 16 bits — used for
 * the IPv4 pseudo-header, where the callers in this file pass
 * a = src address, b = dst address, c = checksum + proto/length word.
 * The result fits in 16 bits after REDUCE16, so the narrowing return
 * is safe.
 */
uint16_t
in_pseudo(uint32_t a, uint32_t b, uint32_t c)
{
	uint64_t sum;
	union q_util q_util;
	union l_util l_util;

	sum = (uint64_t)a + b + c;
	REDUCE16;
	return sum;
}
1595 
/*
 * Sum the IPv6 pseudo-header: the eight 16-bit words of the source
 * and destination addresses plus the caller-supplied value 'x', which
 * is typically htonl(proto + length) or htonl(proto + length + sum).
 * Carries are folded back in before the 16-bit result is returned.
 */
uint16_t
in6_pseudo(const struct in6_addr *src, const struct in6_addr *dst, uint32_t x)
{
	const uint16_t *w;
	uint32_t sum = 0;
	int i;

	/* IPv6 source address */
	w = (const uint16_t *)src;
	for (i = 0; i < 8; i++) {
		sum += w[i];
	}

	/* IPv6 destination address */
	w = (const uint16_t *)dst;
	for (i = 0; i < 8; i++) {
		sum += w[i];
	}

	/* caller-supplied proto/length word */
	sum += x;

	/* fold in carry bits (equivalent to ADDCARRY) */
	while ((sum >> 16) != 0) {
		sum = (sum >> 16) + (sum & 0xffff);
	}

	return sum;
}
1631 
/*
 * Return a monotonically increasing IPv4 identification value,
 * starting at 0 and wrapping at 16 bits.
 *
 * Fix: the counter was a 'static int', whose increment eventually
 * hits signed-integer overflow (undefined behavior); a uint16_t
 * counter wraps with defined behavior and returns the same sequence.
 * Not thread-safe, which is fine for this single-threaded test code.
 */
uint16_t
sktu_ip_id(void)
{
	static uint16_t next_ip_id;
	return next_ip_id++;
}
1638 
1639 void
sktu_channel_port_init(channel_port_t ch_port,uuid_t instance,nexus_port_t nx_port,bool enable_upp,bool enable_event_ring,bool low_latency)1640 sktu_channel_port_init(channel_port_t ch_port, uuid_t instance,
1641     nexus_port_t nx_port, bool enable_upp, bool enable_event_ring,
1642     bool low_latency)
1643 {
1644 	channel_t       chan;
1645 	nexus_port_t    port = nx_port;
1646 	ring_id_t       ringid;
1647 
1648 	bzero(ch_port, sizeof(*ch_port));
1649 	chan = sktu_channel_create_extended(instance, port,
1650 	    CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL,
1651 	    -1, -1, -1, -1, -1, -1, enable_upp ? 1 : -1, 1,
1652 	    enable_event_ring ? 1 : -1, low_latency ? 1 : -1);
1653 	if (chan == NULL) {
1654 		SKT_LOG("Can't open channel on port %d, %s\n", port,
1655 		    strerror(errno));
1656 		return;
1657 	}
1658 
1659 	T_LOG("Opened port %d\n", port);
1660 
1661 	ch_port->chan = chan;
1662 	ch_port->fd = os_channel_get_fd(chan);
1663 	ch_port->port = port;
1664 	ch_port->user_packet_pool = enable_upp;
1665 
1666 	/* tx ring */
1667 	ringid = os_channel_ring_id(chan, CHANNEL_FIRST_TX_RING);
1668 	ch_port->tx_ring = os_channel_tx_ring(ch_port->chan, ringid);
1669 	assert(ch_port->tx_ring != NULL);
1670 	/* rx ring */
1671 	ringid = os_channel_ring_id(chan, CHANNEL_FIRST_RX_RING);
1672 	ch_port->rx_ring = os_channel_rx_ring(ch_port->chan, ringid);
1673 	assert(ch_port->rx_ring != NULL);
1674 }
1675 
1676 static inline uint16_t
sktu_fold_sum_final(uint32_t sum)1677 sktu_fold_sum_final(uint32_t sum)
1678 {
1679 	sum = (sum >> 16) + (sum & 0xffff);     /* 17-bit */
1680 	sum = (sum >> 16) + (sum & 0xffff);     /* 16-bit + carry */
1681 	sum = (sum >> 16) + (sum & 0xffff);     /* final carry */
1682 	return ~sum & 0xffff;
1683 }
1684 
/*
 * Allocate a packet from the port's user packet pool and copy the
 * frame's bytes into it, spanning as many buflets as needed.  Also
 * propagates the frame's checksum offload request and flow UUID.
 * Returns the finalized packet; asserts on any failure.  Requires
 * the port to have been opened with the user packet pool enabled.
 */
packet_t
sktu_channel_port_frame_to_pkt(channel_port_t port, struct sktu_frame *frame)
{
	int error;
	packet_t pkt;
	void *baddr, *bytes = &frame->bytes[0];
	size_t len = frame->len;
	buflet_t buf, pbuf = NULL;
	uint16_t clen, bdlim, blen, bcnt;

	assert(port->user_packet_pool);

	error = os_channel_packet_alloc(port->chan, &pkt);
	SKTC_ASSERT_ERR(error == 0);
	assert(pkt != 0);

	/* first buflet: reset its offset and learn the per-buflet limit */
	buf = os_packet_get_next_buflet(pkt, NULL);
	assert(buf != NULL);
	error = os_buflet_set_data_offset(buf, 0);
	SKTC_ASSERT_ERR(error == 0);
	bdlim = blen = os_buflet_get_data_limit(buf);
	assert(bdlim != 0);
	bcnt = os_packet_get_buflet_count(pkt);
	/* the packet's total capacity must cover the frame */
	assert(blen * bcnt >= len);
	baddr = os_buflet_get_object_address(buf);
	assert(baddr != NULL);

	error = os_packet_set_link_header_length(pkt, 0);
	SKTC_ASSERT_ERR(error == 0);

	/* copy the frame bytes */
	while (len != 0) {
		if (blen == 0) {
			/* current buflet is full: seal it and move on */
			error = os_buflet_set_data_length(buf, bdlim);
			SKTC_ASSERT_ERR(error == 0);
			pbuf = buf;
			buf = os_packet_get_next_buflet(pkt, pbuf);
			assert(buf != NULL);
			error = os_buflet_set_data_offset(buf, 0);
			SKTC_ASSERT_ERR(error == 0);
			baddr = os_buflet_get_object_address(buf);
			assert(baddr != NULL);
			bdlim = blen = os_buflet_get_data_limit(buf);
		}
		clen = MIN(blen, len);
		memcpy(baddr, bytes, clen);
		len -= clen;
		blen -= clen;
		bytes += clen;
		baddr += clen;
		assert(len == 0 || blen == 0);
	}
	if (frame->csum_flags != 0) {
		os_packet_set_inet_checksum(pkt, frame->csum_flags,
		    frame->csum_start, frame->csum_stuff);
	}
	/*
	 * Set the final buflet's length: the whole frame if it fit in
	 * one buflet (pbuf still NULL), else the last partial copy.
	 */
	if (pbuf == NULL) {
		error = os_buflet_set_data_length(buf, frame->len);
	} else {
		error = os_buflet_set_data_length(buf, clen);
	}
	SKTC_ASSERT_ERR(error == 0);

	os_packet_set_flow_uuid(pkt, frame->flow_uuid);
	error = os_packet_finalize(pkt);
	SKTC_ASSERT_ERR(error == 0);
	return pkt;
}
1753 
1754 int
sktu_channel_port_tx(channel_port_t port,packet_t pkt)1755 sktu_channel_port_tx(channel_port_t port, packet_t pkt)
1756 {
1757 	int error;
1758 	slot_prop_t prop;
1759 	channel_slot_t slot;
1760 
1761 	slot = os_channel_get_next_slot(port->tx_ring, NULL, &prop);
1762 	if (slot == NULL) {
1763 		return ENOENT;
1764 	}
1765 	error = os_channel_slot_attach_packet(port->tx_ring, slot, pkt);
1766 	SKTC_ASSERT_ERR(error == 0);
1767 	error = os_channel_advance_slot(port->tx_ring, slot);
1768 	SKTC_ASSERT_ERR(error == 0);
1769 	return 0;
1770 }
1771 
1772 /*
1773  * Burst Tx tries to tx as many it can in one shot.
1774  *
1775  * Returns number of actually completed Tx.
1776  */
1777 uint32_t
sktu_channel_port_tx_burst_pkt(channel_port_t port,packet_t * pkts,uint32_t n)1778 sktu_channel_port_tx_burst_pkt(channel_port_t port, packet_t *pkts,
1779     uint32_t n)
1780 {
1781 	struct timespec timeout = {
1782 		.tv_sec = 10,
1783 		.tv_nsec = 0,
1784 	};
1785 	struct kevent evlist, kev;
1786 	int kq;
1787 	int error;
1788 	uint32_t i;
1789 
1790 	kq = kqueue();
1791 	assert(kq != -1);
1792 
1793 	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
1794 	error = kevent(kq, &kev, 1, NULL, 0, NULL);
1795 	SKTC_ASSERT_ERR(error == 0);
1796 
1797 	/* wait for Tx to become available */
1798 	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
1799 	if (error <= 0) {
1800 		if (errno == EAGAIN) {
1801 			return 0;
1802 		}
1803 		SKTC_ASSERT_ERR(error == 0);
1804 	}
1805 	if (error == 0) {
1806 		T_LOG("kevent timeout\n");
1807 		return 0;
1808 	}
1809 	if (evlist.flags & EV_ERROR) {
1810 		int err = evlist.data;
1811 		if (err == EAGAIN) {
1812 			return 0;
1813 		}
1814 		SKTC_ASSERT_ERR(err == 0);
1815 	}
1816 
1817 	if (evlist.filter != EVFILT_WRITE) {
1818 		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
1819 	}
1820 
1821 	for (i = 0; i < n; i++) {
1822 		error = sktu_channel_port_tx(port, pkts[i]);
1823 		if (error != 0) {
1824 			break;
1825 		}
1826 	}
1827 
1828 	if (i != 0) {
1829 		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
1830 		SKTC_ASSERT_ERR(error == 0);
1831 	}
1832 
1833 	return i;
1834 }
1835 
1836 /*
1837  * Burst Tx tries to tx as many it can in one shot.
1838  *
1839  * Returns number of actually completed Tx.
1840  */
1841 uint32_t
sktu_channel_port_tx_burst(channel_port_t port,struct sktu_frame ** frames,uint32_t n)1842 sktu_channel_port_tx_burst(channel_port_t port, struct sktu_frame **frames,
1843     uint32_t n)
1844 {
1845 	struct timespec timeout = {
1846 		.tv_sec = 10,
1847 		.tv_nsec = 0,
1848 	};
1849 	struct kevent evlist, kev;
1850 	int kq;
1851 	int error;
1852 	uint32_t i;
1853 	packet_t pkt;
1854 
1855 	kq = kqueue();
1856 	assert(kq != -1);
1857 
1858 	EV_SET(&kev, port->fd, EVFILT_WRITE, EV_ADD | EV_ENABLE, 0, 0, NULL);
1859 	error = kevent(kq, &kev, 1, NULL, 0, NULL);
1860 	SKTC_ASSERT_ERR(error == 0);
1861 
1862 	/* wait for Tx to become available */
1863 	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
1864 	if (error <= 0) {
1865 		if (errno == EAGAIN) {
1866 			return 0;
1867 		}
1868 		SKTC_ASSERT_ERR(error == 0);
1869 	}
1870 	if (error == 0) {
1871 		T_LOG("kevent timeout\n");
1872 		return 0;
1873 	}
1874 	if (evlist.flags & EV_ERROR) {
1875 		int err = evlist.data;
1876 		if (err == EAGAIN) {
1877 			return 0;
1878 		}
1879 		SKTC_ASSERT_ERR(err == 0);
1880 	}
1881 
1882 	if (evlist.filter != EVFILT_WRITE) {
1883 		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
1884 	}
1885 
1886 	for (i = 0; i < n; i++) {
1887 		pkt = sktu_channel_port_frame_to_pkt(port, frames[i]);
1888 		error = sktu_channel_port_tx(port, pkt);
1889 		if (error != 0) {
1890 			break;
1891 		}
1892 	}
1893 
1894 	if (i != 0) {
1895 		error = os_channel_sync(port->chan, CHANNEL_SYNC_TX);
1896 		SKTC_ASSERT_ERR(error == 0);
1897 	}
1898 
1899 	return i;
1900 }
1901 
1902 /*
1903  * Bulk Tx makes sure all Tx operations are completed; otherwise fails the test.
1904  */
1905 void
sktu_channel_port_tx_bulk(channel_port_t port,struct sktu_frame ** frames,uint32_t n)1906 sktu_channel_port_tx_bulk(channel_port_t port, struct sktu_frame **frames,
1907     uint32_t n)
1908 {
1909 	uint32_t ret = 0;
1910 	ret = sktu_channel_port_tx_burst(port, frames, n);
1911 	assert(ret < n);
1912 	if (ret != n) {
1913 		errx(EX_OSERR, "tx bulk failed %u/%u", n, ret);
1914 	}
1915 }
1916 
/*
 * Validate a plain IPv4 frame (total length and header checksum) and
 * optionally copy its payload into 'ip_payload'.  Always stores the
 * payload length via 'ip_payload_len' and returns 0; aborts the test
 * on a bad IP header checksum.  Uses sizeof(struct ip) as the header
 * length, i.e. frames with IP options are not handled.
 */
int
sktu_parse_ipv4_frame(struct sktu_frame *frame, void *ip_payload,
    uint32_t *ip_payload_len)
{
	size_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = (struct ip*)buf;
	pkt_len = frame->len;
	/* frame length must match the IP total-length field */
	assert(pkt_len == ntohs(ip->ip_len));
	payload_len = pkt_len - sizeof(*ip);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* verify ip header checksum */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	if (ip_payload != NULL) {     /* copy the data */
		memcpy(ip_payload, buf + sizeof(*ip), pkt_len - sizeof(*ip));
	}

	*ip_payload_len = payload_len;
	return 0;
}
1947 
/*
 * Validate an IPv4/TCP frame — IP header checksum and TCP checksum
 * including the pseudo-header — and optionally copy the TCP payload
 * into 'tcp_payload'.  Stores the payload length via
 * 'tcp_payload_len'.  Returns 0 on success, EINVAL if the frame is
 * not TCP, -1 on a bad TCP checksum; aborts the test on a bad IP
 * header checksum.  Assumes fixed-size IP and TCP headers (no
 * options), per ip_tcp_header_t.
 */
int
sktu_parse_tcp4_frame(struct sktu_frame *frame, void *tcp_payload,
    uint32_t *tcp_payload_len)
{
	uint32_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_tcp_header_t *ip_tcp;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_tcp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_TCP) {
		sktu_dump_buffer(stderr, "non-TCP packet", buf, pkt_len);
		return EINVAL;
	}
	/* frame length must match the IP total-length field */
	assert(pkt_len == ntohs(ip_tcp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_tcp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* checksum over the IP header must verify to zero */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	/* TCP checksum over the segment plus the IPv4 pseudo-header */
	csum = os_inet_checksum(&ip_tcp->tcp, pkt_len - sizeof(struct ip), 0);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    csum + htonl(payload_len + sizeof(struct tcphdr) + IPPROTO_TCP));
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, "invalid TCP csum", buf, pkt_len);
		return -1;
	}

	if (tcp_payload != NULL) {     /* copy the data */
		memcpy(tcp_payload, buf + sizeof(*ip_tcp), payload_len);
	}

	*tcp_payload_len = payload_len;

	return 0;
}
1993 
int
sktu_parse_udp4_frame(struct sktu_frame *frame, void *udp_payload,
    uint32_t *udp_payload_len)
{
	size_t pkt_len, payload_len;
	void *buf;
	struct ip *ip;
	ip_udp_header_t *ip_udp;
	uint16_t csum;

	buf = &frame->bytes[0];
	ip = buf;
	ip_udp = buf;
	pkt_len = frame->len;
	if (ip->ip_p != IPPROTO_UDP) {
		sktu_dump_buffer(stderr,
		    "sktu_parse_udp4_frame: non-UDP packet", buf, pkt_len);
		return EINVAL;
	}
	/* frame length must match the IP header's declared total length */
	assert(pkt_len == ntohs(ip_udp->ip.ip_len));
	payload_len = pkt_len - sizeof(ip_udp_header_t);
	assert(payload_len <= SKTU_FRAME_BUF_SIZE);

	/* a valid IP header checksums to zero over itself */
	csum = in_cksum(ip, sizeof(*ip), 0);
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		errx(EX_PROTOCOL, "IP header checksum invalid");
	}

	/* a zero UDP checksum means "no checksum" for IPv4 (RFC 768) */
	if (ip_udp->udp.uh_sum == 0) {
		goto skip_udp_checksum;
	}

	/*
	 * Verify the UDP checksum: sum the UDP datagram, add the
	 * pseudo-header length + protocol term, then the addresses.
	 */
	csum = os_inet_checksum(&ip_udp->udp, pkt_len - sizeof(struct ip), 0);
	csum += htons(payload_len + sizeof(struct udphdr) + IPPROTO_UDP);
	csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, csum);
	csum ^= 0xffff;
	if (csum != 0) {
		sktu_dump_buffer(stderr, __func__, buf, pkt_len);
		return -1;
	}

skip_udp_checksum:
	if (udp_payload != NULL) {
		memcpy(udp_payload, buf + sizeof(*ip_udp), payload_len);
	}

	*udp_payload_len = payload_len;

	return 0;
}
2045 
2046 /*
2047  * Rx once from an available ring;
2048  * Return 0, if successful; non-zero, otherwise.
2049  */
2050 struct sktu_frame *
sktu_channel_port_rx(channel_port_t port)2051 sktu_channel_port_rx(channel_port_t port)
2052 {
2053 	int error;
2054 	slot_prop_t prop;
2055 	channel_slot_t slot;
2056 	struct sktu_frame *frame;
2057 	packet_t pkt;
2058 	void *addr, *buf;
2059 	size_t buf_len;
2060 	size_t frame_length;
2061 	buflet_t buflet;
2062 
2063 	slot = os_channel_get_next_slot(port->rx_ring, NULL, &prop);
2064 	if (slot == NULL) {
2065 		return NULL;
2066 	}
2067 	assert(prop.sp_buf_ptr != 0);
2068 
2069 	frame = sktu_frame_alloc();
2070 
2071 	pkt = os_channel_slot_get_packet(port->rx_ring, slot);
2072 	assert(pkt != 0);
2073 	if (port->user_packet_pool) {
2074 		error = os_channel_slot_detach_packet(port->rx_ring,
2075 		    slot, pkt);
2076 		SKTC_ASSERT_ERR(error == 0);
2077 	}
2078 
2079 	buflet = os_packet_get_next_buflet(pkt, NULL);
2080 	assert(buflet != NULL);
2081 	buf = os_buflet_get_object_address(buflet) +
2082 	    os_buflet_get_data_offset(buflet);
2083 	frame_length = os_packet_get_data_length(pkt);
2084 
2085 	buflet = os_packet_get_next_buflet(pkt, NULL);
2086 	assert(buflet != NULL);
2087 	buf = os_buflet_get_object_address(buflet) +
2088 	    os_buflet_get_data_offset(buflet);
2089 	buf_len = os_buflet_get_data_length(buflet);
2090 	assert(buf_len < SKTU_FRAME_BUF_SIZE);
2091 
2092 	frame->len = os_packet_get_data_length(pkt);
2093 
2094 	addr = &frame->bytes[0];
2095 	memcpy(addr, buf, buf_len);
2096 	frame_length -= buf_len;
2097 
2098 	while (frame_length != 0) {
2099 		buflet = os_packet_get_next_buflet(pkt, buflet);
2100 		assert(buflet != NULL);
2101 		buf = os_buflet_get_object_address(buflet) +
2102 		    os_buflet_get_data_offset(buflet);
2103 		assert(buf != 0);
2104 		buf_len = os_buflet_get_data_length(buflet);
2105 		assert(buf_len != 0);
2106 		memcpy(addr, buf, buf_len);
2107 		addr += buf_len;
2108 		frame_length -= buf_len;
2109 	}
2110 
2111 	os_packet_get_flow_uuid(pkt, &frame->flow_uuid);
2112 	error = os_channel_packet_free(port->chan, pkt);
2113 
2114 	error = os_channel_advance_slot(port->rx_ring, slot);
2115 	SKTC_ASSERT_ERR(error == 0);
2116 
2117 	return frame;
2118 }
2119 
uint32_t
sktu_channel_port_rx_burst(channel_port_t port, struct sktu_frame **frames,
    uint32_t n)
{
	struct timespec timeout = {
		.tv_sec = 10,
		.tv_nsec = 0,
	};

	int error;
	struct kevent evlist, kev;
	int kq;
	uint32_t i = 0;

	kq = kqueue();
	assert(kq != -1);

	EV_SET(&kev, port->fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	error = kevent(kq, &kev, 1, NULL, 0, NULL);
	SKTC_ASSERT_ERR(error == 0);

	/* wait for RX to become available */
	error = kevent(kq, NULL, 0, &evlist, 1, &timeout);
	if (error < 0) {
		/*
		 * BUG FIX: errno is only meaningful when kevent() fails
		 * (returns -1); it was previously consulted on timeout too,
		 * where a stale EAGAIN could cause a spurious early return.
		 */
		if (errno == EAGAIN) {
			goto done; /* BUG FIX: kq was leaked here */
		}
		SKTC_ASSERT_ERR(error == 0);
	}
	if (error == 0) {
		T_LOG("kevent timeout\n");
		goto done; /* BUG FIX: kq was leaked here */
	}
	if (evlist.flags & EV_ERROR) {
		int err = evlist.data;
		if (err == EAGAIN) {
			goto done; /* BUG FIX: kq was leaked here */
		}
		SKTC_ASSERT_ERR(err == 0);
	}

	if (evlist.filter != EVFILT_READ) {
		err(EX_OSERR, "%lu event %d?\n", evlist.ident, evlist.filter);
	}

	/* drain up to n frames from the ring */
	for (i = 0; i < n; i++) {
		frames[i] = sktu_channel_port_rx(port);
		if (frames[i] == NULL) {
			break;
		}
	}

	if (i != 0) {
		error = os_channel_sync(port->chan, CHANNEL_SYNC_RX);
		SKTC_ASSERT_ERR(error == 0);
	}

done:
	close(kq);

	return i;
}
2181 
2182 void
sktu_channel_port_rx_bulk(channel_port_t port,struct sktu_frame ** frames,uint32_t n)2183 sktu_channel_port_rx_bulk(channel_port_t port, struct sktu_frame **frames,
2184     uint32_t n)
2185 {
2186 	uint32_t ret = 0;
2187 	ret = sktu_channel_port_rx_burst(port, frames, n);
2188 	assert(ret < n);
2189 	if (ret != n) {
2190 		errx(EX_OSERR, "rx bulk failed, %u/%u packets", n, ret);
2191 	}
2192 }
2193 
2194 /*
2195  * Received batch of frames from utun file descriptor.
2196  *
2197  * Returns number of frames actually received.
2198  */
2199 uint32_t
sktu_utun_fd_rx_burst(int utun_fd,struct sktu_frame ** frames,uint32_t n)2200 sktu_utun_fd_rx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
2201 {
2202 	struct timeval timeout = {
2203 		.tv_sec = 10,
2204 		.tv_usec = 0,
2205 	};
2206 
2207 	fd_set readfds, errorfds;
2208 	int retval;
2209 
2210 	FD_ZERO(&readfds);
2211 	FD_ZERO(&errorfds);
2212 	FD_SET(utun_fd, &readfds);
2213 	FD_SET(utun_fd, &errorfds);
2214 
2215 	retval = select(utun_fd + 1, &readfds, NULL, &errorfds, &timeout);
2216 	if (retval == -1) {
2217 		err(EX_OSERR, "select()");
2218 	}
2219 
2220 	if (!FD_ISSET(utun_fd, &readfds) && retval == 0) { // timeout
2221 		T_LOG("recv timeout\n");
2222 		return 0;
2223 	}
2224 	assert(!FD_ISSET(utun_fd, &errorfds));
2225 	assert(retval == 1);
2226 
2227 	if (!FD_ISSET(utun_fd, &readfds)) {
2228 		errx(EX_OSERR, "fd selected but no read fd available");
2229 	}
2230 
2231 	uint32_t i = 0;
2232 	for (i = 0; i < n; i++) {
2233 		struct {
2234 			uint32_t af;
2235 			char bytes[SKTU_FRAME_BUF_SIZE];
2236 		} utun_packet;
2237 		ssize_t len;
2238 		len = read(utun_fd, &utun_packet, sizeof(utun_packet));
2239 		if (len < 1) {
2240 			errx(EX_OSERR, "utun read 0 len");
2241 		}
2242 		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
2243 		memcpy(frame->bytes, &utun_packet.bytes, len - sizeof(uint32_t));
2244 		frame->len = len - sizeof(uint32_t);
2245 	}
2246 
2247 	return i;
2248 }
2249 
2250 void
sktu_utun_fd_tx_burst(int utun_fd,struct sktu_frame ** frames,uint32_t n)2251 sktu_utun_fd_tx_burst(int utun_fd, struct sktu_frame **frames, uint32_t n)
2252 {
2253 	struct timeval timeout = {
2254 		.tv_sec = 10,
2255 		.tv_usec = 0,
2256 	};
2257 	fd_set writefds, errorfds;
2258 	int retval;
2259 
2260 	FD_ZERO(&writefds);
2261 	FD_ZERO(&errorfds);
2262 	FD_SET(utun_fd, &writefds);
2263 	FD_SET(utun_fd, &errorfds);
2264 
2265 	retval = select(utun_fd + 1, NULL, &writefds, &errorfds, &timeout);
2266 	if (retval == -1) {
2267 		err(EX_OSERR, "select()");
2268 	}
2269 
2270 	if (!FD_ISSET(utun_fd, &writefds) && retval == 0) { // timeout
2271 		err(EX_OSERR, "recv timeout\n");
2272 	}
2273 
2274 	assert(!FD_ISSET(utun_fd, &errorfds));
2275 	assert(retval == 1);
2276 
2277 	if (!FD_ISSET(utun_fd, &writefds)) {
2278 		errx(EX_OSERR, "fd selected but no write fd available");
2279 	}
2280 
2281 	uint32_t i = 0;
2282 	for (i = 0; i < n; i++) {
2283 		struct sktu_frame *frame = frames[i];
2284 		struct ip *ip = (void *)&frame->bytes[0];
2285 		uint32_t af;
2286 		switch (ip->ip_v) {
2287 		case IPVERSION:
2288 			af = htonl(AF_INET);
2289 			break;
2290 		case IPV6_VERSION:
2291 			af = htonl(AF_INET6);
2292 			break;
2293 		default:
2294 			assert("unrecoginzed IP version");
2295 			__builtin_unreachable();
2296 			break;
2297 		}
2298 		struct {
2299 			uint32_t af;
2300 			char bytes[SKTU_FRAME_BUF_SIZE];
2301 		} utun_packet;
2302 		memcpy(&utun_packet.af, &af, sizeof(af));
2303 		memcpy(&utun_packet.bytes, &frame->bytes[0], frame->len);
2304 		ssize_t write_len = frame->len + sizeof(uint32_t);
2305 		T_LOG("%s writing frame len %zu\n", __func__, write_len);
2306 		ssize_t len = write(utun_fd, &utun_packet, write_len);
2307 		if (len != write_len) {
2308 			err(EX_OSERR, "utun write error\n");
2309 		}
2310 	}
2311 }
2312 
2313 struct sktu_frame *
sktu_frame_alloc()2314 sktu_frame_alloc()
2315 {
2316 	return malloc(sizeof(struct sktu_frame));
2317 }
2318 
/*
 * Free a single frame and null the caller's pointer variable to guard
 * against use-after-free/double-free.  Macro (not a function) so that
 * the caller's own lvalue is reset.
 */
#define sktu_frame_free(frame) \
do { \
	free(frame); \
	frame = NULL; \
} while (0)

/*
 * Free an array of n frames; every entry is NULL on return.
 * (The explicit frames[i] = NULL is redundant with the macro's reset but
 * kept for clarity.)
 */
void
sktu_frames_free(struct sktu_frame **frames, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		sktu_frame_free(frames[i]);
		frames[i] = NULL;
	}
}
2333 
/*
 * Build one or more IPv4 frames carrying `sdu` (already including any
 * transport header), fragmenting at `mtu` when necessary.  Fills at most
 * n entries of `frames` (each allocated here; caller frees) and returns
 * the number of frames produced.  csum_* describe partial-checksum
 * offload metadata relative to the start of the IP payload.
 * Note: sdu_len == 0 produces no frames and returns 0.
 */
size_t
sktu_create_ip_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id(); /* shared by all fragments */
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip *ip = (struct ip *)baddr;
		size_t dlen;
		bool more_frag = false;

		/* payload capacity of this frame */
		dlen = mtu - sizeof(*ip);
		if (dlen >= remaining_sdu_len) {
			dlen = remaining_sdu_len;
			needs_frag = false;
			more_frag = false;
		} else {
			/* fragment offsets are in 8-byte units */
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		memset(ip, 0, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(struct ip) >> 2;
		ip->ip_ttl = MAXTTL;
		ip->ip_p = proto;
		memcpy(&ip->ip_src, src_ip, sizeof(struct in_addr));
		memcpy(&ip->ip_dst, dst_ip, sizeof(struct in_addr));
		ip->ip_len = htons(sizeof(*ip) + dlen);
		ip->ip_id = htons(ip_id);
		ip->ip_off = ((off >> 3) & IP_OFFMASK);
		if (more_frag) {
			ip->ip_off |= IP_MF;
		}
		ip->ip_off = htons(ip->ip_off);

		/* compute the IP header checksum */
		ip->ip_sum = in_cksum(ip, sizeof(*ip), 0);
		baddr += sizeof(*ip);

		memcpy(baddr, sdu + off, dlen);

		/* offload offsets are relative to the IP payload */
		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip) + csum_start;
		frame->csum_stuff = sizeof(*ip) + csum_stuff;

		frame->len = sizeof(*ip) + dlen;

		off += dlen;
		remaining_sdu_len -= dlen;
		i++;
	}

	return i;
}
2401 
/*
 * IPv6 counterpart of sktu_create_ip_frames(): wrap `sdu` in IPv6
 * header(s), adding a fragment extension header when the SDU exceeds the
 * MTU.  Fills at most n entries of `frames` (allocated here; caller
 * frees) and returns the number of frames produced.
 */
size_t
sktu_create_ip6_frames(struct sktu_frame **frames, size_t n,
    void *src_ip, void *dst_ip, uint8_t proto, const void *sdu, size_t sdu_len,
    size_t mtu, uint16_t csum_flags, uint16_t csum_start, uint16_t csum_stuff)
{
	size_t off = 0, remaining_sdu_len = sdu_len;
	size_t i = 0;
	uint16_t ip_id = sktu_ip_id(); /* fragment identification */
	bool needs_frag = false;

	while (remaining_sdu_len > 0) {
		assert(i < n);

		struct sktu_frame *frame = frames[i] = sktu_frame_alloc();
		char *baddr = &frame->bytes[0];
		struct ip6_hdr *ip6 = (struct ip6_hdr *)baddr;
		size_t hlen = sizeof(*ip6);
		size_t plen, dlen; /* plen: IPv6 payload length field */
		bool more_frag = false;

		dlen = mtu - hlen;
		if (dlen >= remaining_sdu_len) {
			// fits in one packet
			dlen = plen = remaining_sdu_len;
			remaining_sdu_len = 0;
			more_frag = false;
		} else {
			// need to fragment
			dlen -= sizeof(struct ip6_frag);
			dlen = dlen & ~0x7; // round down to 8-byte multiple
			plen = sizeof(struct ip6_frag) + dlen;
			remaining_sdu_len -= dlen;
			needs_frag = true;
			more_frag = true;
		}

		// can't handle fragmented csum offload
		assert(!(needs_frag && csum_flags != 0));

		// insert ipv6 header
		memset(ip6, 0, sizeof(*ip6));
		ip6->ip6_vfc = (IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_plen = htons(plen);
		ip6->ip6_nxt = needs_frag ? IPPROTO_FRAGMENT : proto;
		ip6->ip6_hlim = IPV6_DEFHLIM;
		memcpy(&ip6->ip6_src, src_ip, sizeof(struct in6_addr));
		memcpy(&ip6->ip6_dst, dst_ip, sizeof(struct in6_addr));

		baddr += sizeof(*ip6);

		// insert ipv6 frag header
		if (needs_frag) {
			struct ip6_frag *ip6f = (struct ip6_frag *)baddr;
			ip6f->ip6f_nxt = proto;
			ip6f->ip6f_reserved = 0;
			/* off is already a byte offset, multiple of 8 */
			ip6f->ip6f_offlg = htons(off);
			if (more_frag) {
				ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
			}
			ip6f->ip6f_ident = htonl(ip_id);

			hlen += sizeof(*ip6f);
			baddr += sizeof(*ip6f);
		}

		memcpy(baddr, sdu + off, dlen);

		/* offload offsets are relative to the fixed IPv6 header */
		frame->csum_flags = csum_flags;
		frame->csum_start = sizeof(*ip6) + csum_start;
		frame->csum_stuff = sizeof(*ip6) + csum_stuff;
		frame->len = hlen + dlen;

		off += dlen;
		i++;
	}

	return i;
}
2480 
2481 size_t
sktu_create_tcp_frames(struct sktu_frame ** frames,size_t n,uint8_t ipver,void * src_ip,void * dst_ip,uint16_t sport,uint16_t dport,const void * data,size_t data_len,size_t mtu,bool csum_offload)2482 sktu_create_tcp_frames(struct sktu_frame **frames, size_t n,
2483     uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
2484     const void *data, size_t data_len, size_t mtu, bool csum_offload)
2485 {
2486 	uint32_t n_frames;
2487 	size_t sdu_len = data_len + sizeof(struct tcphdr);
2488 	void *sdu = malloc(sdu_len);
2489 
2490 	// populate header
2491 	struct tcphdr *tcp = (struct tcphdr *)sdu;
2492 	tcp->th_sport = htons(sport);
2493 	tcp->th_dport = htons(dport);
2494 	tcp->th_flags |= 0; //FIXME (connect ? TH_SYN : TH_RST);
2495 	tcp->th_off = (sizeof(struct tcphdr)) >> 2;
2496 
2497 	// copy payload
2498 	memcpy(sdu + sizeof(*tcp), data, data_len);
2499 
2500 	// compute checksum
2501 	uint16_t sum = 0;
2502 
2503 	if (ipver == IPVERSION) {
2504 		sum = in_pseudo(*(uint32_t*)src_ip, *(uint32_t*)dst_ip,
2505 		    htons(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
2506 	} else {
2507 		sum = in6_pseudo(src_ip, dst_ip,
2508 		    htonl(data_len + sizeof(struct tcphdr) + IPPROTO_TCP));
2509 	}
2510 	tcp->th_sum = sum;
2511 
2512 	uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
2513 	if (csum_offload) {
2514 		csum_flags = PACKET_CSUM_PARTIAL;
2515 		csum_start = 0;
2516 		csum_stuff = offsetof(struct tcphdr, th_sum);
2517 	} else {
2518 		sum = os_inet_checksum(sdu, sdu_len, 0);
2519 		tcp->th_sum = sktu_fold_sum_final(sum);
2520 	}
2521 
2522 	// IP framing
2523 	if (ipver == IPVERSION) {
2524 		n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
2525 		    IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
2526 		    csum_stuff);
2527 	} else {
2528 		n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
2529 		    IPPROTO_TCP, sdu, sdu_len, mtu, csum_flags, csum_start,
2530 		    csum_stuff);
2531 	}
2532 
2533 	free(sdu);
2534 
2535 	return n_frames;
2536 }
2537 
/*
 * Build UDP/IP frames carrying `data`: construct a UDP header + payload
 * SDU, checksum it (inline or via partial-checksum offload metadata),
 * then hand it to the v4/v6 IP framer.  Returns the number of frames
 * produced (allocated into `frames`; caller frees).
 */
size_t
sktu_create_udp_frames(struct sktu_frame **frames, size_t n,
    uint8_t ipver, void *src_ip, void *dst_ip, uint16_t sport, uint16_t dport,
    const void *data, size_t data_len, size_t mtu, bool csum_offload)
{
	uint32_t n_frames;
	size_t sdu_len = data_len + sizeof(struct udphdr);
	void *sdu = malloc(sdu_len);

	// populate header
	struct udphdr *udp = (struct udphdr *)sdu;
	udp->uh_sport = htons(sport);
	udp->uh_dport = htons(dport);
	udp->uh_ulen = htons(sizeof(*udp) + data_len);

	/*
	 * Compute the pseudo-header sum.
	 * NOTE(review): the checksum length spans sizeof(udp_pseudo) plus
	 * sizeof(struct udphdr), i.e. it reads past the pseudo-header
	 * struct — this only makes sense if ipv4/ipv6_udp_pseudo_hdr is
	 * declared immediately followed by the UDP header, or intentionally
	 * includes it; TODO confirm against the struct definitions.
	 */
	uint32_t payload_sum = 0, pseudo_sum = 0;
	if (ipver == IPVERSION) {
		struct ipv4_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	} else {
		struct ipv6_udp_pseudo_hdr udp_pseudo = {};
		memcpy(&udp_pseudo.src_ip, src_ip, sizeof(struct in6_addr));
		memcpy(&udp_pseudo.dst_ip, dst_ip, sizeof(struct in6_addr));
		udp_pseudo.proto = IPPROTO_UDP;
		udp_pseudo.length = htons(sizeof(struct udphdr) + data_len);
		pseudo_sum = os_inet_checksum(&udp_pseudo, sizeof(udp_pseudo)
		    + sizeof(struct udphdr), 0);
	}

	uint16_t csum_flags = 0, csum_start = 0, csum_stuff = 0;
	if (csum_offload) {
		/* seed uh_sum with the pseudo-sum; hardware finishes it */
		csum_flags = PACKET_CSUM_PARTIAL | PACKET_CSUM_ZERO_INVERT;
		csum_start = 0;
		csum_stuff = offsetof(struct udphdr, uh_sum);
		udp->uh_sum = sktu_fold_sum_final(pseudo_sum);
	} else {
		/* finish the checksum in software */
		payload_sum = os_inet_checksum(data, data_len, 0);
		udp->uh_sum = ~sktu_fold_sum_final(pseudo_sum + payload_sum);
	}

	// copy payload
	memcpy(sdu + sizeof(*udp), data, data_len);

	// IP framing
	if (ipver == IPVERSION) {
		n_frames = sktu_create_ip_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	} else {
		n_frames = sktu_create_ip6_frames(frames, n, src_ip, dst_ip,
		    IPPROTO_UDP, sdu, sdu_len, mtu, csum_flags, csum_start,
		    csum_stuff);
	}

	free(sdu);

	return n_frames;
}
2602 
void
sktu_attach_flow_metadata_to_frames(struct sktu_flow *flow,
    struct sktu_frame **frames, size_t n_frames)
{
	/* Stamp every frame with the flow's UUID. */
	for (size_t idx = 0; idx < n_frames; idx++) {
		struct sktu_frame *frame = frames[idx];
		uuid_copy(frame->flow_uuid, flow->uuid);
	}
}
2612 
2613 static size_t
_sktu_create_udp_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2614 _sktu_create_udp_flow_input_frames(struct sktu_flow *flow,
2615     struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2616 {
2617 	n = sktu_create_udp_frames(frames, n, flow->ipver, flow->dst_ip,
2618 	    flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
2619 	    NO_CSUM_OFFLOAD);
2620 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2621 	return n;
2622 }
2623 
2624 static size_t
_sktu_create_udp_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2625 _sktu_create_udp_flow_output_frames(struct sktu_flow *flow,
2626     struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
2627     bool csum_offload)
2628 {
2629 	n = sktu_create_udp_frames(frames, n, flow->ipver, flow->src_ip,
2630 	    flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
2631 	    csum_offload);
2632 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2633 	return n;
2634 }
2635 
2636 static size_t
_sktu_create_tcp_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2637 _sktu_create_tcp_flow_input_frames(struct sktu_flow *flow,
2638     struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2639 {
2640 	n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->dst_ip,
2641 	    flow->src_ip, flow->dport, flow->sport, data, data_len, flow->mtu,
2642 	    NO_CSUM_OFFLOAD);
2643 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2644 	return n;
2645 }
2646 
2647 static size_t
_sktu_create_tcp_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2648 _sktu_create_tcp_flow_output_frames(struct sktu_flow *flow,
2649     struct sktu_frame **frames, size_t n, const void *data, size_t data_len,
2650     bool csum_offload)
2651 {
2652 	n = sktu_create_tcp_frames(frames, n, flow->ipver, flow->src_ip,
2653 	    flow->dst_ip, flow->sport, flow->dport, data, data_len, flow->mtu,
2654 	    csum_offload);
2655 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2656 	return n;
2657 }
2658 
2659 static size_t
_sktu_create_ip_flow_input_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len)2660 _sktu_create_ip_flow_input_frames(struct sktu_flow *flow,
2661     struct sktu_frame **frames, size_t n, const void *data, size_t data_len)
2662 {
2663 	n = sktu_create_ip_frames(frames, n, flow->dst_ip, flow->src_ip,
2664 	    flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
2665 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2666 	return n;
2667 }
2668 
2669 static size_t
_sktu_create_ip_flow_output_frames(struct sktu_flow * flow,struct sktu_frame ** frames,size_t n,const void * data,size_t data_len,bool csum_offload)2670 _sktu_create_ip_flow_output_frames(struct sktu_flow *flow,
2671     struct sktu_frame **frames, size_t n, const void *data,
2672     size_t data_len, bool csum_offload)
2673 {
2674 	n = sktu_create_ip_frames(frames, n, flow->src_ip, flow->dst_ip,
2675 	    flow->ip_protocol, data, data_len, flow->mtu, 0, 0, 0);
2676 	sktu_attach_flow_metadata_to_frames(flow, frames, n);
2677 	return n;
2678 }
2679 
#define SKTU_STRING_BUF_MAX 2048
/*
 * Render a flow request as a human-readable one-liner.  Returns a pointer
 * to a static buffer (overwritten on each call; not thread-safe).
 */
char *
sktu_nfr_to_string(struct nx_flow_req *nfr)
{
	static char buf[SKTU_STRING_BUF_MAX];
	uuid_string_t uuidstr;
	/*
	 * BUG FIX: these were char[31], too small for an IPv6 literal
	 * (up to 45 chars + NUL), so inet_ntop() failed with ENOSPC and
	 * snprintf() below read uninitialized stack.  INET6_ADDRSTRLEN
	 * covers both address families.
	 */
	char sa_buf[INET6_ADDRSTRLEN];
	char da_buf[INET6_ADDRSTRLEN];

	uuid_unparse(nfr->nfr_flow_uuid, uuidstr);
	if (nfr->nfr_saddr.sa.sa_family == AF_INET) {
		inet_ntop(AF_INET, &nfr->nfr_saddr.sin.sin_addr.s_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET, &nfr->nfr_daddr.sin.sin_addr.s_addr, da_buf,
		    sizeof(da_buf));
	} else {
		inet_ntop(AF_INET6, &nfr->nfr_saddr.sin6.sin6_addr, sa_buf,
		    sizeof(sa_buf));
		inet_ntop(AF_INET6, &nfr->nfr_daddr.sin6.sin6_addr, da_buf,
		    sizeof(da_buf));
	}
	/* sin_port/sin6_port overlap in the sockaddr union, so sin works */
	snprintf(buf, sizeof(buf),
	    "nx_port[%d] %s src=%s,dst=%s,proto=%d,sport=%d,dport=%d, flags=0x%x",
	    nfr->nfr_nx_port, uuidstr, sa_buf, da_buf, nfr->nfr_ip_protocol,
	    ntohs(nfr->nfr_saddr.sin.sin_port),
	    ntohs(nfr->nfr_daddr.sin.sin_port), nfr->nfr_flags);

	return buf;
}
2709 
/*
 * Render `flow`'s request as a one-liner; returns sktu_nfr_to_string()'s
 * static buffer (overwritten on each call; not thread-safe).
 */
char *
sktu_flow_to_string(struct sktu_flow *flow)
{
	return sktu_nfr_to_string(&flow->nfr);
}
2715 
/*
 * Create and register a flow on `nexus`.  Builds a nx_flow_req from the
 * address family, endpoints, protocol and ports, submits it via
 * __os_nexus_flow_add(), and wires up per-protocol frame-builder
 * callbacks.  Returns a heap-allocated flow (caller releases with
 * _sktu_destroy_nexus_flow()), or NULL if the kernel rejected the add.
 * src/dst must point to struct in_addr (AF_INET) or in6_addr (AF_INET6).
 */
struct sktu_flow *
_sktu_create_nexus_flow(sktu_nexus_t nexus, nexus_port_t nx_port,
    uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
    uint16_t dport, uint32_t flags)
{
	struct sktu_flow *flow = malloc(sizeof(*flow));

	memset(flow, 0, sizeof(*flow));
	flow->nexus = nexus;
	flow->mtu = 1500;

	flow->nx_port = nx_port;

	struct nx_flow_req *nfr = &flow->nfr;
	union sockaddr_in_4_6 *saddr = &nfr->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &nfr->nfr_daddr;
	nfr->nfr_nx_port = nx_port;
	if (af == AF_INET) {
		// initialize flow
		flow->ipver = IPVERSION;
		// fill in nfr (stuff in network order :)
		SIN(saddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(daddr)->sin_len = sizeof(struct sockaddr_in);
		SIN(saddr)->sin_family = AF_INET;
		SIN(daddr)->sin_family = AF_INET;
		SIN(saddr)->sin_addr = *(struct in_addr *)src;
		SIN(daddr)->sin_addr = *(struct in_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN(saddr)->sin_port = htons(sport);
		SIN(daddr)->sin_port = htons(dport);
	} else {
		flow->ipver = IPV6_VERSION;
		SIN6(saddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(daddr)->sin6_len = sizeof(struct sockaddr_in6);
		SIN6(saddr)->sin6_family = AF_INET6;
		SIN6(daddr)->sin6_family = AF_INET6;
		SIN6(saddr)->sin6_addr = *(struct in6_addr *)src;
		SIN6(daddr)->sin6_addr = *(struct in6_addr *)dst;
		nfr->nfr_ip_protocol = proto;
		SIN6(saddr)->sin6_port = htons(sport);
		SIN6(daddr)->sin6_port = htons(dport);
	}

	uuid_generate_random(nfr->nfr_flow_uuid);
	nfr->nfr_flags = flags;

	errno = 0;
	int error = __os_nexus_flow_add(nexus->controller, nexus->fsw_nx_uuid, nfr);
	if (error) {
		T_LOG("Failed flow %s\n", sktu_nfr_to_string(nfr));
		free(flow);
		return NULL;
	}

	/* convenience aliases pointing into the embedded nfr */
	if (af == AF_INET) {
		flow->src_ip = &SIN(saddr)->sin_addr;
		flow->dst_ip = &SIN(daddr)->sin_addr;
		flow->sport = ntohs(SIN(saddr)->sin_port);
		flow->dport = ntohs(SIN(daddr)->sin_port);
	} else {
		flow->src_ip = &SIN6(saddr)->sin6_addr;
		flow->dst_ip = &SIN6(daddr)->sin6_addr;
		flow->sport = ntohs(SIN6(saddr)->sin6_port);
		flow->dport = ntohs(SIN6(daddr)->sin6_port);
	}

	flow->ip_protocol = proto;
	uuid_copy(flow->uuid, nfr->nfr_flow_uuid);

	/* pick frame builders matching the flow's transport protocol */
	switch (proto) {
	case IPPROTO_UDP:
		flow->create_input_frames = _sktu_create_udp_flow_input_frames;
		flow->create_output_frames = _sktu_create_udp_flow_output_frames;
		break;
	case IPPROTO_TCP:
		flow->create_input_frames = _sktu_create_tcp_flow_input_frames;
		flow->create_output_frames = _sktu_create_tcp_flow_output_frames;
		break;
	default:
		/* anything else is framed as raw IP */
		flow->create_input_frames = _sktu_create_ip_flow_input_frames;
		flow->create_output_frames = _sktu_create_ip_flow_output_frames;
	}

	/* the kernel must have bound the flow to a concrete port */
	assert(nfr->nfr_nx_port != NEXUS_PORT_ANY);

	T_LOG("Created flow %s\n", sktu_nfr_to_string(nfr));

	return flow;
}
2805 
2806 struct sktu_flow *
sktu_create_nexus_flow(sktu_nexus_t nexus,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2807 sktu_create_nexus_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
2808     uint8_t proto, uint16_t sport, uint16_t dport)
2809 {
2810 	return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, 0);
2811 }
2812 
2813 struct sktu_flow *
sktu_create_nexus_flow_with_nx_port(sktu_nexus_t nexus,nexus_port_t nx_port,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2814 sktu_create_nexus_flow_with_nx_port(sktu_nexus_t nexus, nexus_port_t nx_port,
2815     uint8_t af, void *src, void *dst, uint8_t proto, uint16_t sport,
2816     uint16_t dport)
2817 {
2818 	return _sktu_create_nexus_flow(nexus, nx_port, af, src, dst, proto, sport, dport, 0);
2819 }
2820 
2821 struct sktu_flow *
sktu_create_nexus_low_latency_flow(sktu_nexus_t nexus,uint8_t af,void * src,void * dst,uint8_t proto,uint16_t sport,uint16_t dport)2822 sktu_create_nexus_low_latency_flow(sktu_nexus_t nexus, uint8_t af, void *src, void *dst,
2823     uint8_t proto, uint16_t sport, uint16_t dport)
2824 {
2825 	return _sktu_create_nexus_flow(nexus, NEXUS_PORT_ANY, af, src, dst, proto, sport, dport, NXFLOWREQF_LOW_LATENCY);
2826 }
2827 
2828 void
_sktu_destroy_nexus_flow(struct sktu_flow * flow)2829 _sktu_destroy_nexus_flow(struct sktu_flow *flow)
2830 {
2831 	sktu_nexus_t nexus = flow->nexus;
2832 	struct nx_flow_req *nfr = &flow->nfr;
2833 
2834 	int error = __os_nexus_flow_del(nexus->controller, nexus->fsw_nx_uuid, nfr);
2835 	SKTC_ASSERT_ERR(!error);
2836 	if (error) {
2837 		T_LOG("failed to deling flow %s", sktu_nfr_to_string(nfr));
2838 	}
2839 
2840 	free(flow);
2841 }
2842 
/*
 * Look up the kernel flow-stats entry matching `flow_uuid` and copy it
 * into *sf.  Returns 0 on success, ENOENT if no such flow exists.
 */
int
sktu_get_nexus_flow_stats(uuid_t flow_uuid, struct sk_stats_flow *sf)
{
	size_t length = 0;
	void *buffer = NULL;
	int result = ENOENT;

	int ret = sysctl_buf(SK_STATS_FLOW, &buffer, &length, NULL, 0);
	assert(ret == 0);
	assert(buffer != NULL && length != 0);

	/* the sysctl output must be a whole number of records */
	assert((length % sizeof(*sf)) == 0);

	struct sk_stats_flow *iter;
	for (iter = buffer; (void *)iter < buffer + length; iter++) {
		if (uuid_compare(iter->sf_uuid, flow_uuid) == 0) {
			*sf = *iter;
			result = 0;
			break;
		}
	}

	/*
	 * BUG FIX: the sysctl buffer was leaked on every call (both the
	 * found and not-found paths returned without freeing it).
	 */
	free(buffer);
	return result;
}
2863 
/*
 * Fetch the kernel's flowswitch statistics.  On success, *sfsw points to
 * a heap buffer of *len bytes (a whole number of sk_stats_flow_switch
 * records) that the CALLER must free().  Returns 0 on success, or the
 * sysctl error; exits the process on a malformed record size.
 */
int
sktu_get_nexus_flowswitch_stats(struct sk_stats_flow_switch **sfsw, size_t *len)
{
	int ret;
	void *buffer = NULL;
	size_t length = 0;
	size_t width = sizeof(struct sk_stats_flow_switch);

	ret = sysctl_buf(SK_STATS_FLOW_SWITCH, &buffer, &length, NULL, 0);
	if (ret != 0 || buffer == NULL || length == 0) {
		return ret;
	}
	/* record-size mismatch means a userspace/kernel struct skew */
	if ((length % width) != 0) {
		T_LOG("Error, mismatching sk_stats_flow_switch, quit\n");
		exit(EX_OSERR);
	}

	/* ownership of the sysctl buffer transfers to the caller */
	*sfsw = (struct sk_stats_flow_switch *)buffer;
	*len = length;

	return 0;
}
2886 
2887 void
__fsw_stats_print(struct fsw_stats * s)2888 __fsw_stats_print(struct fsw_stats *s)
2889 {
2890 	int i;
2891 
2892 	for (i = 0; i < __FSW_STATS_MAX; i++) {
2893 		if (STATS_VAL(s, i) == 0) {
2894 			continue;
2895 		}
2896 		os_log(OS_LOG_DEFAULT, "\t%-24s: %llu\n",
2897 		    fsw_stats_str(i), STATS_VAL(s, i));
2898 	}
2899 }
2900