xref: /xnu-8020.140.41/bsd/netinet/cpu_in_cksum_gen.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*-
30  * Copyright (c) 2008 Joerg Sonnenberger <[email protected]>.
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  *
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in
41  *    the documentation and/or other materials provided with the
42  *    distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
45  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
46  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
47  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
48  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
49  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
50  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
51  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
52  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
53  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
54  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55  * SUCH DAMAGE.
56  */
57 
58 #ifdef KERNEL
59 #include <sys/param.h>
60 #include <machine/endian.h>
61 #include <sys/mcache.h>
62 #include <sys/mbuf.h>
63 #include <kern/debug.h>
64 #include <libkern/libkern.h>
65 #include <mach/boolean.h>
66 #include <pexpert/pexpert.h>
67 #define CKSUM_ERR(fmt, args...) kprintf(fmt, ## args)
68 #else /* !KERNEL */
69 #ifndef LIBSYSCALL_INTERFACE
70 #error "LIBSYSCALL_INTERFACE not defined"
71 #endif /* !LIBSYSCALL_INTERFACE */
72 #include <stdlib.h>
73 #include <stddef.h>
74 #include <stdint.h>
75 #include <unistd.h>
76 #include <strings.h>
77 #include <mach/boolean.h>
78 #include <skywalk/os_skywalk_private.h>
79 #define CKSUM_ERR(fmt, args...) fprintf_stderr(fmt, ## args)
80 #endif /* !KERNEL */
81 
82 /* compile time assert */
83 #ifndef _CASSERT
84 #define _CASSERT(x)     _Static_assert(x, "compile-time assertion failed")
85 #endif /* !_CASSERT */
86 
87 #ifndef VERIFY
88 #define VERIFY(EX) ((void)0)
89 #endif /* !VERIFY */
90 
91 #ifndef CKSUM_ERR
92 #define CKSUM_ERR(fmt, args...) ((void)0)
93 #endif /* !CKSUM_ERR */
94 
95 #define PREDICT_TRUE(x)         __builtin_expect(!!((long)(x)), 1L)
96 #define PREDICT_FALSE(x)        __builtin_expect(!!((long)(x)), 0L)
97 
98 /* fake mbuf struct used only for calling os_cpu_in_cksum_mbuf() */
struct _mbuf {
	struct _mbuf    *_m_next;       /* next buffer in chain; NULL terminates */
	void            *_m_pad;        /* placeholder so the offsets below line up with struct mbuf (see _CASSERTs in os_cpu_in_cksum) */
	uint8_t         *_m_data;       /* start of the bytes to checksum */
	int32_t         _m_len;         /* number of valid bytes at _m_data */
};
105 
106 extern uint32_t os_cpu_in_cksum(const void *, uint32_t, uint32_t);
107 extern uint32_t os_cpu_in_cksum_mbuf(struct _mbuf *, int, int, uint32_t);
108 
109 uint32_t
os_cpu_in_cksum(const void * data,uint32_t len,uint32_t initial_sum)110 os_cpu_in_cksum(const void *data, uint32_t len, uint32_t initial_sum)
111 {
112 	/*
113 	 * If data is 4-bytes aligned (conditional), length is multiple
114 	 * of 4-bytes (required), and the amount to checksum is small,
115 	 * this would be quicker; this is suitable for IPv4/TCP header.
116 	 */
117 	if (
118 #if !defined(__arm64__) && !defined(__x86_64__)
119 		IS_P2ALIGNED(data, sizeof(uint32_t)) &&
120 #endif /* !__arm64__ && !__x86_64__ */
121 		len <= 64 && (len & 3) == 0) {
122 		uint8_t *p = __DECONST(uint8_t *, data);
123 		uint64_t sum = initial_sum;
124 
125 		switch (len) {
126 		case 20:                /* simple IPv4 or TCP header */
127 			sum += *(uint32_t *)(void *)p;
128 			sum += *(uint32_t *)(void *)(p + 4);
129 			sum += *(uint32_t *)(void *)(p + 8);
130 			sum += *(uint32_t *)(void *)(p + 12);
131 			sum += *(uint32_t *)(void *)(p + 16);
132 			break;
133 
134 		case 32:                /* TCP header + timestamp option */
135 			sum += *(uint32_t *)(void *)p;
136 			sum += *(uint32_t *)(void *)(p + 4);
137 			sum += *(uint32_t *)(void *)(p + 8);
138 			sum += *(uint32_t *)(void *)(p + 12);
139 			sum += *(uint32_t *)(void *)(p + 16);
140 			sum += *(uint32_t *)(void *)(p + 20);
141 			sum += *(uint32_t *)(void *)(p + 24);
142 			sum += *(uint32_t *)(void *)(p + 28);
143 			break;
144 
145 		default:
146 			while (len) {
147 				sum += *(uint32_t *)(void *)p;
148 				p += 4;
149 				len -= 4;
150 			}
151 			break;
152 		}
153 
154 		/* fold 64-bit to 16-bit (deferred carries) */
155 		sum = (sum >> 32) + (sum & 0xffffffff); /* 33-bit */
156 		sum = (sum >> 16) + (sum & 0xffff);     /* 17-bit + carry */
157 		sum = (sum >> 16) + (sum & 0xffff);     /* 16-bit + carry */
158 		sum = (sum >> 16) + (sum & 0xffff);     /* final carry */
159 
160 		return sum & 0xffff;
161 	}
162 
163 	/*
164 	 * Otherwise, let os_cpu_in_cksum_mbuf() handle it; it only looks
165 	 * at 3 fields: {next,data,len}, and since it doesn't care about
166 	 * the authenticity of the mbuf, we use a fake one here.  Make
167 	 * sure the offsets are as expected.
168 	 */
169 #if defined(__LP64__)
170 	_CASSERT(offsetof(struct _mbuf, _m_next) == 0);
171 	_CASSERT(offsetof(struct _mbuf, _m_data) == 16);
172 	_CASSERT(offsetof(struct _mbuf, _m_len) == 24);
173 #else /* !__LP64__ */
174 	_CASSERT(offsetof(struct _mbuf, _m_next) == 0);
175 	_CASSERT(offsetof(struct _mbuf, _m_data) == 8);
176 	_CASSERT(offsetof(struct _mbuf, _m_len) == 12);
177 #endif /* !__LP64__ */
178 #ifdef KERNEL
179 	_CASSERT(offsetof(struct _mbuf, _m_next) ==
180 	    offsetof(struct mbuf, m_next));
181 	_CASSERT(offsetof(struct _mbuf, _m_data) ==
182 	    offsetof(struct mbuf, m_data));
183 	_CASSERT(offsetof(struct _mbuf, _m_len) ==
184 	    offsetof(struct mbuf, m_len));
185 #endif /* KERNEL */
186 	struct _mbuf m = {
187 		._m_next = NULL,
188 		._m_data = __DECONST(uint8_t *, data),
189 		._m_len = len,
190 	};
191 
192 	return os_cpu_in_cksum_mbuf(&m, len, 0, initial_sum);
193 }
194 
195 #if defined(__i386__) || defined(__x86_64__)
196 
197 /*
198  * Checksum routine for Internet Protocol family headers (Portable Version).
199  *
200  * This routine is very heavily used in the network
201  * code and should be modified for each CPU to be as fast as possible.
202  *
203  * A discussion of different implementation techniques can be found in
204  * RFC 1071.
205  *
206  * The default implementation for 32-bit architectures is using
207  * a 32-bit accumulator and operating on 16-bit operands.
208  *
209  * The default implementation for 64-bit architectures is using
210  * a 64-bit accumulator and operating on 32-bit operands.
211  *
212  * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
213  * of the inner loop. After each iteration of the inner loop, a partial
214  * reduction is done to avoid carry in long packets.
215  */
216 
217 #if !defined(__LP64__)
218 /* 32-bit version */
/*
 * Portable 32-bit implementation: sum "len" bytes of the mbuf chain "m",
 * starting "off" bytes into it, on top of "initial_sum", and return the
 * folded 16-bit result.  Returns (uint32_t)-1 if the chain runs out of
 * data before "len" bytes have been consumed.  Uses a 32-bit accumulator
 * over 16-bit operands, with periodic partial reduction (see header
 * comment above for the overall strategy).
 */
uint32_t
os_cpu_in_cksum_mbuf(struct _mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	/* pre-fold the seed so the 32-bit accumulator starts below 17 bits */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/* Skip the initial "off" bytes, locating the first byte to sum. */
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			CKSUM_ERR("%s: out of data\n", __func__);
			return (uint32_t)-1;
		}
		mlen = m->_m_len;
		if (mlen > off) {
			mlen -= off;
			data = m->_m_data + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0) {
			break;
		}
		m = m->_m_next;
	}

	/* Walk the chain, summing each buffer until "len" bytes are consumed. */
	for (; len > 0; m = m->_m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			CKSUM_ERR("%s: out of data\n", __func__);
			return (uint32_t)-1;
		}
		mlen = m->_m_len;
		data = m->_m_data;
post_initial_offset:
		if (mlen == 0) {
			continue;
		}
		if (mlen > len) {
			mlen = len;
		}
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary.  A buffer that begins on
			 * an odd byte shifts the 16-bit word framing by one
			 * byte; started_on_odd tracks that parity across
			 * buffers so the partial sum can be rotated back.
			 */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		/* Core loop: 32 bytes (16 words) per iteration. */
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			partial += *(uint16_t *)(void *)(data + 16);
			partial += *(uint16_t *)(void *)(data + 18);
			partial += *(uint16_t *)(void *)(data + 20);
			partial += *(uint16_t *)(void *)(data + 22);
			partial += *(uint16_t *)(void *)(data + 24);
			partial += *(uint16_t *)(void *)(data + 26);
			partial += *(uint16_t *)(void *)(data + 28);
			partial += *(uint16_t *)(void *)(data + 30);
			data += 32;
			mlen -= 32;
			/*
			 * If partial is getting close to overflowing,
			 * fold it into sum now (byte-swapping first if
			 * this stretch started on an odd byte).
			 */
			if (PREDICT_FALSE(partial & 0xc0000000)) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 24);
				}
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		if (mlen & 16) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			partial += *(uint16_t *)(void *)(data + 8);
			partial += *(uint16_t *)(void *)(data + 10);
			partial += *(uint16_t *)(void *)(data + 12);
			partial += *(uint16_t *)(void *)(data + 14);
			data += 16;
			mlen -= 16;
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 8) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			partial += *(uint16_t *)(void *)(data + 4);
			partial += *(uint16_t *)(void *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)(void *)data;
			partial += *(uint16_t *)(void *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
		if (mlen & 1) {
			/* odd trailing byte flips the word-framing parity */
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			/* rotate by 8 to undo the odd-byte framing shift */
			partial = (partial << 8) + (partial >> 24);
		}
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	/* final fold to 16 bits, absorbing any remaining carries */
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return final_acc & 0xffff;
}
367 
368 #else /* __LP64__ */
369 /* 64-bit version */
/*
 * Portable 64-bit implementation: sum "len" bytes of the mbuf chain "m",
 * starting "off" bytes into it, on top of "initial_sum", and return the
 * folded 16-bit result.  Returns (uint32_t)-1 if the chain runs out of
 * data before "len" bytes have been consumed.  Uses a 64-bit accumulator
 * over 32-bit operands, with periodic partial reduction (see header
 * comment above for the overall strategy).
 */
uint32_t
os_cpu_in_cksum_mbuf(struct _mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	boolean_t needs_swap, started_on_odd;

	VERIFY(len >= 0);
	VERIFY(off >= 0);

	needs_swap = FALSE;
	started_on_odd = FALSE;
	sum = initial_sum;

	/* Skip the initial "off" bytes, locating the first byte to sum. */
	for (;;) {
		if (PREDICT_FALSE(m == NULL)) {
			CKSUM_ERR("%s: out of data\n", __func__);
			return (uint32_t)-1;
		}
		mlen = m->_m_len;
		if (mlen > off) {
			mlen -= off;
			data = m->_m_data + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0) {
			break;
		}
		m = m->_m_next;
	}

	/* Walk the chain, summing each buffer until "len" bytes are consumed. */
	for (; len > 0; m = m->_m_next) {
		if (PREDICT_FALSE(m == NULL)) {
			CKSUM_ERR("%s: out of data\n", __func__);
			return (uint32_t)-1;
		}
		mlen = m->_m_len;
		data = m->_m_data;
post_initial_offset:
		if (mlen == 0) {
			continue;
		}
		if (mlen > len) {
			mlen = len;
		}
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary.  A buffer that begins on
			 * an odd byte shifts the 16-bit word framing by one
			 * byte; started_on_odd tracks that parity across
			 * buffers so the partial sum can be rotated back.
			 */
			started_on_odd = !started_on_odd;
#if BYTE_ORDER == LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		/* Align to a 4-byte boundary for the 32-bit loads below. */
		if ((uintptr_t)data & 2) {
			if (mlen < 2) {
				goto trailing_bytes;
			}
			partial += *(uint16_t *)(void *)data;
			data += 2;
			mlen -= 2;
		}
		/* Core loop: 64 bytes (16 longwords) per iteration. */
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			partial += *(uint32_t *)(void *)(data + 32);
			partial += *(uint32_t *)(void *)(data + 36);
			partial += *(uint32_t *)(void *)(data + 40);
			partial += *(uint32_t *)(void *)(data + 44);
			partial += *(uint32_t *)(void *)(data + 48);
			partial += *(uint32_t *)(void *)(data + 52);
			partial += *(uint32_t *)(void *)(data + 56);
			partial += *(uint32_t *)(void *)(data + 60);
			data += 64;
			mlen -= 64;
			/*
			 * If partial is getting close to overflowing,
			 * fold it into sum now (byte-swapping first if
			 * this stretch started on an odd byte).
			 */
			if (PREDICT_FALSE(partial & (3ULL << 62))) {
				if (needs_swap) {
					partial = (partial << 8) +
					    (partial >> 56);
				}
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			partial += *(uint32_t *)(void *)(data + 16);
			partial += *(uint32_t *)(void *)(data + 20);
			partial += *(uint32_t *)(void *)(data + 24);
			partial += *(uint32_t *)(void *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			partial += *(uint32_t *)(void *)(data + 8);
			partial += *(uint32_t *)(void *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)(void *)data;
			partial += *(uint32_t *)(void *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)(void *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)(void *)data;
			data += 2;
		}
trailing_bytes:
		if (mlen & 1) {
			/* odd trailing byte flips the word-framing parity */
#if BYTE_ORDER == LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap) {
			/* rotate by 8 to undo the odd-byte framing shift */
			partial = (partial << 8) + (partial >> 56);
		}
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	/* final fold of the four 16-bit lanes, absorbing remaining carries */
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return final_acc & 0xffff;
}
#endif /* __LP64__ */
534 
535 #endif /* __i386__ || __x86_64__ */
536