// xref: /xnu-11417.140.69/bsd/dev/arm64/cpu_in_cksum.s (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO .  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#include <arm64/asm.h>

/* In-kernel builds report errors via kprintf; userland via stderr. */
#define	CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: [email protected]:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
 * authentic; it only cares about 3 fields.
 */
#if defined(__LP64__)
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte address, would be aligned to 8-byte boundary
#define	M_LEN	24
#else
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12
#endif

66	.globl	_os_cpu_in_cksum_mbuf
67	.text
68	.align	4
69_os_cpu_in_cksum_mbuf:
70
71
72/*
73 * 64-bit version.
74 *
75 * This function returns the partial 16-bit checksum accumulated in
76 * a 32-bit variable (withouth 1's complement); caller is responsible
77 * for folding the 32-bit sum into 16-bit and performinng the 1's
78 * complement if applicable
79 */
80
81/*
82 * uint32_t
83 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
84 * {
85 * 	int mlen;
86 * 	uint64_t sum, partial;
87 * 	unsigned int final_acc;
88 * 	uint8_t *data;
89 * 	boolean_t needs_swap, started_on_odd;
90 *
91 * 	VERIFY(len >= 0);
92 * 	VERIFY(off >= 0);
93 *
94 * 	needs_swap = FALSE;
95 * 	started_on_odd = FALSE;
96 * 	sum = initial_sum;
97 */
98
99	#define	m		x0
100	#define	len		x1
101	#define	off		x2
102	#define	sum		x3
103	#define	needs_swap	x4
104	#define	started_on_odd	x5
105	#define	mlen			x6
106	#define	Wmlen			w6
107	#define t       x7
108	#define	data	x8
109#if defined(__LP64__)
110	#define ptr_m		x0
111	#define ptr_data	x8
112#else
113	#define ptr_m		w0
114	#define ptr_data	w8
115#endif
116
117
118#ifdef KERNEL
119	ARM64_PROLOG
120#endif /* KERNEL */
121	mov	needs_swap, #0		// needs_swap = FALSE;
122	mov	started_on_odd, #0	// started_on_odd = FALSE;
123	mov	w3, w3			// clear higher half
124
125
126/*
127 *	for (;;) {
128 *		if (PREDICT_FALSE(m == NULL)) {
129 *			CKSUM_ERR("%s: out of data\n", __func__);
130 *			return (-1);
131 *		}
132 *		mlen = m->m_len;
133 *		if (mlen > off) {
134 *			mlen -= off;
135 *			data = mtod(m, uint8_t *) + off;
136 *			goto post_initial_offset;
137 *		}
138 *		off -= mlen;
139 *		if (len == 0)
140 *			break;
141 *		m = m->m_next;
142 *	}
143 */
144
1450:
146	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
147	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
148	cmp	mlen, off
149	b.le	1f
150	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
151	sub	mlen, mlen, off		// mlen -= off;
152	add	data, data, off		// data = mtod(m, uint8_t *) + off;
153	b	L_post_initial_offset
1541:
155	sub	off, off, mlen
156	cbnz	len, 2f
157	mov	x0, x3
158	ret	lr
1592:
160	ldr	ptr_m, [m, #M_NEXT]
161	b	0b
162
163L_loop:	// for (; len > 0; m = m->m_next) {
164/*
165 *		if (PREDICT_FALSE(m == NULL)) {
166 *			CKSUM_ERR("%s: out of data\n", __func__);
167 *			return (-1);
168 *		}
169 *		mlen = m->m_len;
170 *		data = mtod(m, uint8_t *);
171 */
172	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
173	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
174	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
175
176L_post_initial_offset:
177/*
178 *		if (mlen == 0) continue;
179 *		if (mlen > len) mlen = len;
180 *		len -= mlen;
181 */
182
183	cbz	mlen, L_continue
184	cmp	mlen, len
185	csel	mlen, mlen, len, le
186	sub	len, len, mlen
187
188/*
189 *		partial = 0;
190 *		if ((uintptr_t)data & 1) {
191 *			started_on_odd = !started_on_odd;
192 *			partial = *data << 8;
193 *			++data;
194 *			--mlen;
195 *		}
196 *		needs_swap = started_on_odd;
197 */
198
199	tst	data, #1
200	mov	x7, #0
201	mov	x10, #0
202	b.eq	1f
203	ldrb	w7, [data], #1
204	eor	started_on_odd, started_on_odd, #1
205	sub	mlen, mlen, #1
206	lsl	w7, w7, #8
2071:
208
209
210/*
211 *		if ((uintptr_t)data & 2) {
212 *			if (mlen < 2)
213 *				goto trailing_bytes;
214 *			partial += *(uint16_t *)(void *)data;
215 *			data += 2;
216 *			mlen -= 2;
217 *		}
218 */
219	tst	data, #2
220	mov	needs_swap, started_on_odd
221	b.eq	1f
222	cmp	mlen, #2
223	b.lt	L_trailing_bytes
224	ldrh	w9, [data], #2
225	sub	mlen, mlen, #2
226	add	w7, w7, w9
2271:
228
229/*
230 *		if ((uintptr_t)data & 4) {
231 *			if (mlen < 4)
232 *				goto L2_bytes;
233 *			partial += *(uint32_t *)(void *)data;
234 *			data += 4;
235 *			mlen -= 4;
236 *		}
237 */
238	// align on 8-bytes boundary if applicable
239	tst	data, #4
240	b.eq	1f
241	cmp	mlen, #4
242	b.lt	L2_bytes
243	ldr	w9, [data], #4
244	sub	mlen, mlen, #4
245	adds	w7, w7, w9
246	adc	x7, x7, x10 // assumes x10 still is #0 as set above
2471:
248
249/*
250 *		while (mlen >= 64) {
251 *			__builtin_prefetch(data + 32);
252 *			__builtin_prefetch(data + 64);
253 *			partial += *(uint32_t *)(void *)data;
254 *			partial += *(uint32_t *)(void *)(data + 4);
255 *			partial += *(uint32_t *)(void *)(data + 8);
256 *			partial += *(uint32_t *)(void *)(data + 12);
257 *			partial += *(uint32_t *)(void *)(data + 16);
258 *			partial += *(uint32_t *)(void *)(data + 20);
259 *			partial += *(uint32_t *)(void *)(data + 24);
260 *			partial += *(uint32_t *)(void *)(data + 28);
261 *			partial += *(uint32_t *)(void *)(data + 32);
262 *			partial += *(uint32_t *)(void *)(data + 36);
263 *			partial += *(uint32_t *)(void *)(data + 40);
264 *			partial += *(uint32_t *)(void *)(data + 44);
265 *			partial += *(uint32_t *)(void *)(data + 48);
266 *			partial += *(uint32_t *)(void *)(data + 52);
267 *			partial += *(uint32_t *)(void *)(data + 56);
268 *			partial += *(uint32_t *)(void *)(data + 60);
269 *			data += 64;
270 *			mlen -= 64;
271 *		//	if (PREDICT_FALSE(partial & (3ULL << 62))) {
272 *		//		if (needs_swap)
273 *		//			partial = (partial << 8) +
274 *		//			    (partial >> 56);
275 *		//		sum += (partial >> 32);
276 *		//		sum += (partial & 0xffffffff);
277 *		//		partial = 0;
278 *		//	}
279 *		}
280*/
281
282	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
283	subs	mlen, mlen, #64
284	b.lt	L32_bytes
285
286	// save used vector registers
287	sub	sp, sp, #8*16
288	mov	x11, sp
289	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
290	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
291
292	// spread partial into 8 8-byte registers in v0-v3
293	fmov	s3, w7
294	eor.16b	v0, v0, v0
295	eor.16b	v1, v1, v1
296	eor.16b	v2, v2, v2
297
298	// load the 1st 64 bytes (16 32-bit words)
299	ld1.4s	{v4,v5,v6,v7},[data],#64
300
301	// branch to finish off if mlen<64
302	subs	mlen, mlen, #64
303	b.lt	L64_finishup
304
305	/*
306	 * loop for loading and accumulating 16 32-bit words into
307	 * 8 8-byte accumulators per iteration.
308	 */
309L64_loop:
310	subs        mlen, mlen, #64             // mlen -= 64
311
312	uadalp.2d   v0, v4
313	ld1.4s      {v4},[data], #16
314
315	uadalp.2d   v1, v5
316	ld1.4s      {v5},[data], #16
317
318	uadalp.2d   v2, v6
319	ld1.4s      {v6},[data], #16
320
321	uadalp.2d   v3, v7
322	ld1.4s      {v7},[data], #16
323
324	b.ge        L64_loop
325
326L64_finishup:
327	uadalp.2d   v0, v4
328	uadalp.2d   v1, v5
329	uadalp.2d   v2, v6
330	uadalp.2d   v3, v7
331
332	add.2d      v0, v0, v1
333	add.2d      v2, v2, v3
334	addp.2d     d0, v0
335	addp.2d     d2, v2
336	add.2d      v0, v0, v2
337	fmov        x7, d0			// partial in x7 now
338
339	// restore used vector registers
340	ld1.4s      {v0, v1, v2, v3}, [sp], #4*16
341	ld1.4s      {v4, v5, v6, v7}, [sp], #4*16
342
343L32_bytes:
344	tst     mlen, #32
345	b.eq    L16_bytes
346	ldp	x9, x10, [data], #16
347	ldp	x11, x12, [data], #16
348	adds	x7, x7, x9
349	mov	x9, #0
350	adcs	x7, x7, x10
351	adcs	x7, x7, x11
352	adcs	x7, x7, x12
353	adc	x7, x7, x9
354
355L16_bytes:
356	tst	mlen, #16
357	b.eq	L8_bytes
358	ldp	x9, x10, [data], #16
359	adds	x7, x7, x9
360	mov	x9, #0
361	adcs	x7, x7, x10
362	adc	x7, x7, x9
363
364L8_bytes:
365	tst     mlen, #8
366	mov	x10, #0
367	b.eq    L4_bytes
368	ldr	x9,[data],#8
369	adds	x7, x7, x9
370	adc	x7, x7, x10
371
372L4_bytes:
373	tst     mlen, #4
374	b.eq    L2_bytes
375	ldr	w9,[data],#4
376	adds	x7, x7, x9
377	adc	x7, x7, x10
378
379L2_bytes:
380	tst	mlen, #2
381	b.eq	L_trailing_bytes
382	ldrh	w9,[data],#2
383	adds	x7, x7, x9
384	adc	x7, x7, x10
385
386L_trailing_bytes:
387	tst     mlen, #1
388	b.eq    L0_bytes
389	ldrb	w9,[data],#1
390	adds	x7, x7, x9
391	adc	x7, x7, x10
392	eor	started_on_odd, started_on_odd, #1
393
394L0_bytes:
395/*
396 *		if (needs_swap)
397 *			partial = (partial << 8) + (partial >> 56);
398 */
399	cbz	needs_swap, 1f
400	ror	x7, x7, #56
4011:
402/*
403 *		sum += (partial >> 32) + (partial & 0xffffffff);
404 *		sum = (sum >> 32) + (sum & 0xffffffff);
405 *	}
406 */
407
408	add	x3, x3, x7, lsr #32
409	mov	w7, w7
410	add	x3, x3, x7
411	mov	w7, w3
412	add	x3, x7, x3, lsr #32
413
414L_continue:
415	cmp	len, #0
416	ldr     ptr_m, [m, #M_NEXT]			// m = m->m_next
417	b.gt	L_loop
418
419/*
420 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
421 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
422 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
423 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
424 *	return (final_acc & 0xffff);
425 * }
426 */
427
428	mov	w4, #0x00ffff
429	and	x0, x4, x3, lsr #48
430	and	x1, x4, x3, lsr #32
431	and	x2, x4, x3, lsr #16
432	and	x3, x4, x3
433	add	w0, w0, w1
434	add	w2, w2, w3
435	add	w0, w0, w2
436	and	w1, w4, w0, lsr #16
437	and	w0, w4, w0
438	add	w0, w0, w1
439	and	w1, w4, w0, lsr #16
440	and	w0, w4, w0
441	add	w0, w0, w1
442	/*
443	 * If we were to 1's complement it (XOR with 0xffff):
444	 *
445	 * eor    	w0, w0, w4
446	 */
447	and	w0, w0, w4
448
449	ret	lr
450
451Lin_cksum_whoops:
452	adrp	x0, Lin_cksum_whoops_str@page
453	add	x0, x0, Lin_cksum_whoops_str@pageoff
454	bl	#CKSUM_ERR
455	mov	x0, #-1
456	ret	lr
457
458Lin_cksum_whoops_str:
459	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
460	.align	5
461