/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO.  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#define	CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: [email protected]:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 */
#if defined(__LP64__)
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte address, would be aligned to an 8-byte boundary
#define	M_LEN	24
#else
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12
#endif
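
/*
 * For illustration, a minimal "mbuf-like" C layout consistent with the
 * offsets above (a sketch only; field names besides m_next/m_data/m_len are
 * placeholders, and the real struct mbuf carries many more fields):
 *
 *	struct mbuf_like {
 *		struct mbuf_like *m_next;	// M_NEXT
 *		struct mbuf_like *m_nextpkt;	// second pointer, pads up to M_DATA
 *		uint8_t		 *m_data;	// M_DATA
 *		int32_t		  m_len;	// M_LEN
 *	};
 */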

	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:


/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
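
/*
 * Caller-side sketch (hypothetical usage, not defined in this file) of the
 * folding and 1's complement step described above:
 *
 *	uint32_t partial = os_cpu_in_cksum_mbuf(m, len, off, 0);
 *	partial = (partial >> 16) + (partial & 0xffff);	// fold carries into 17 bits
 *	partial = (partial >> 16) + (partial & 0xffff);	// fold the remaining carry
 *	uint16_t cksum = ~partial & 0xffff;		// 1's complement, if applicable
 */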

/*
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 * 	int mlen;
 * 	uint64_t sum, partial;
 * 	unsigned int final_acc;
 * 	uint8_t *data;
 * 	boolean_t needs_swap, started_on_odd;
 *
 * 	VERIFY(len >= 0);
 * 	VERIFY(off >= 0);
 *
 * 	needs_swap = FALSE;
 * 	started_on_odd = FALSE;
 * 	sum = initial_sum;
 */

	#define	m		x0
	#define	len		x1
	#define	off		x2
	#define	sum		x3
	#define	needs_swap	x4
	#define	started_on_odd	x5
	#define	mlen		x6
	#define	Wmlen		w6
	#define	t		x7
	#define	data		x8
#if defined(__LP64__)
	#define	ptr_m		x0
	#define	ptr_data	x8
#else
	#define	ptr_m		w0
	#define	ptr_data	w8
#endif
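	// Arguments arrive per the arm64 calling convention: x0 = m, x1 = len,
	// x2 = off, x3 = initial_sum; x4-x8 serve as scratch for the locals.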


	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;
	mov	w3, w3			// zero-extend initial_sum (clear upper 32 bits of x3)


/*
 *	for (;;) {
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return (-1);
 *		}
 *		mlen = m->m_len;
 *		if (mlen > off) {
 *			mlen -= off;
 *			data = mtod(m, uint8_t *) + off;
 *			goto post_initial_offset;
 *		}
 *		off -= mlen;
 *		if (len == 0)
 *			break;
 *		m = m->m_next;
 *	}
 */

0:
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	cmp	mlen, off
	b.le	1f
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
1:
	sub	off, off, mlen
	cbnz	len, 2f
	mov	x0, x3
	ret	lr
2:
	ldr	ptr_m, [m, #M_NEXT]
	b	0b

L_loop:	// for (; len > 0; m = m->m_next) {
/*
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return (-1);
 *		}
 *		mlen = m->m_len;
 *		data = mtod(m, uint8_t *);
 */
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *		if (mlen == 0) continue;
 *		if (mlen > len) mlen = len;
 *		len -= mlen;
 */

	cbz	mlen, L_continue
	cmp	mlen, len
	csel	mlen, mlen, len, le
	sub	len, len, mlen

/*
 *		partial = 0;
 *		if ((uintptr_t)data & 1) {
 *			started_on_odd = !started_on_odd;
 *			partial = *data << 8;
 *			++data;
 *			--mlen;
 *		}
 *		needs_swap = started_on_odd;
 */

	tst	data, #1
	mov	x7, #0
	mov	x10, #0
	b.eq	1f
	ldrb	w7, [data], #1
	eor	started_on_odd, started_on_odd, #1
	sub	mlen, mlen, #1
	lsl	w7, w7, #8
1:


/*
 *		if ((uintptr_t)data & 2) {
 *			if (mlen < 2)
 *				goto trailing_bytes;
 *			partial += *(uint16_t *)(void *)data;
 *			data += 2;
 *			mlen -= 2;
 *		}
 */
	tst	data, #2
	mov	needs_swap, started_on_odd
	b.eq	1f
	cmp	mlen, #2
	b.lt	L_trailing_bytes
	ldrh	w9, [data], #2
	sub	mlen, mlen, #2
	add	w7, w7, w9
1:

/*
 *		if ((uintptr_t)data & 4) {
 *			if (mlen < 4)
 *				goto L2_bytes;
 *			partial += *(uint32_t *)(void *)data;
 *			data += 4;
 *			mlen -= 4;
 *		}
 */
	// align on an 8-byte boundary if applicable
	tst	data, #4
	b.eq	1f
	cmp	mlen, #4
	b.lt	L2_bytes
	ldr	w9, [data], #4
	sub	mlen, mlen, #4
	adds	w7, w7, w9
	adc	x7, x7, x10		// assumes x10 is still #0, as set above
1:

/*
 *		while (mlen >= 64) {
 *			__builtin_prefetch(data + 32);
 *			__builtin_prefetch(data + 64);
 *			partial += *(uint32_t *)(void *)data;
 *			partial += *(uint32_t *)(void *)(data + 4);
 *			partial += *(uint32_t *)(void *)(data + 8);
 *			partial += *(uint32_t *)(void *)(data + 12);
 *			partial += *(uint32_t *)(void *)(data + 16);
 *			partial += *(uint32_t *)(void *)(data + 20);
 *			partial += *(uint32_t *)(void *)(data + 24);
 *			partial += *(uint32_t *)(void *)(data + 28);
 *			partial += *(uint32_t *)(void *)(data + 32);
 *			partial += *(uint32_t *)(void *)(data + 36);
 *			partial += *(uint32_t *)(void *)(data + 40);
 *			partial += *(uint32_t *)(void *)(data + 44);
 *			partial += *(uint32_t *)(void *)(data + 48);
 *			partial += *(uint32_t *)(void *)(data + 52);
 *			partial += *(uint32_t *)(void *)(data + 56);
 *			partial += *(uint32_t *)(void *)(data + 60);
 *			data += 64;
 *			mlen -= 64;
 *		//	if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//		if (needs_swap)
 *		//			partial = (partial << 8) +
 *		//			    (partial >> 56);
 *		//		sum += (partial >> 32);
 *		//		sum += (partial & 0xffffffff);
 *		//		partial = 0;
 *		//	}
 *		}
 */

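	// The NEON code below implements the loop above: each iteration loads
	// 64 bytes (16 32-bit words) and pairwise-accumulates them into eight
	// 64-bit lanes spread across v0-v3, so the commented-out overflow check
	// sketched above is not needed for any mlen representable in 32 bits.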
	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
	subs	mlen, mlen, #64
	b.lt	L32_bytes

	// save used vector registers
	sub	sp, sp, #8*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16

	// spread partial into 8 8-byte accumulators in v0-v3 (two 64-bit lanes each)
	fmov	s3, w7
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2

	// load the 1st 64 bytes (16 32-bit words)
	ld1.4s	{v4,v5,v6,v7},[data],#64

	// branch to finish off if mlen<64
	subs	mlen, mlen, #64
	b.lt	L64_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into
	 * 8 8-byte accumulators per iteration.
	 */
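	// uadalp.2d vD, vS adds each adjacent pair of unsigned 32-bit lanes of
	// vS and accumulates the two 64-bit sums into the lanes of vD.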
L64_loop:
	subs        mlen, mlen, #64             // mlen -= 64

	uadalp.2d   v0, v4
	ld1.4s      {v4},[data], #16

	uadalp.2d   v1, v5
	ld1.4s      {v5},[data], #16

	uadalp.2d   v2, v6
	ld1.4s      {v6},[data], #16

	uadalp.2d   v3, v7
	ld1.4s      {v7},[data], #16

	b.ge        L64_loop

L64_finishup:
	uadalp.2d   v0, v4
	uadalp.2d   v1, v5
	uadalp.2d   v2, v6
	uadalp.2d   v3, v7

	add.2d      v0, v0, v1
	add.2d      v2, v2, v3
	addp.2d     d0, v0
	addp.2d     d2, v2
	add.2d      v0, v0, v2
	fmov        x7, d0			// partial in x7 now

	// restore used vector registers
	ld1.4s      {v0, v1, v2, v3}, [sp], #4*16
	ld1.4s      {v4, v5, v6, v7}, [sp], #4*16

L32_bytes:
	tst	mlen, #32
	b.eq	L16_bytes
	ldp	x9, x10, [data], #16
	ldp	x11, x12, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adcs	x7, x7, x11
	adcs	x7, x7, x12
	adc	x7, x7, x9

L16_bytes:
	tst	mlen, #16
	b.eq	L8_bytes
	ldp	x9, x10, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adc	x7, x7, x9

L8_bytes:
	tst	mlen, #8
	mov	x10, #0
	b.eq	L4_bytes
	ldr	x9, [data], #8
	adds	x7, x7, x9
	adc	x7, x7, x10

L4_bytes:
	tst	mlen, #4
	b.eq	L2_bytes
	ldr	w9, [data], #4
	adds	x7, x7, x9
	adc	x7, x7, x10

L2_bytes:
	tst	mlen, #2
	b.eq	L_trailing_bytes
	ldrh	w9, [data], #2
	adds	x7, x7, x9
	adc	x7, x7, x10

L_trailing_bytes:
	tst	mlen, #1
	b.eq	L0_bytes
	ldrb	w9, [data], #1
	adds	x7, x7, x9
	adc	x7, x7, x10
	eor	started_on_odd, started_on_odd, #1

L0_bytes:
/*
 *		if (needs_swap)
 *			partial = (partial << 8) + (partial >> 56);
 */
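	// The ror below rotates right by 56, i.e. left by 8, which computes
	// (partial << 8) + (partial >> 56) in a single instruction.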
	cbz	needs_swap, 1f
	ror	x7, x7, #56
1:
/*
 *		sum += (partial >> 32) + (partial & 0xffffffff);
 *		sum = (sum >> 32) + (sum & 0xffffffff);
 *	}
 */

	add	x3, x3, x7, lsr #32
	mov	w7, w7
	add	x3, x3, x7
	mov	w7, w3
	add	x3, x7, x3, lsr #32

L_continue:
	cmp	len, #0
	ldr	ptr_m, [m, #M_NEXT]		// m = m->m_next
	b.gt	L_loop

/*
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
 * }
 */

	mov	w4, #0x00ffff
	and	x0, x4, x3, lsr #48
	and	x1, x4, x3, lsr #32
	and	x2, x4, x3, lsr #16
	and	x3, x4, x3
	add	w0, w0, w1
	add	w2, w2, w3
	add	w0, w0, w2
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor	w0, w0, w4
	 */
	and	w0, w0, w4

	ret	lr

Lin_cksum_whoops:
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR
	mov	x0, #-1
	ret	lr

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5