/* xref: /xnu-11417.140.69/bsd/dev/arm64/cpu_copy_in_cksum.s (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4) */
/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef KERNEL
#include <arm64/asm.h>
#endif /* KERNEL */

/*
 *  extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *      uint32_t len, uint32_t sum0);
 *
 *  input :
 *      src : source starting address
 *      dst : destination starting address
 *      len : byte stream length
 *      sum0 : initial 32-bit sum
 *
 *  output :
 *      the source byte stream is copied into the destination buffer
 *      the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define	SAVE_REGISTERS	0

/*
 * uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0)
 *
 * Copy len bytes from src to dst while accumulating the 16-bit one's
 * complement partial checksum of the stream (returned un-complemented
 * in a 32-bit value, with sum0 folded in).  Bulk data is processed with
 * NEON: v0-v3 always, plus v4-v7/v16-v19 when len >= 128.
 */
	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

/* symbolic register names (AAPCS64: x0-x3 carry the four arguments) */
#define	src		x0
#define	dst		x1
#define	len		x2
#define	sum		x3
#define need_swap	x5
#define t		x6
#define partial		x7
#define wpartial	w7

#ifdef KERNEL
	ARM64_PROLOG
#endif /* KERNEL */
	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// needs_swap = 0;

	cbz	len, L_len_0		// nothing to copy or sum

/*
 * Deal with odd-addressed byte, use w7 to store temporary sum, deposit this
 * byte to high byte of 16-bit in w7
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;
 *		*dst++ = *src++;
 *		--len;
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1		// final sum must be byte-swapped
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we will always use v0-v3, and v4-v7/v16-v19 if len>=128
	 * so allocate 12*16 bytes in the stack, and store v0-v3 now,
	 * keep x11 as the pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif

	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes, try
	 * 4*16 bytes next.
	 * v0,v1 will store temp result after we exit the L128 loop
	 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to 1st 64b lane in v0
	b.lt	L64_bytes

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load 1st 4 vectors, and clear v0-v3
	 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to finish off if len<128 */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into 8 8-byte
	 * accumulators per iteration
	 */
L128_loop:
	str		q4, [dst], #16*8
	uadalp.2d	v0, v4
	str		q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr		q4, [src], #16*8
	ldr		q5, [src, #-7*16]

	str		q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str		q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr		q6, [src, #-6*16]
	ldr		q7, [src, #-5*16]

	str		q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str		q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr		q16, [src, #-4*16]
	ldr		q17, [src, #-3*16]

	str		q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str		q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr		q18, [src, #-2*16]
	ldr		q19, [src, #-1*16]

	subs		len, len, #8*16
	b.ge		L128_loop

L128_finishup:
	/* store and accumulate the final pre-loaded 8 vectors */
	str		q4, [dst], #16*8
	uadalp.2d	v0, v4
	str		q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str		q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str		q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str		q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str		q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str		q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str		q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add		len, len, #8*16	// undo the loop's pre-decrement

	add.2d		v0, v0, v2
	add.2d		v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add		x11, sp, #4*16
	ld1.4s		{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s		{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp		len, #4*16
	b.lt		L32_bytes

	ldr		q2, [src], #4*16
	ldr		q3, [src, #-3*16]
	str		q2, [dst], #4*16
	uadalp.2d	v0, v2
	str		q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr		q2, [src, #-2*16]
	ldr		q3, [src, #-1*16]
	str		q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str		q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub		len, len, #4*16

L32_bytes:
	cmp		len, #2*16
	b.lt		L16_bytes
	ldr		q2, [src], #2*16
	ldr		q3, [src, #-1*16]
	str		q2, [dst], #2*16
	uadalp.2d	v0, v2
	str		q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub		len, len, #2*16

L16_bytes:
	add.2d		v0, v0, v1
	cmp		len, #16
	b.lt		L8_bytes
	ldr		q2, [src], #16
	str		q2, [dst], #16
	uadalp.2d	v0, v2
	sub		len, len, #16

L8_bytes:
	/* at most 15 bytes remain; stage them in cleared v1/v2/v3 */
	eor.16b		v1, v1, v1
	eor.16b		v2, v2, v2
	eor.16b		v3, v3, v3

	tst		len, #8
	b.eq		L4_bytes
	ldr		d1,[src],#8
	str		d1,[dst],#8

L4_bytes:
	tst		len, #4
	b.eq		L2_bytes
	ldr		s2,[src],#4
	str		s2,[dst],#4

L2_bytes:
	uadalp.2d	v0, v1
	eor.16b		v1, v1, v1
	tst		len, #2
	b.eq		L_trailing_bytes
	ldr		h3,[src],#2
	str		h3,[dst],#2

L_trailing_bytes:
	tst		len, #1
	b.eq		L0_bytes
	ldr		b1,[src],#1
	str		b1,[dst],#1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h		v1, v1, #8	// partial <<= 8;
#endif

L0_bytes:
	/* fold all vector accumulators into v0, then into one 64-bit sum */
	uadalp.2d	v2, v3
	uadalp.2d	v0, v1
	uadalp.2d	v0, v2

	addp.2d		d0, v0
	fmov		partial, d0

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * return (~final_acc & 0xffff);
	 *
	 * mvn	w0, w0
	 * and	w0, w0, #0xffff
	 */

	ret	lr

355