/* xref: /xnu-8020.140.41/bsd/dev/arm/cpu_copy_in_cksum.s (revision 27b03b360a988dfd3dfdf34262bb0042026747cc) */
1/*
2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 *  extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
31 *      uint32_t len, uint32_t sum0);
32 *
33 *  input :
34 *      src : source starting address
35 *      dst : destination starting address
36 *      len : byte stream length
37 *      sum0 : initial 32-bit sum
38 *
39 *  output :
40 *      the source byte stream is copied into the destination buffer
41 *      the function returns the partial 16-bit checksum accumulated
42 *	in a 32-bit variable (without 1's complement); caller is
43 *	responsible for folding the 32-bit sum into 16-bit and
44 *	performing the 1's complement if applicable
45 */
46
47/*
48 * the following definitions default the implementation to little-endian
49 * architectures
50 */
51#define LITTLE_ENDIAN	1
52#define BYTE_ORDER	LITTLE_ENDIAN
53
54/*
55 * renaming registers to ease code porting from arm64
56 */
57#define v0	q0
58#define v1	q1
59#define v2	q2
60#define v3	q3
61#define v8	q8
62#define v9	q9
63#define v10	q10
64#define v11	q11
65#define v12	q12
66#define v13	q13
67#define v14	q14
68#define v15	q15
69
70	.syntax	unified
71	.align	2
72	.code	16
73	.thumb_func _os_cpu_copy_in_cksum
74	.text
75
76	.globl	_os_cpu_copy_in_cksum
77_os_cpu_copy_in_cksum:
78
79#define	src		r0
80#define	dst		r1
81#define	len		r2
82#define	sum		r3
83#define need_swap	r4
84#define partial		r5
85#define t		r12
86
87	push	{r4,r5,r7,lr}
88	add	r7, sp, #8	/* set up base pointer for debug tracing */
89
90	cmp	len, #0
91	mov	partial, #0	/* partial = 0; */
92	mov	need_swap, #0	/* needs_swap = 0; */
93
94	cbnz	len, 0f
95	b	L_len_0
960:
97
98/*
99 * Deal with odd-addressed byte, use w7 to store temporary sum, deposit this
100 * byte to high byte of 16-bit in w7
101 *
102 *	t = 0;
103 *	if ((uintptr_t)src & 1) {
104 *		t = *src << 8;
105 *		*dst++ = *src++;
106 *		--len;
107 *	}
108*/
109	tst	src, #1
110	beq	1f
111	ldrb	partial, [src]
112	add	src, src, #1
113	strb	partial, [dst], #1
114#if BYTE_ORDER == LITTLE_ENDIAN
115	lsl	partial, partial, #8
116#endif
117	subs	len, len, #1
118	mov	need_swap, #1
119	beq	L_len_0
1201:
121
122#ifdef KERNEL
123	vpush	{v8-v15}
124	vpush	{v0-v3}
125#endif
126
127	/*
128	 * pre-decrement len by 8*16, and if less tha 8*16 bytes, try
129	 * 4*16 bytes next.
130	 * v0,v1 will store temp result after we exit the L128 loop
131	 */
132	veor	v0, v0, v0
133	veor	v1, v1, v1
134	cmp	len, #8*16
135	vmov	s0, partial	/* move partial to 1st 64b lane in v0 */
136	blt	L64_bytes
137
138	/*
139	 * accumulate 8 x 2 x 16-bit pairs into 16 lanes in v0-v3
140	 * branch to finish off if len<128
141	 */
142	vld1.8	{q8,q9}, [src]!
143	veor	v2, v2, v2
144	vld1.8	{q10,q11}, [src]!
145	veor	v3, v3, v3
146	vld1.8	{q12,q13}, [src]!
147	subs	len, len, #2*8*16
148	vld1.8	{q14,q15}, [src]!
149	blt	L128_finishup
150
151	/*
152	 * loop for loading and accumulating 16 32-bit words nto 8 8-byte
153	 * accumulators per iteration
154	 */
155L128_loop:
156	vpadal.u16	v0, v8
157	vst1.8		{q8,q9}, [dst]!
158	vpadal.u16	v1, v9
159	vld1.8		{q8,q9}, [src]!
160
161	vpadal.u16	v2, v10
162	vst1.8		{q10,q11}, [dst]!
163	vpadal.u16	v3, v11
164	vld1.8		{q10,q11}, [src]!
165
166	vpadal.u16	v0, v12
167	vst1.8		{q12,q13}, [dst]!
168	vpadal.u16	v1, v13
169	vld1.8		{q12,q13}, [src]!
170
171	vpadal.u16	v2, v14
172	vst1.8		{q14,q15}, [dst]!
173	vpadal.u16	v3, v15
174	vld1.8		{q14,q15}, [src]!
175
176	subs		len, len, #8*16
177	bge		L128_loop
178
179L128_finishup:
180	vpadal.u16	v0, v8
181	vst1.8		{q8,q9}, [dst]!
182	vpadal.u16	v1, v9
183
184	vpadal.u16	v2, v10
185	vst1.8		{q10,q11}, [dst]!
186	vpadal.u16	v3, v11
187
188	vpadal.u16	v0, v12
189	vst1.8		{q12,q13}, [dst]!
190	vpadal.u16	v1, v13
191
192	vpadal.u16	v2, v14
193	vst1.8		{q14,q15}, [dst]!
194	vpadal.u16	v3, v15
195
196	add		len, len, #8*16
197
198	vadd.i32	v0, v0, v2
199	vadd.i32	v1, v1, v3
200
201L64_bytes:
202	cmp		len, #4*16
203	blt		L32_bytes
204
205	vld1.8		{q8,q9}, [src]!
206	vld1.8		{q10,q11}, [src]!
207
208	vpadal.u16	v0, v8
209	vst1.8		{q8,q9}, [dst]!
210	vpadal.u16	v1, v9
211
212	vpadal.u16	v0, v10
213	vst1.8		{q10,q11}, [dst]!
214	vpadal.u16	v1, v11
215
216	sub		len, len, #4*16
217
218L32_bytes:
219	cmp		len, #2*16
220	blt		L16_bytes
221
222	vld1.8		{q8,q9}, [src]!
223
224	vpadal.u16	v0, v8
225	vst1.8		{q8,q9}, [dst]!
226	vpadal.u16	v1, v9
227
228	sub		len, len, #2*16
229
230L16_bytes:
231	vadd.i32	v0, v0, v1
232
233	cmp		len, #16
234	blt		L8_bytes
235	vld1.8		{q8}, [src]!
236	vpadal.u16	v0, v8
237	vst1.8		{q8}, [dst]!
238
239	sub		len, len, #16
240
241L8_bytes:
242	veor		v1, v1, v1
243	tst		len, #8
244	beq		L4_bytes
245	vld1.8		{d2}, [src]!
246	vst1.8		{d2}, [dst]!
247	vpadal.u16	v0, v1
248
249L4_bytes:
250	ands		len, len, #7
251	vpadd.i32	d0, d0, d1
252	vpadd.i32	d0, d0, d1
253	vmov		partial, s0
254
255#ifdef KERNEL
256	vpop	{q0-q1}
257	vpop	{q2-q3}
258	vpop	{q8-q9}
259	vpop	{q10-q11}
260	vpop	{q12-q13}
261	vpop	{q14-q15}
262#endif
263
264	beq	L_len_0
265
266	subs	len, len, #2
267	blt	L_trailing_bytes
268
269L2_bytes:
270	ldrh	t, [src], #2
271	strh	t, [dst], #2
272	add	partial, partial, t
273	subs	len, len, #2
274	bge	L2_bytes
275
276L_trailing_bytes:
277	tst	len, #1
278	beq	L_len_0
279	ldrb	t,[src],#1
280	strb	t,[dst],#1
281#if BYTE_ORDER != LITTLE_ENDIAN
282	lsl	t, t, #8
283#endif
284	add	partial, partial, t
285
286L_len_0:
287	/*
288	 * if (needs_swap)
289	 *	partial = (partial << 8) + (partial >> 24);
290	 */
291	cbz	need_swap, 1f
292	lsl	t, partial, #8
293	add	partial, t, partial, lsr #24
2941:
295	movw	lr, #0xffff
296
297	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
298	and	r0, sum, lr
299	add	r0, r0, sum, lsr #16
300
301	/* final_acc += (partial >> 16) + (partial & 0xffff); */
302	add	r0, r0, partial, lsr #16
303	and	partial, partial, lr
304	add	r0, r0, partial
305
306	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
307	and	t, r0, lr
308	add	r0, t, r0, lsr #16
309
310	/*
311	 * One final fold in case of carry from the previous one.
312	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
313	 */
314	and	t, r0, lr
315	add	r0, t, r0, lsr #16
316
317	/*
318	 * return (~final_acc & 0xffff);
319	 *
320	 * mvn	r0, r0
321	 * and	r0, r0, lr
322	 */
323
324	pop	{r4,r5,r7,pc}
325