xref: /xnu-8020.140.41/bsd/dev/arm/cpu_in_cksum.s (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1/*
2 * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*	$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $	*/
30
31/*
32 * Copyright 2003 Wasabi Systems, Inc.
33 * All rights reserved.
34 *
35 * Written by Steve C. Woodford for Wasabi Systems, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *      This product includes software developed for the NetBSD Project by
48 *      Wasabi Systems, Inc.
49 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
50 *    or promote products derived from this software without specific prior
51 *    written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
56 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
57 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
58 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
59 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
60 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
61 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
62 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
63 * POSSIBILITY OF SUCH DAMAGE.
64 */
65
66#ifdef KERNEL	/* kernel build */
67#include "../../../osfmk/arm/arch.h"
68#include "../../../osfmk/arm/proc_reg.h"
69
70#if __ARM_VFP__ < 3	/* refuse kernel builds below VFP level 3, the NEON path further down is compiled for level 3 and up */
71#error "Unsupported: __ARM_VFP__ < 3"
72#endif /* __ARM_VFP__ < 3 */
73#define	CKSUM_ERR _kprintf	/* routine called with the message on the out-of-data error path */
74#else /* !KERNEL */
75#ifndef LIBSYSCALL_INTERFACE	/* the only non-kernel configuration accepted */
76#error "LIBSYSCALL_INTERFACE not defined"
77#endif /* !LIBSYSCALL_INTERFACE */
78#define	CKSUM_ERR _fprintf_stderr	/* non-kernel counterpart of the error reporter */
79#define	__ARM_VFP__	3	/* non-kernel builds always compile the NEON path */
80#endif /* !KERNEL */
81
82/*
83 * The following default the implementation to little-endian architectures.
84 */
85#define	LITTLE_ENDIAN	1
86#define	BYTE_ORDER	LITTLE_ENDIAN	/* tested by the BYTE_ORDER conditionals around the byte-combining code */
87	/* The conditional mnemonics in this file (ldrbge, ldrbgt, ...) are written in unified form */
88.syntax unified
89
90/*
91 * XXX: [email protected]:
92 *
93 * Ugly, but we have little choice, since relying on genassym and <assym.s>
94 * is not possible unless this code lives in osfmk.  Note also that this
95 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
96 * authentic; it only cares about 3 fields.
97 */
98#define	M_NEXT	0	/* byte offset of the next-mbuf pointer, loaded into ip. NOTE(review): all three offsets are hard-coded, confirm against the real mbuf layout */
99#define	M_DATA	8	/* byte offset of the data pointer, loaded into r0 */
100#define	M_LEN	12	/* byte offset of the length field, loaded into r1 as a 32-bit word */
101
102/*
103 * APPLE MODIFICATION
104 *
105 * The use of R7 in this code as data register prevents
106 * the use of debugging or instrumentation tools, which is an acceptable
107 * tradeoff considering the potential gain in performance.
108 */
109
110/*
111 * Hand-optimised implementations for ARM/Xscale
112 */
113
114	.macro EnableVFP	/* kernel builds only: call _enable_kernel_vfp_context with r0-r2 and r12 preserved */
115#if defined(KERNEL)
116	stmfd	sp!, {r0-r2, r12}	/* r3 and lr are not saved here, the call may change them */
117	bl	_enable_kernel_vfp_context
118	ldmfd	sp!, {r0-r2, r12}
119#endif /* defined(KERNEL) */
120	.endm
121
122
123/*
124 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
125 *     uint32_t initial_sum);
126 *
127 * Entry:
128 *	r0	m
129 *	r1	len
130 *	r2	off
131 *	r3	initial_sum
132 *
133 * Function wide register usage
134 *	r8	accumulated sum
135 *	r9	remaining length to parse
136 *	ip	pointer to next mbuf
137 *
138 * This function returns the partial 16-bit checksum accumulated in
139 * a 32-bit variable (without 1's complement); caller is responsible
140 * for folding the 32-bit sum into 16-bit and performing the 1's
141 * complement if applicable
142 */
143	.globl	_os_cpu_in_cksum_mbuf
144	.text
145	.align	4
146_os_cpu_in_cksum_mbuf:
147	stmfd	sp!, {r4-r11,lr}	/* save r4-r11 and lr, each exit pops them back with lr going into pc */
148
149	mov	r8, r3			/* Accumulate sum in r8 */
150	mov	r9, r1			/* save len in r9 */
151	mov	ip, r0			/* set ip to the current mbuf */
152
153	cmp	r9, #0			/* length is 0? */
154	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
155	mov	r0, r8			/* otherwise, return initial sum */
156	/* Note: this early return hands back initial_sum as is, without the 16-bit fold done on the normal exit */
157	ldmfd	sp!, {r4-r11, pc}
158	/* Skip phase: walk the chain until the mbuf that contains byte offset r2 is found */
159.Lin_cksum_skip_loop:
160	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf */
161	ldr	r0, [ip, #(M_DATA)]	/* r0 = data pointer of this mbuf */
162	ldr	ip, [ip, #(M_NEXT)]	/* ip = next mbuf, loaded last because ip is the base register */
163.Lin_cksum_skip_entry:
164	subs	r2, r2, r1		/* offset = offset - mbuf length */
165	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
166	cmp	ip, #0x00		/* offset not reached yet, is there another mbuf? */
167	bne	.Lin_cksum_skip_loop	/* yes, keep skipping */
168	b	.Lin_cksum_whoops	/* chain ended before the offset was reached */
169
170.Lin_cksum_skip_done:
171	add	r0, r2, r0		/* data += offset (offset is < 0) */
172	add	r0, r0, r1		/* data += length of mbuf */
173					/* data == start of data to cksum */
174	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
175	mov	r10, #0x00		/* r10 = number of bytes summed so far */
176	b	.Lin_cksum_entry	/* this mbuf is already loaded, skip the loads at .Lin_cksum_loop */
177
178.Lin_cksum_loop:
179	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf */
180	ldr	r0, [ip, #(M_DATA)]	/* r0 = data pointer of this mbuf */
181	ldr	ip, [ip, #(M_NEXT)]	/* ip = next mbuf, tested at .Lin_cksum_next */
182.Lin_cksum_entry:
183	cmp	r9, r1			/* signed compare of remaining length and chunk length */
184	movlt	r1, r9			/* r1 = min(remaining length, chunk length) */
185	sub	r9, r9, r1		/* remaining length -= bytes taken from this mbuf */
186	eor	r11, r10, r0		/* bit 0 = parity of bytes summed so far xor parity of the start address, tested at .Lin_cksum_next */
187	add	r10, r10, r1		/* bytes summed so far += this chunk */
188	adds	r2, r1, #0x00		/* r2 = r1 and set Z when the chunk is empty */
189
190	beq	.Lin_cksum_next		/* empty chunk: r2 is 0, so nothing gets added */
191
192/*
193 * APPLE MODIFICATION
194 *
195 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
196 * inline. This results in slightly faster code, and also permits the whole
197 * function to be included in kernel profiling data.
198 */
199
200/*
201 * The main in*_cksum() workhorse...
202 *
203 * Entry parameters:
204 *	r0	Pointer to buffer
205 *	r1	Buffer length
206 *	lr	Return address (not used in this inlined copy; every exit goes to .Lin_cksum_next)
207 *
208 * Returns:
209 *	r2	Accumulated 32-bit sum
210 *
211 * Clobbers:
212 *	r0-r7
213 */
214	mov	r2, #0			/* r2 = sum for this chunk, starts at zero */
215
216	/* We first have to word-align the buffer.  */
217	ands	r7, r0, #0x03		/* r7 = address modulo 4 */
218	beq	.Lcksumdata_wordaligned
219	rsb	r7, r7, #0x04		/* r7 = 1..3 bytes needed to reach a word boundary */
220	cmp	r1, r7			/* Enough bytes left to make it? */
221	blt	.Lcksumdata_endgame	/* no: the short chunk is summed bytewise in the endgame */
222	cmp	r7, #0x02		/* these flags drive every conditional instruction down to the subs below */
223	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
224	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
225	movlt	r5, #0x00		/* only one byte needed: 2nd byte is zero */
226	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
227	movle	r6, #0x00		/* fewer than three bytes needed: 3rd byte is zero */
228	/* Combine the three bytes depending on endianness and alignment */
229#if BYTE_ORDER != LITTLE_ENDIAN
230	orreq	r2, r5, r4, lsl #8
231	orreq	r2, r2, r6, lsl #24
232	orrne	r2, r4, r5, lsl #8
233	orrne	r2, r2, r6, lsl #16
234#else
235	orreq	r2, r4, r5, lsl #8	/* eq: r7 is 2, even start address, 1st byte goes in the low byte */
236	orreq	r2, r2, r6, lsl #16
237	orrne	r2, r5, r4, lsl #8	/* ne: r7 is 1 or 3, odd start address, 1st byte goes in bits 8-15 */
238	orrne	r2, r2, r6, lsl #24
239#endif
240	subs	r1, r1, r7		/* Update length */
241	beq	.Lin_cksum_next		/* All done? */
242
243	/* Buffer is now word aligned */
244.Lcksumdata_wordaligned:
245
246#if __ARM_VFP__ >= 3
247
248	cmp		r1, #512	// do this if r1 is at least 512
249	blt		9f		// shorter chunks go straight to the integer loops after label 9
250	// In kernel builds the macro below calls _enable_kernel_vfp_context, preserving r0-r2 and r12
251	EnableVFP
252
253	and		r3, r1, #~0x3f	// r3 = byte count rounded down to a multiple of 64
254
255	vpush	{q0-q7}			// save q0-q7, restored by the vpop below
256
257	// move r2 to s16 (q4) for neon computation
258	veor        q4, q4, q4		// q4 = 0
259	vld1.32     {q0-q1}, [r0]!	// load the first 32 bytes
260	vmov        s16, r2		// low 32 bits of q4 = sum so far
261	vld1.32     {q2-q3}, [r0]!	// load the next 32 bytes
262
263	// pre-decrement size by 128 (0x80): the 64 bytes loaded above plus the 64 bytes loaded below
264	subs	r3, r3, #0x80
265
266	vpadal.u32  q4, q0		// q4 += pairwise sums of the 32-bit words of q0, held in 64-bit lanes
267	vld1.32     {q0}, [r0]!
268	vpaddl.u32  q5, q1		// q5 = pairwise sums of q1 (no accumulate, this initializes q5)
269	vld1.32     {q1}, [r0]!
270	vpaddl.u32  q6, q2		// q6 = pairwise sums of q2
271	vld1.32     {q2}, [r0]!
272	vpaddl.u32  q7, q3		// q7 = pairwise sums of q3
273	vld1.32     {q3}, [r0]!
274	// Main loop: accumulate the 64 bytes held in q0-q3 while loading the next 64
2750:
276	subs	r3, r3, #0x40		// decrement size by 64
277
278	vpadal.u32  q4, q0
279	vld1.32     {q0}, [r0]!
280	vpadal.u32  q5, q1
281	vld1.32     {q1}, [r0]!
282	vpadal.u32  q6, q2
283	vld1.32     {q2}, [r0]!
284	vpadal.u32  q7, q3
285	vld1.32     {q3}, [r0]!
286
287	bgt		0b		// uses the flags of the subs at the loop top, nothing in between sets them
288	// Accumulate the last 64 bytes that were loaded but not yet summed
289	vpadal.u32  q4, q0
290	vpadal.u32  q5, q1
291	vpadal.u32  q6, q2
292	vpadal.u32  q7, q3
293	// Merge the four accumulators into q4, then the two halves of q4 into d8
294	vpadal.u32  q4, q5
295	vpadal.u32  q6, q7
296	vpadal.u32  q4, q6
297	vadd.i64    d8, d9
298	// Fold the 64-bit total in d8 down to 32 bits by repeatedly adding its high and low words
299	vpaddl.u32  d8, d8
300	vpaddl.u32  d8, d8
301	vpaddl.u32  d8, d8
302
303	vmov    r2, s16			// r2 = folded sum (low word of d8)
304
305	vpop   {q0-q7}			// restore the registers saved by the vpush above
306
307	ands    r1, r1, #0x3f		// residual bytes
308	beq 	.Lin_cksum_next		// none left: this chunk is done
309
3109:
311
312#endif /* __ARM_VFP__ >= 3 */
313
314	subs	r1, r1, #0x40		/* bias the count by 64: negative means no full 64-byte block */
315	blt	.Lcksumdata_bigloop_end
316	/* Sum 64 bytes (16 words) per iteration. ldmia leaves the flags alone, so the carry runs through the whole adcs chain */
317.Lcksumdata_bigloop:
318	ldmia	r0!, {r3, r4, r5, r6}
319	adds	r2, r2, r3		/* start a new carry chain */
320	adcs	r2, r2, r4
321	adcs	r2, r2, r5
322	ldmia	r0!, {r3, r4, r5, r7}	/* r6 is still pending, so the 4th word goes to r7 */
323	adcs	r2, r2, r6
324	adcs	r2, r2, r3
325	adcs	r2, r2, r4
326	adcs	r2, r2, r5
327	ldmia	r0!, {r3, r4, r5, r6}	/* r7 is still pending, so the 4th word goes to r6 */
328	adcs	r2, r2, r7
329	adcs	r2, r2, r3
330	adcs	r2, r2, r4
331	adcs	r2, r2, r5
332	ldmia	r0!, {r3, r4, r5, r7}
333	adcs	r2, r2, r6
334	adcs	r2, r2, r3
335	adcs	r2, r2, r4
336	adcs	r2, r2, r5
337	adcs	r2, r2, r7
338	adc	r2, r2, #0x00		/* fold the last carry back into the sum */
339	subs	r1, r1, #0x40
340	bge	.Lcksumdata_bigloop	/* another full 64-byte block remains */
341.Lcksumdata_bigloop_end:
342
343	adds	r1, r1, #0x40		/* undo the bias: r1 = bytes left, fewer than 64 */
344	beq	.Lin_cksum_next
345
346	cmp	r1, #0x20		/* at least 32 bytes left? */
347
348	blt	.Lcksumdata_less_than_32
349	ldmia	r0!, {r3, r4, r5, r6}	/* one 32-byte (8 word) step, same carry-chain pattern as the big loop */
350	adds	r2, r2, r3
351	adcs	r2, r2, r4
352	adcs	r2, r2, r5
353	ldmia	r0!, {r3, r4, r5, r7}
354	adcs	r2, r2, r6
355	adcs	r2, r2, r3
356	adcs	r2, r2, r4
357	adcs	r2, r2, r5
358	adcs	r2, r2, r7
359	adc	r2, r2, #0x00		/* fold the last carry back into the sum */
360	subs	r1, r1, #0x20
361	beq	.Lin_cksum_next
362
363.Lcksumdata_less_than_32:
364	/* There are less than 32 bytes left */
365	and	r3, r1, #0x18		/* r3 = bytes covered by whole 8-byte groups: 0, 8, 16 or 24 */
366	rsb	r4, r3, #0x18		/* r4 = 24 - r3 = bytes of groups to skip */
367	sub	r1, r1, r3		/* r1 = 0..7 bytes left after the groups */
368	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
369	addne	pc, pc, r4		/* computed branch into the group sequence below, no branch when r4 is 0 */
370	/* r4 is now 1.5 * (24 - r3), that is 12 bytes of code per skipped 8-byte group: each group below is three instructions, and the nop pads the first group so that the fall-through case lines up as well. NOTE(review): this relies on 4-byte ARM (not Thumb) encodings with pc reading as the current instruction address + 8 - confirm the build mode. */
371/*
372 * Note: We use ldm here, even on Xscale, since the combined issue/result
373 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
374 */
375	/* At least 24 bytes remaining... */
376	ldmia	r0!, {r4, r5}
377	nop				/* padding for the computed branch above, do not remove */
378	adcs	r2, r2, r4
379	adcs	r2, r2, r5
380
381	/* At least 16 bytes remaining... */
382	ldmia	r0!, {r4, r5}
383	adcs	r2, r2, r4
384	adcs	r2, r2, r5
385
386	/* At least 8 bytes remaining... */
387	ldmia	r0!, {r4, r5}
388	adcs	r2, r2, r4
389	adcs	r2, r2, r5
390
391	/* Less than 8 bytes remaining... */
392	adc	r2, r2, #0x00		/* fold the carry of the chain above (carry is clear when every group was skipped) */
393	subs	r1, r1, #0x04		/* bias by 4: negative means no whole word left */
394	blt	.Lcksumdata_lessthan4
395
396	ldr	r4, [r0], #0x04		/* sum one more whole word */
397	sub	r1, r1, #0x04		/* keep the bias of 4 for the adds below */
398	adds	r2, r2, r4
399	adc	r2, r2, #0x00		/* end-around carry */
400
401	/* Deal with < 4 bytes remaining */
402.Lcksumdata_lessthan4:
403	adds	r1, r1, #0x04		/* undo the bias: r1 = 0..3 bytes left */
404	beq	.Lin_cksum_next
405
406	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
407.Lcksumdata_endgame:
408	ldrb	r3, [r0]		/* Fetch first byte */
409	cmp	r1, #0x02		/* these flags select which of the next four instructions run */
410	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
411	movlt	r4, #0x00		/* one byte only: 2nd byte is zero */
412	ldrbgt	r5, [r0, #0x02]
413	movle	r5, #0x00		/* fewer than three bytes: 3rd byte is zero */
414	/* Combine the three bytes depending on endianness and alignment */
415	tst	r0, #0x01		/* eq: even start address, ne: odd start address */
416#if BYTE_ORDER != LITTLE_ENDIAN
417	orreq	r3, r4, r3, lsl #8
418	orreq	r3, r3, r5, lsl #24
419	orrne	r3, r3, r4, lsl #8
420	orrne	r3, r3, r5, lsl #16
421#else
422	orreq	r3, r3, r4, lsl #8	/* even address: 1st byte stays in the low byte */
423	orreq	r3, r3, r5, lsl #16
424	orrne	r3, r4, r3, lsl #8	/* odd address: 1st byte moves to bits 8-15 */
425	orrne	r3, r3, r5, lsl #24
426#endif
427	adds	r2, r2, r3		/* add the combined tail bytes to the chunk sum */
428	adc	r2, r2, #0x00		/* end-around carry */
429
430.Lin_cksum_next:
431	tst	r11, #0x01		/* bit 0 of (bytes summed before this chunk xor chunk start address) */
432	movne	r2, r2, ror #8		/* set: rotate the chunk sum by one byte before adding it */
433	adds	r8, r8, r2		/* running total += chunk sum */
434	adc	r8, r8, #0x00		/* end-around carry */
435	cmp	ip, #00			/* more mbufs in the chain? */
436	bne	.Lin_cksum_loop		/* yes: the loop visits every remaining mbuf, using a zero length once r9 has reached 0 */
437	/* Fold the 32-bit total in r8 into 16 bits */
438	mov	r1, #0xff
439	orr	r1, r1, #0xff00		/* r1 = 0xffff */
440	and	r0, r8, r1		/* r0 = low half */
441	add	r0, r0, r8, lsr #16	/* r0 += high half */
442	add	r0, r0, r0, lsr #16	/* add back the carry out of bit 15 */
443	and	r0, r0, r1		/* keep 16 bits: this is the return value */
444	/*
445	 * If we were to 1's complement it (XOR with 0xffff):
446	 *
447	 * eor	r0, r0, r1
448	 */
449
450	ldmfd	sp!, {r4-r11, pc}
451
452.Lin_cksum_whoops:
453	adr	r0, .Lin_cksum_whoops_str	/* r0 = address of the message below */
454	bl	#CKSUM_ERR			/* report through the routine selected by the CKSUM_ERR define */
455	mov	r0, #-1				/* error return value: all bits set */
456
457	ldmfd	sp!, {r4-r11, pc}
458	/* Message for the case where the mbuf chain ends before the requested offset is reached */
459.Lin_cksum_whoops_str:
460	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
461	.align	5
462