/* xref: /xnu-8020.101.4/osfmk/arm/bcopy.s (revision e7776783b89a353188416a9a346c6cdb4928faad) */
/*
 * Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <arm/proc_reg.h>

.syntax unified
.text
.align 2

	.globl _ovbcopy
	.globl _memcpy
	.globl _bcopy
	.globl _memmove

/*
 * void   bcopy(const void *src, void *dest, size_t len);   -- alias _ovbcopy
 * void *memcpy(void *dest, const void *src, size_t len);
 * void *memmove(void *dest, const void *src, size_t len);
 *
 * ABI:      32-bit ARM AAPCS. On entry (memcpy/memmove order):
 *             r0 = dest, r1 = src, r2 = len.
 *           bcopy/ovbcopy take (src, dest, len) and fall through after
 *           swapping r0/r1.
 * Return:   r0 = original dest (restored from the stack at Lexit).
 * Clobbers: r3, r12, flags; r4-r8, r10, r11 are saved/restored as needed.
 *
 * Overlap is always handled (memcpy is implemented as memmove here):
 * forward copy when dest < src or the ranges are disjoint, reverse copy
 * when src < dest with overlap.
 */
_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
_ovbcopy:
	/* bcopy argument order is (src, dest); swap into memcpy order. */
	mov		r3, r0
	mov		r0, r1
	mov		r1, r3

_memcpy:		/* void *memcpy(void *dest, const void *src, size_t len); */
_memmove: 	/* void *memmove(void *dest, const void *src, size_t len); */
	/* check for zero len or if the pointers are the same */
	cmp		r2, #0
	cmpne	r0, r1
	bxeq	lr

	/* save r0 (return value), r4 (scratch), and r5 (scratch) */
	stmfd   sp!, { r0, r4, r5, r7, lr }
	add	r7, sp, #12			/* frame pointer -> saved {r7, lr} pair */

	/*
	 * check for overlap. r3 <- distance between src & dest.
	 * Flags are still those of "cmp r0, r1" above (the stm/add do not
	 * touch them): hs means dest >= src, lo means dest < src, so this
	 * computes r3 = |dest - src| without a branch.
	 */
	subhs	r3, r0, r1
	sublo	r3, r1, r0
	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
	blo		Loverlap

Lnormalforwardcopy:
	/* are src and dest dissimilarly word aligned? (compare low 2 bits) */
	mov		r12, r0, lsl #30
	cmp		r12, r1, lsl #30
	bne		Lnonwordaligned_forward

	/* if len < 64, do a quick forward copy */
	cmp		r2, #64
	blt		Lsmallforwardcopy

	/* check for 16 byte src/dest unalignment */
	tst		r0, #0xf
	bne		Lsimilarlyunaligned

	/* check for 32 byte dest unalignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32

Lmorethan64_aligned:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop:
	/* copy 64 bytes at a time: two 32-byte ldm/stm pairs with prefetch */
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	pld		[r1, #32]
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
	pld		[r1, #32]
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Llessthan64_aligned:
	/* copy 16 bytes at a time until we have < 16 bytes */
	cmp		r2, #16
	ldmiage	r1!, { r3, r4, r5, r12 }
	stmiage	r0!, { r3, r4, r5, r12 }
	subsge	r2, r2, #16
	bgt		Llessthan64_aligned
	beq		Lexit

Llessthan16_aligned:
	/*
	 * Remaining len is 1..15. Move its low nibble into NZCV so each bit
	 * selects one conditional transfer below:
	 *   N (bit 3) -> 8 bytes, Z (bit 2) -> 4, C (bit 1) -> 2, V (bit 0) -> 1.
	 */
	mov		r2, r2, lsl #28
	msr		cpsr_f, r2

	ldmiami	r1!, { r2, r3 }
	ldreq	r4, [r1], #4
	ldrhcs	r5, [r1], #2
	ldrbvs	r12, [r1], #1

	stmiami	r0!, { r2, r3 }
	streq	r4, [r0], #4
	strhcs	r5, [r0], #2
	strbvs	r12, [r0], #1
	b		Lexit

Lsimilarlyunaligned:
	/*
	 * both src and dest are unaligned in similar ways; copy
	 * (16 - (dest & 15)) & 15 leading bytes so dest becomes 16-byte
	 * aligned. The negated nibble is loaded into NZCV so each flag
	 * gates one transfer (same encoding as Llessthan16_aligned).
	 */
	mov		r12, r0, lsl #28
	rsb		r12, r12, #0
	msr		cpsr_f, r12

	ldrbvs	r3, [r1], #1
	ldrhcs	r4, [r1], #2
	ldreq	r5, [r1], #4

	strbvs	r3, [r0], #1
	strhcs	r4, [r0], #2
	streq	r5, [r0], #4

	ldmiami	r1!, { r3, r4 }
	stmiami	r0!, { r3, r4 }

	/* r12 >> 28 is exactly the number of bytes just copied */
	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmiane	r1!, { r3, r4, r5, r12 }
	stmiane	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned
	b		Llessthan64_aligned

Lbytewise2:
	/* copy 2 bytes at a time; pl gates the second byte when len was >= 2 */
	subs	r2, r2, #2

	ldrb	r3, [r1], #1
	ldrbpl	r4, [r1], #1

	strb	r3, [r0], #1
	strbpl	r4, [r0], #1

	bhi		Lbytewise2
	b		Lexit

Lbytewise:
	/* simple bytewise forward copy */
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bne		Lbytewise
	b		Lexit

Lsmallforwardcopy:
	/* src and dest are word aligned similarly, less than 64 bytes to copy */
	cmp		r2, #4
	blt		Lbytewise2

	/* bytewise copy until word aligned */
	tst		r1, #3
Lwordalignloop:
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop

	cmp		r2, #16
	bge		Llessthan64_aligned
	blt		Llessthan16_aligned

Loverlap:
	/* src and dest overlap in some way, len > 0; r3 = |dest - src| */
	cmp		r0, r1				/* if dest > src */
	bhi		Loverlap_srclower

Loverlap_destlower:
	/* dest < src, see if we can still do a fast forward copy or fallback to slow forward copy */
	cmp		r3, #64
	bge		Lnormalforwardcopy 	/* overlap is greater than one stride of the copy, use normal copy */

	cmp		r3, #2
	bge		Lbytewise2
	b		Lbytewise

	/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
	/* src < dest, with overlap */

	/* src += len; dest += len; (copy runs backwards from the end) */
	add		r0, r0, r2
	add		r1, r1, r2

	/* we have to copy in reverse no matter what, test if we can we use a large block reverse copy */
	cmp		r2, #64				/* less than 64 bytes to copy? */
	cmpgt	r3, #64				/* less than 64 bytes of nonoverlap? */
	blt		Lbytewise_reverse

	/* test if src and dest are nonword aligned differently */
	mov		r3, r0, lsl #30
	cmp		r3, r1, lsl #30
	bne		Lbytewise_reverse

	/* test if src and dest are non word aligned or dest is non 16 byte aligned */
	tst		r0, #0xf
	bne		Lunaligned_reverse_similarly

	/* test for dest 32 byte alignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32_reverse_similarly

	/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop_reverse:
	/* copy 64 bytes at a time */
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	/*
	 * NOTE(review): only the first pld is conditional on the arch; the
	 * second below is unconditional. Preserved as-is from the original.
	 */
#if ARCH_ARMv5 || ARCH_ARMv5e || ARCH_ARMv6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
	pld		[r1, #-32]
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop_reverse

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Lbytewise_reverse:
	/* byte-at-a-time copy, walking down from the end */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	subs	r2, r2, #1
	bne		Lbytewise_reverse
	b		Lexit

Lunaligned_reverse_similarly:
	/*
	 * both src and dest are unaligned in similar ways; copy dest & 15
	 * trailing bytes so dest becomes 16-byte aligned (downward). The
	 * nibble goes into NZCV to gate the conditional transfers, as in
	 * the forward case (no negation needed when aligning downward).
	 */
	mov		r12, r0, lsl #28
	msr		cpsr_f, r12

	ldrbvs	r3, [r1, #-1]!
	ldrhcs	r4, [r1, #-2]!
	ldreq	r5, [r1, #-4]!

	strbvs	r3, [r0, #-1]!
	strhcs	r4, [r0, #-2]!
	streq	r5, [r0, #-4]!

	ldmdbmi	r1!, { r3, r4 }
	stmdbmi	r0!, { r3, r4 }

	/* r12 >> 28 is exactly the number of bytes just copied */
	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32_reverse_similarly:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmdbne	r1!, { r3, r4, r5, r12 }
	stmdbne	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned_reverse
	b		Lbytewise_reverse

	/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
	/* NOTE(review): original comment said "less than 24 bytes" but the
	 * threshold tested is 8; preserved the code as-is. */
	cmp		r2, #8
	blt		Lbytewise2			/* not worth the effort below this size */

	/* bytewise copy until src word aligned */
	tst		r1, #3
Lwordalignloop2:
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop2

	/* figure out how the src and dest are unaligned (dest & 3, src now aligned) */
	and		r3, r0, #3
	cmp		r3, #2
	blt		Lalign1_forward
	beq		Lalign2_forward
	bgt		Lalign3_forward

Lalign1_forward:
	/* the dest pointer is 1 byte off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #1

	/* prime the copy: re-read the byte just written at dest-1 so it can
	 * be merged into the first full-word store */
	ldrb	r4, [r0]			/* load D[7:0] */

Lalign1_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #8	/* D[31:8] = S[23:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #24		/* D[7:0] = S[31:24] */
	subs	r12, r12, #1
	bne		Lalign1_forward_loop

	/* finish the copy off */
	strb	r4, [r0], #1		/* save D[7:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign2_forward:
	/* the dest pointer is 2 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #2

	/* prime the copy */
	ldrh	r4, [r0]			/* load D[15:0] */

Lalign2_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #16	/* D[31:16] = S[15:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #16		/* D[15:0] = S[31:16] */
	subs	r12, r12, #1
	bne		Lalign2_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2		/* save D[15:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign3_forward:
	/* the dest pointer is 3 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #3

	/* prime the copy */
	ldr		r4, [r0]
	and		r4, r4, #0x00ffffff	/* load D[23:0] */

Lalign3_forward_loop:
	ldr		r3, [r1], #4		/* load S */
	orr		r4, r4, r3, lsl #24	/* D[31:24] = S[7:0] */
	str		r4, [r0], #4		/* save D */
	mov		r4, r3, lsr #8		/* D[23:0] = S[31:8] */
	subs	r12, r12, #1
	bne		Lalign3_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2		/* save D[15:0] */
	mov		r4, r4, lsr #16
	strb	r4, [r0], #1		/* save D[23:16] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lexit:
	/* restore scratch regs and return original dest in r0 */
	ldmfd	sp!, { r0, r4, r5, r7, pc }
402
403