/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All three copy n successive bytes from source to destination.  memmove and
 * memcpy return destination, whereas bcopy has no return value.  Copying takes
 * place as if it were through a temporary buffer -- after return, destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on MacOS).
 */
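
//	For reference, a minimal C-level sketch of the copy semantics described
//	above (illustrative only; memmove_model is a hypothetical name, and the
//	real routine below never materializes a temporary buffer):
//
//	    #include <stdlib.h>
//
//	    void *memmove_model(void *dstv, const void *srcv, size_t n) {
//	        unsigned char *dst = dstv;
//	        const unsigned char *src = srcv;
//	        unsigned char *tmp = malloc(n);  // "temporary buffer"; error
//	                                         // handling omitted in sketch
//	        for (size_t i = 0; i < n; i++) tmp[i] = src[i]; // read all of src
//	        for (size_t i = 0; i < n; i++) dst[i] = tmp[i]; // then write dst
//	        free(tmp);
//	        return dst;
//	    }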

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

#define kSmallCopy 64

/*****************************************************************************
 *  Entrypoints                                                              *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
//	Translate bcopy into memcpy by swapping the first and second arguments.
	ARM64_PROLOG
	mov     x3,      x0
	mov     x0,      x1
	mov     x1,      x3

.align 4
_memcpy:
_memmove:
//	Our preference is to copy the data in ascending address order, but if the
//	buffers overlap such that the beginning of the destination buffer aliases
//	the end of the source buffer, we need to copy in descending address order
//	instead to preserve the memmove semantics.  We detect this case with the
//	test:
//
//	    destination - source < length    (unsigned compare)
//
//	If the address of the source buffer is higher than the address of the
//	destination buffer, this arithmetic can overflow, but the overflowed value
//	can only be smaller than length if the buffers do not overlap, so we don't
//	need to worry about false positives due to the overflow (they happen, but
//	only in cases where copying in either order is correct).
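//
//	Worked example (illustrative values): with dst = 0x1010, src = 0x1000,
//	length = 0x100, dst - src = 0x10 < 0x100, so the buffers overlap and we
//	copy in reverse.  With dst = 0x1000, src = 0x2000, length = 0x100,
//	dst - src wraps to 0xFFFFFFFFFFFFF000, which is not below 0x100, so we
//	take the forward path -- correct whenever src lies above dst.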
	ARM64_STACK_PROLOG
	PUSH_FRAME
	sub     x3,      x0, x1
	cmp     x3,      x2
	b.cc    L_reverse
	mov     x3,      x0      // copy destination pointer
	cmp     x2,      #(kSmallCopy)
	b.cc    L_forwardSmallCopy

/*****************************************************************************
 *  Forward large copy                                                       *
 *****************************************************************************/

//	Load the first 32 bytes from src, and compute the number of bytes to the
//	first 32-byte aligned location in dst.  Even though we are going to copy
//	32 bytes, only those preceding that 32-byte location "count" towards
//	reducing the length of the buffer or advancing the pointers.  We will need
//	to issue the first load from the advanced src pointer BEFORE the store to
//	the unmodified dst pointer.
	add     x3,      x3, #32
	and     x3,      x3, #-32 // aligned dst
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	sub     x5,      x3, x0   // bytes between original dst and aligned dst
	add     x1,      x1, x5   // update src pointer
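//	For example (illustrative addresses): if dst = 0x1005, then
//	(0x1005 + 32) & -32 = 0x1020, so x3 = 0x1020 and x5 = 0x1B.  The initial
//	unaligned store covers [0x1005, 0x1025), and the aligned loop takes over
//	at 0x1020, rewriting the overlapping bytes with identical data.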

//	At this point, data in the following registers is in flight:
//
//		x0    original dst pointer
//		x1    corresponding location in src buffer.
//		x2    length from aligned location in dst to end of buffer.  This is
//		      guaranteed to be >= (64 - 32).
//		x3    aligned location in dst buffer.
//		x12:x15 first 32 bytes of src buffer.
//
//	We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0.  The
//	store *may* overlap the first 32 bytes of the load, so in order to get
//	correct memmove semantics, the first 32-byte load must occur before the
//	store.
//
//	After loading these 32 bytes, we advance x1, and decrement the length by
//	64.  If the remaining length of the buffer was less than 64, then we jump
//	directly to the cleanup path.
	ldp     x8, x9, [x1]
	ldp     x10,x11,[x1, #16]
	add     x1,      x1, #32
	sub     x2,      x2, x5   // update length
	stp     x12,x13,[x0]      // initial unaligned store
	stp     x14,x15,[x0, #16] // initial unaligned store
	subs    x2,      x2, #64
	b.ls    L_forwardCleanup

L_forwardCopyLoop:
//	Main copy loop:
//
//		1. store the 32 bytes loaded in the previous loop iteration
//		2. advance the destination pointer
//		3. load the next 32 bytes
//		4. advance the source pointer
//		5. subtract 32 from the length
//
//	The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
//	trailing 1-32 bytes will be copied in the loop cleanup.
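//
//	The loop uses ldnp/stnp, whose non-temporal hint tells the memory system
//	that the data is unlikely to be reused soon; for large copies this can
//	reduce cache pollution relative to plain ldp/stp.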
	stnp    x8, x9, [x3]
	stnp    x10,x11,[x3, #16]
	add     x3,      x3, #32
	ldnp    x8, x9, [x1]
	ldnp    x10,x11,[x1, #16]
	add     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_forwardCopyLoop

L_forwardCleanup:
//	There are 32 bytes in x8-x11 that were loaded in the previous loop
//	iteration, which need to be stored to [x3,x3+32).  In addition, between
//	0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
//	number of bytes to copy is x2 + 32.  Instead of using smaller conditional
//	copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2.
//	This copy may overlap with the first store, so the loads must come before
//	the store of the data from the previous loop iteration.
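//
//	Worked example (illustrative): if x2 = -8 on entry, then x2 + 32 = 24
//	bytes remain beyond the 32 already in registers.  We load from x1-8 and
//	store to [x3+24, x3+56); the first 8 of those bytes overwrite the tail of
//	the [x3, x3+32) store with identical data, and the rest finish the buffer.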
	add     x1,      x1, x2
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	stp     x8, x9, [x3]
	stp     x10,x11,[x3, #16]
	add     x3,      x3, x2
	stp     x12,x13,[x3, #32]
	stp     x14,x15,[x3, #48]
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Forward small copy                                                       *
 *****************************************************************************/

//	Copy one quadword at a time until fewer than 8 bytes remain to be copied.
//	At the point of entry to L_forwardSmallCopy, the "calling convention"
//	is as follows:
//
//	  x0     pointer to first byte of destination
//	  x1     pointer to first byte of source
//	  x2     length of buffers
//	  x3     pointer to first byte of destination (copy of x0, used as the
//	         working pointer)
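//
//	For example (illustrative): with x2 = 13, the first subs leaves 5 with
//	carry set, so one quadword is copied; the next subs leaves -3 with carry
//	clear, the adds restores x2 to 5, and the byte loop copies the last 5
//	bytes.  A length of 0 takes the b.eq exit without copying anything.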
0:	ldr     x6,     [x1],#8
	str     x6,     [x3],#8
L_forwardSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b
	adds    x2,      x2, #8
	b.eq    2f
1:	ldrb    w6,     [x1],#1
	strb    w6,     [x3],#1
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse copy engines                                                     *
 *****************************************************************************/

//	The reverse copy engines are identical in every way to the forward copy
//	engines, except that they do everything backwards.  For this reason, they
//	are somewhat more sparsely commented than the forward copy loops.  I have
//	tried to comment only the things that might be surprising in how they
//	differ from the forward implementation.
//
//	The one important thing to note is that (almost without fail), x1 and x3
//	will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
//	throughout these copy loops.  They are initially advanced to that position
//	in the L_reverse jump island.  Because of this, whereas the forward copy
//	loops generally follow a "copy data, then advance pointers" scheme, in the
//	reverse copy loops we advance the pointers, then copy the data.

L_reverse:
//	As a minor optimization, we early out if dst == src (x3 holds dst - src).
	cbz     x3,      L_return
//	Advance both pointers to the ends of their respective buffers before
//	jumping into the appropriate reverse copy loop.
	add     x4,      x0, x2
	add     x1,      x1, x2
	cmp     x2,      #(kSmallCopy)
	b.cc    L_reverseSmallCopy

/*****************************************************************************
 *  Reverse large copy                                                       *
 *****************************************************************************/

	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	sub     x3,      x4, #1   // In the forward copy, we used dst+32 & -32
	and     x3,      x3, #-32 // to find an aligned location in the dest
	sub     x5,      x4, x3   // buffer.  Here we use dst-1 & -32 instead,
	sub     x1,      x1, x5   // because we are going backwards.
	sub     x2,      x2, x5
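//	For example (illustrative): if the destination ends at x4 = 0x1038, then
//	(0x1038 - 1) & -32 = 0x1020, so x3 = 0x1020 and x5 = 0x18; the initial
//	unaligned store covers [0x1018, 0x1038) and the aligned loop continues
//	downward from 0x1020.  The -1 matters when x4 is already 32-byte aligned:
//	for x4 = 0x1040 it yields x3 = 0x1020 and x5 = 0x20, not x5 = 0.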
	ldp     x8, x9, [x1, #-16]
	ldp     x10,x11,[x1, #-32]
	stp     x12,x13,[x4, #-16]
	stp     x14,x15,[x4, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #64
	b.ls    L_reverseCleanup

L_reverseCopyLoop:
	stnp    x8, x9, [x3, #-16]
	stnp    x10,x11,[x3, #-32]
	sub     x3,      x3, #32
	ldnp    x8, x9, [x1, #-16]
	ldnp    x10,x11,[x1, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_reverseCopyLoop

L_reverseCleanup:
	sub     x1,      x1, x2
	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	stp     x8, x9, [x3, #-16]
	stp     x10,x11,[x3, #-32]
	stp     x12,x13,[x0, #16]  // In the forward copy, we need to compute the
	stp     x14,x15,[x0]       // address of these stores, but here we already
	POP_FRAME                  // have a pointer to the start of the buffer.
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse small copy                                                       *
 *****************************************************************************/

0:	ldr     x6,     [x1,#-8]!
	str     x6,     [x4,#-8]!
L_reverseSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b
	adds    x2,      x2, #8
	b.eq    2f
1:	ldrb    w6,     [x1,#-1]!
	strb    w6,     [x4,#-1]!
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG


L_return:
	POP_FRAME
	ARM64_STACK_EPILOG