xref: /xnu-10002.61.3/osfmk/arm64/bcopy.s (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1*0f4c859eSApple OSS Distributions/*
2*0f4c859eSApple OSS Distributions * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
3*0f4c859eSApple OSS Distributions *
4*0f4c859eSApple OSS Distributions * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5*0f4c859eSApple OSS Distributions *
6*0f4c859eSApple OSS Distributions * This file contains Original Code and/or Modifications of Original Code
7*0f4c859eSApple OSS Distributions * as defined in and that are subject to the Apple Public Source License
8*0f4c859eSApple OSS Distributions * Version 2.0 (the 'License'). You may not use this file except in
9*0f4c859eSApple OSS Distributions * compliance with the License. The rights granted to you under the License
10*0f4c859eSApple OSS Distributions * may not be used to create, or enable the creation or redistribution of,
11*0f4c859eSApple OSS Distributions * unlawful or unlicensed copies of an Apple operating system, or to
12*0f4c859eSApple OSS Distributions * circumvent, violate, or enable the circumvention or violation of, any
13*0f4c859eSApple OSS Distributions * terms of an Apple operating system software license agreement.
14*0f4c859eSApple OSS Distributions *
15*0f4c859eSApple OSS Distributions * Please obtain a copy of the License at
16*0f4c859eSApple OSS Distributions * http://www.opensource.apple.com/apsl/ and read it before using this file.
17*0f4c859eSApple OSS Distributions *
18*0f4c859eSApple OSS Distributions * The Original Code and all software distributed under the License are
19*0f4c859eSApple OSS Distributions * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20*0f4c859eSApple OSS Distributions * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21*0f4c859eSApple OSS Distributions * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22*0f4c859eSApple OSS Distributions * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23*0f4c859eSApple OSS Distributions * Please see the License for the specific language governing rights and
24*0f4c859eSApple OSS Distributions * limitations under the License.
25*0f4c859eSApple OSS Distributions *
26*0f4c859eSApple OSS Distributions * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27*0f4c859eSApple OSS Distributions *
28*0f4c859eSApple OSS Distributions *  This file implements the following functions for the arm64 architecture.
29*0f4c859eSApple OSS Distributions *
30*0f4c859eSApple OSS Distributions *  void bcopy(const void * source,
31*0f4c859eSApple OSS Distributions *             void * destination,
32*0f4c859eSApple OSS Distributions *             size_t length);
33*0f4c859eSApple OSS Distributions *
34*0f4c859eSApple OSS Distributions *  void *memmove(void * destination,
35*0f4c859eSApple OSS Distributions *                const void * source,
36*0f4c859eSApple OSS Distributions *                size_t n);
37*0f4c859eSApple OSS Distributions *
38*0f4c859eSApple OSS Distributions *  void *memcpy(void * restrict destination,
39*0f4c859eSApple OSS Distributions *               const void * restrict source,
40*0f4c859eSApple OSS Distributions *               size_t n);
41*0f4c859eSApple OSS Distributions *
42*0f4c859eSApple OSS Distributions * All copy n successive bytes from source to destination.  Memmove and memcpy
43*0f4c859eSApple OSS Distributions * return destination, whereas bcopy has no return value.  Copying takes place
44*0f4c859eSApple OSS Distributions * as if it were through a temporary buffer -- after return destination
45*0f4c859eSApple OSS Distributions * contains exactly the bytes from source, even if the buffers overlap (this is
46*0f4c859eSApple OSS Distributions * not required of memcpy by the C standard; its behavior is undefined if the
47*0f4c859eSApple OSS Distributions * buffers overlap, but we are holding ourselves to the historical behavior of
48*0f4c859eSApple OSS Distributions * this function on MacOS).
49*0f4c859eSApple OSS Distributions */
50*0f4c859eSApple OSS Distributions
51*0f4c859eSApple OSS Distributions#include "asm.h"
52*0f4c859eSApple OSS Distributions
53*0f4c859eSApple OSS Distributions.globl _bcopy
54*0f4c859eSApple OSS Distributions.globl _ovbcopy
55*0f4c859eSApple OSS Distributions.globl _memcpy
56*0f4c859eSApple OSS Distributions.globl _memmove
57*0f4c859eSApple OSS Distributions
58*0f4c859eSApple OSS Distributions/*****************************************************************************
59*0f4c859eSApple OSS Distributions *  Macros                                                                   *
60*0f4c859eSApple OSS Distributions *****************************************************************************/
61*0f4c859eSApple OSS Distributions
//	Length threshold in bytes.  _memcpy/_memmove compare the length against
//	kSmallCopy and branch to L_forwardSmallCopy or L_reverseSmallCopy when the
//	length is below it (unsigned compare).  Otherwise the large copy engines,
//	which move 32 bytes at a time, are used.
62*0f4c859eSApple OSS Distributions#define kSmallCopy 64
63*0f4c859eSApple OSS Distributions
64*0f4c859eSApple OSS Distributions/*****************************************************************************
65*0f4c859eSApple OSS Distributions *  Entrypoints                                                              *
66*0f4c859eSApple OSS Distributions *****************************************************************************/
67*0f4c859eSApple OSS Distributions
68*0f4c859eSApple OSS Distributions.text
69*0f4c859eSApple OSS Distributions.align 5
70*0f4c859eSApple OSS Distributions_bcopy:
71*0f4c859eSApple OSS Distributions_ovbcopy:
72*0f4c859eSApple OSS Distributions//  Translate bcopy into memcpy by swapping the first and second arguments.
//  Per the prototypes in the file header, bcopy takes (source, destination,
//  length) while memcpy/memmove take (destination, source, n), so on entry
//  x0 = source and x1 = destination.  x2 (the length) is left alone and x3
//  is the scratch register for the swap.  _bcopy and _ovbcopy label the same
//  address.  There is no branch or return after the swap: execution runs on
//  into the code that follows these three instructions.
73*0f4c859eSApple OSS Distributions	mov     x3,      x0      // x3 = source (scratch copy)
74*0f4c859eSApple OSS Distributions	mov     x0,      x1      // x0 = destination
75*0f4c859eSApple OSS Distributions	mov     x1,      x3      // x1 = source
76*0f4c859eSApple OSS Distributions
77*0f4c859eSApple OSS Distributions.align 4
78*0f4c859eSApple OSS Distributions_memcpy:
79*0f4c859eSApple OSS Distributions_memmove:
//	Shared entry point for memcpy and memmove: pick a copy direction that is
//	safe for overlapping buffers, then dispatch on the length.
//
//	Register use on entry (matching the prototypes in the file header):
//
//		x0    destination
//		x1    source
//		x2    length in bytes
//
//	None of the instructions visible in this file write x0, so it still holds
//	the destination pointer when the function returns.  ARM64_STACK_PROLOG and
//	PUSH_FRAME are macros that are not defined in this file (presumably they
//	come from asm.h -- confirm there what they save and clobber).
//
80*0f4c859eSApple OSS Distributions//	Our preference is to copy the data in ascending address order, but if the
81*0f4c859eSApple OSS Distributions//	buffers overlap such that the beginning of the destination buffer aliases
82*0f4c859eSApple OSS Distributions//	the end of the source buffer, we need to copy in descending address order
83*0f4c859eSApple OSS Distributions//	instead to preserve the memmove semantics.  We detect this case with the
84*0f4c859eSApple OSS Distributions//	test:
85*0f4c859eSApple OSS Distributions//
86*0f4c859eSApple OSS Distributions//	    destination - source < length    (unsigned compare)
87*0f4c859eSApple OSS Distributions//
88*0f4c859eSApple OSS Distributions//	If the address of the source buffer is higher than the address of the
89*0f4c859eSApple OSS Distributions//	destination buffer, this arithmetic can overflow, but the overflowed value
90*0f4c859eSApple OSS Distributions//	can only be smaller than length if the buffers do not overlap, so we don't
91*0f4c859eSApple OSS Distributions//	need to worry about false positives due to the overflow (they happen, but
92*0f4c859eSApple OSS Distributions//	only in cases where copying in either order is correct).
93*0f4c859eSApple OSS Distributions	ARM64_STACK_PROLOG
94*0f4c859eSApple OSS Distributions	PUSH_FRAME
95*0f4c859eSApple OSS Distributions	sub     x3,      x0, x1  // x3 = destination - source (modulo 2^64)
96*0f4c859eSApple OSS Distributions	cmp     x3,      x2
97*0f4c859eSApple OSS Distributions	b.cc    L_reverse        // (destination - source) < length, unsigned
98*0f4c859eSApple OSS Distributions	mov     x3,      x0      // copy destination pointer
99*0f4c859eSApple OSS Distributions	cmp     x2,      #(kSmallCopy)
100*0f4c859eSApple OSS Distributions	b.cc    L_forwardSmallCopy // length < kSmallCopy, unsigned
101*0f4c859eSApple OSS Distributions
102*0f4c859eSApple OSS Distributions/*****************************************************************************
103*0f4c859eSApple OSS Distributions *  Forward large copy                                                       *
104*0f4c859eSApple OSS Distributions *****************************************************************************/
105*0f4c859eSApple OSS Distributions
//	Set-up for the forward large copy.  Reached by falling through from the
//	entry code when length >= kSmallCopy, with x3 == x0 == destination.
//
106*0f4c859eSApple OSS Distributions//	Load the first 32 bytes from src, and compute the number of bytes to the
107*0f4c859eSApple OSS Distributions//	first 32-byte aligned location in dst.  Even though we are going to copy
108*0f4c859eSApple OSS Distributions//	32 bytes, only those preceding that 32-byte location "count" towards
109*0f4c859eSApple OSS Distributions//	reducing the length of the buffer or advancing the pointers.  We will need
110*0f4c859eSApple OSS Distributions//	to issue the first load from the advanced src pointer BEFORE the store to
111*0f4c859eSApple OSS Distributions//	the unmodified dst pointer.
112*0f4c859eSApple OSS Distributions	add     x3,      x3, #32  // x3 = dst + 32
113*0f4c859eSApple OSS Distributions	and     x3,      x3, #-32 // aligned dst: first 32-byte boundary above x0
114*0f4c859eSApple OSS Distributions	ldp     x12,x13,[x1]      // src bytes 0-15
115*0f4c859eSApple OSS Distributions	ldp     x14,x15,[x1, #16] // src bytes 16-31
116*0f4c859eSApple OSS Distributions	sub     x5,      x3, x0   // bytes between original dst and aligned dst (1..32)
117*0f4c859eSApple OSS Distributions	add     x1,      x1, x5   // update src pointer
118*0f4c859eSApple OSS Distributions
119*0f4c859eSApple OSS Distributions//	At this point, data in the following registers is in flight:
120*0f4c859eSApple OSS Distributions//
121*0f4c859eSApple OSS Distributions//		x0    original dst pointer
122*0f4c859eSApple OSS Distributions//		x1    corresponding location in src buffer.
123*0f4c859eSApple OSS Distributions//		x2    length from aligned location in dst to end of buffer (after the
124*0f4c859eSApple OSS Distributions//		      "update length" subtraction below).  Guaranteed to be >= (64 - 32).
125*0f4c859eSApple OSS Distributions//		x3    aligned location in dst buffer.
126*0f4c859eSApple OSS Distributions//		x12:x15 first 32 bytes of src buffer.
127*0f4c859eSApple OSS Distributions//
128*0f4c859eSApple OSS Distributions//	We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0.  The
129*0f4c859eSApple OSS Distributions//	store *may* overlap the first 32 bytes of the load, so in order to get
130*0f4c859eSApple OSS Distributions//	correct memmove semantics, the first 32 byte load must occur before the
131*0f4c859eSApple OSS Distributions//	store.
132*0f4c859eSApple OSS Distributions//
133*0f4c859eSApple OSS Distributions//	After loading these 32 bytes, we advance x1, and decrement the length by
134*0f4c859eSApple OSS Distributions//	64.  If the remaining length of the buffer was 64 or less, then we jump
135*0f4c859eSApple OSS Distributions//	directly to the cleanup path.
136*0f4c859eSApple OSS Distributions	ldp     x8, x9, [x1]      // 32 src bytes destined for [x3, x3+32)
137*0f4c859eSApple OSS Distributions	ldp     x10,x11,[x1, #16]
138*0f4c859eSApple OSS Distributions	add     x1,      x1, #32
139*0f4c859eSApple OSS Distributions	sub     x2,      x2, x5   // update length
140*0f4c859eSApple OSS Distributions	stp     x12,x13,[x0]      // initial unaligned store
141*0f4c859eSApple OSS Distributions	stp     x14,x15,[x0, #16] // initial unaligned store
142*0f4c859eSApple OSS Distributions	subs    x2,      x2, #64
143*0f4c859eSApple OSS Distributions	b.ls    L_forwardCleanup  // pre-subtraction x2 <= 64 (unsigned)
144*0f4c859eSApple OSS Distributions
145*0f4c859eSApple OSS DistributionsL_forwardCopyLoop:
146*0f4c859eSApple OSS Distributions//	Main copy loop:
147*0f4c859eSApple OSS Distributions//
148*0f4c859eSApple OSS Distributions//		1. store the 32 bytes loaded in the previous loop iteration
149*0f4c859eSApple OSS Distributions//		2. advance the destination pointer
150*0f4c859eSApple OSS Distributions//		3. load the next 32 bytes
151*0f4c859eSApple OSS Distributions//		4. advance the source pointer
152*0f4c859eSApple OSS Distributions//		5. subtract 32 from the length
153*0f4c859eSApple OSS Distributions//
154*0f4c859eSApple OSS Distributions//	The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
155*0f4c859eSApple OSS Distributions//	trailing 1-32 bytes will be copied in the loop cleanup.
//
//	x3 stays 32-byte aligned here: it was rounded with "and x3, x3, #-32" in
//	the set-up code and only ever advances by 32.  x8-x11 carry the 32 bytes
//	that are stored at the top of the next iteration (or in the cleanup).
//	stnp/ldnp are the pair store/load forms that carry a non-temporal hint.
156*0f4c859eSApple OSS Distributions	stnp    x8, x9, [x3]
157*0f4c859eSApple OSS Distributions	stnp    x10,x11,[x3, #16]
158*0f4c859eSApple OSS Distributions	add     x3,      x3, #32
159*0f4c859eSApple OSS Distributions	ldnp    x8, x9, [x1]
160*0f4c859eSApple OSS Distributions	ldnp    x10,x11,[x1, #16]
161*0f4c859eSApple OSS Distributions	add     x1,      x1, #32
162*0f4c859eSApple OSS Distributions	subs    x2,      x2, #32
163*0f4c859eSApple OSS Distributions	b.hi    L_forwardCopyLoop // pre-subtraction x2 > 32 (unsigned)
164*0f4c859eSApple OSS Distributions
165*0f4c859eSApple OSS DistributionsL_forwardCleanup:
166*0f4c859eSApple OSS Distributions//	There are 32 bytes in x8-x11 that were loaded in the previous loop
167*0f4c859eSApple OSS Distributions//	iteration, which need to be stored to [x3,x3+32).  In addition, between
168*0f4c859eSApple OSS Distributions//	0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
169*0f4c859eSApple OSS Distributions//	number of bytes to copy is x2 + 32.  Instead of using smaller conditional
170*0f4c859eSApple OSS Distributions//	copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2 (ending
171*0f4c859eSApple OSS Distributions//	at 64+x3+x2).  This copy may overlap with the first store, so the loads must
172*0f4c859eSApple OSS Distributions//	come before the store of the data from the previous loop iteration.
//
//	x2 is zero or negative here: both branches into this label are taken on
//	an unsigned "lower or same" / "not higher" result of a subtraction.
173*0f4c859eSApple OSS Distributions	add     x1,      x1, x2   // step src back by -x2 bytes
174*0f4c859eSApple OSS Distributions	ldp     x12,x13,[x1]      // 32 bytes for the final store below
175*0f4c859eSApple OSS Distributions	ldp     x14,x15,[x1, #16]
176*0f4c859eSApple OSS Distributions	stp     x8, x9, [x3]      // pending 32 bytes from the previous loads
177*0f4c859eSApple OSS Distributions	stp     x10,x11,[x3, #16]
178*0f4c859eSApple OSS Distributions	add     x3,      x3, x2
179*0f4c859eSApple OSS Distributions	stp     x12,x13,[x3, #32] // final, possibly overlapping, 32-byte store
180*0f4c859eSApple OSS Distributions	stp     x14,x15,[x3, #48]
181*0f4c859eSApple OSS Distributions	POP_FRAME                 // macro not defined in this file
182*0f4c859eSApple OSS Distributions	ARM64_STACK_EPILOG        // macro not defined in this file; presumably returns
183*0f4c859eSApple OSS Distributions
184*0f4c859eSApple OSS Distributions/*****************************************************************************
185*0f4c859eSApple OSS Distributions *  forward small copy                                                       *
186*0f4c859eSApple OSS Distributions *****************************************************************************/
187*0f4c859eSApple OSS Distributions
188*0f4c859eSApple OSS Distributions//	Copy 8 bytes (one x register) at a time until less than 8 bytes remain to
//	be copied, then copy the remaining 0-7 bytes one byte at a time.  Entry
//	is at the L_forwardSmallCopy label, not at the "0:" label above it, so
//	the length is tested before the first load.
189*0f4c859eSApple OSS Distributions//	At the point of entry to L_forwardSmallCopy, the "calling convention"
190*0f4c859eSApple OSS Distributions//	is as follows:
191*0f4c859eSApple OSS Distributions//
192*0f4c859eSApple OSS Distributions//	  x0     pointer to first byte of destination
193*0f4c859eSApple OSS Distributions//	  x1     pointer to first byte of source
194*0f4c859eSApple OSS Distributions//	  x2     length of buffers
195*0f4c859eSApple OSS Distributions//	  x3     pointer to first byte of destination
196*0f4c859eSApple OSS Distributions0:	ldr     x6,     [x1],#8   // load 8 bytes, then src += 8
197*0f4c859eSApple OSS Distributions	str     x6,     [x3],#8   // store 8 bytes, then dst cursor += 8
198*0f4c859eSApple OSS DistributionsL_forwardSmallCopy:
199*0f4c859eSApple OSS Distributions	subs    x2,      x2, #8
200*0f4c859eSApple OSS Distributions	b.cs    0b                // no borrow: at least 8 bytes were left
201*0f4c859eSApple OSS Distributions	adds    x2,      x2, #8   // undo the last subtraction: x2 = bytes left (0..7)
202*0f4c859eSApple OSS Distributions	b.eq    2f                // nothing left to copy
203*0f4c859eSApple OSS Distributions1:	ldrb    w6,     [x1],#1   // byte-at-a-time tail
204*0f4c859eSApple OSS Distributions	strb    w6,     [x3],#1
205*0f4c859eSApple OSS Distributions	subs    x2,      x2, #1
206*0f4c859eSApple OSS Distributions	b.ne    1b
207*0f4c859eSApple OSS Distributions2:	POP_FRAME                 // macro not defined in this file
208*0f4c859eSApple OSS Distributions	ARM64_STACK_EPILOG        // macro not defined in this file; presumably returns
209*0f4c859eSApple OSS Distributions
210*0f4c859eSApple OSS Distributions/*****************************************************************************
211*0f4c859eSApple OSS Distributions *  Reverse copy engines                                                     *
212*0f4c859eSApple OSS Distributions *****************************************************************************/
213*0f4c859eSApple OSS Distributions
214*0f4c859eSApple OSS Distributions//	The reverse copy engines are identical in every way to the forward copy
215*0f4c859eSApple OSS Distributions//	engines, except in that they do everything backwards.  For this reason, they
216*0f4c859eSApple OSS Distributions//	are somewhat more sparsely commented than the forward copy loops.  I have
217*0f4c859eSApple OSS Distributions//	tried to only comment things that might be somewhat surprising in how they
218*0f4c859eSApple OSS Distributions//	differ from the forward implementation.
219*0f4c859eSApple OSS Distributions//
220*0f4c859eSApple OSS Distributions//	The one important thing to note is that (almost without fail), x1 and the
221*0f4c859eSApple OSS Distributions//	destination cursor (x4, plus x3 in the large copy engine) will point to ONE
222*0f4c859eSApple OSS Distributions//	BYTE BEYOND the "right-hand edge" of the active buffer throughout these copy
223*0f4c859eSApple OSS Distributions//	loops.  The end pointers are set up in the L_reverse jump island (x1 is
224*0f4c859eSApple OSS Distributions//	advanced in place, the destination end goes into x4).  Because of this,
225*0f4c859eSApple OSS Distributions//	whereas the forward copy loops generally follow a "copy data, then advance
//	pointers" scheme, in the reverse copy loops, we advance the pointers, then
//	copy the data.
226*0f4c859eSApple OSS Distributions
227*0f4c859eSApple OSS DistributionsL_reverse:
228*0f4c859eSApple OSS Distributions//	As a minor optimization, we early out if dst == src.
229*0f4c859eSApple OSS Distributions	cbz     x3,      L_return // x3 is still dst - src from the entry test
230*0f4c859eSApple OSS Distributions//	advance both pointers to the ends of their respective buffers before
231*0f4c859eSApple OSS Distributions//	jumping into the appropriate reverse copy loop.
232*0f4c859eSApple OSS Distributions	add     x4,      x0, x2   // x4 = dst + length (x0 itself is left unchanged)
233*0f4c859eSApple OSS Distributions	add     x1,      x1, x2   // x1 = src + length
234*0f4c859eSApple OSS Distributions	cmp     x2,      #(kSmallCopy)
235*0f4c859eSApple OSS Distributions	b.cc    L_reverseSmallCopy // length < kSmallCopy, unsigned
236*0f4c859eSApple OSS Distributions
237*0f4c859eSApple OSS Distributions/*****************************************************************************
238*0f4c859eSApple OSS Distributions *  Reverse large copy                                                       *
239*0f4c859eSApple OSS Distributions *****************************************************************************/
240*0f4c859eSApple OSS Distributions
//	Set-up for the reverse large copy, mirroring the forward set-up.  On entry
//	(fall-through from L_reverse) x1 and x4 point one byte past the end of the
//	source and destination buffers, x2 is the length (>= kSmallCopy) and x0
//	is the start of the destination.  The last 32 bytes are copied with an
//	unaligned store through x4.  x3 becomes the 32-byte aligned destination
//	cursor, and x1/x2 are adjusted by the x5 bytes that lie above x3.
241*0f4c859eSApple OSS Distributions	ldp     x12,x13,[x1, #-16]
242*0f4c859eSApple OSS Distributions	ldp     x14,x15,[x1, #-32]
243*0f4c859eSApple OSS Distributions	sub     x3,      x4, #1   // In the forward copy, we used dst+32 & -32
244*0f4c859eSApple OSS Distributions	and     x3,      x3, #-32 // to find an aligned location in the dest
245*0f4c859eSApple OSS Distributions	sub     x5,      x4, x3   // buffer.  Here we use (dst end)-1 & -32 instead,
246*0f4c859eSApple OSS Distributions	sub     x1,      x1, x5   // because we are going backwards.
247*0f4c859eSApple OSS Distributions	sub     x2,      x2, x5   // length from start of dst up to aligned x3
248*0f4c859eSApple OSS Distributions	ldp     x8, x9, [x1, #-16] // 32 src bytes destined for [x3-32, x3),
249*0f4c859eSApple OSS Distributions	ldp     x10,x11,[x1, #-32] // loaded ahead of the stores below
250*0f4c859eSApple OSS Distributions	stp     x12,x13,[x4, #-16] // initial unaligned store at the end of dst
251*0f4c859eSApple OSS Distributions	stp     x14,x15,[x4, #-32]
252*0f4c859eSApple OSS Distributions	sub     x1,      x1, #32
253*0f4c859eSApple OSS Distributions	subs    x2,      x2, #64
254*0f4c859eSApple OSS Distributions	b.ls    L_reverseCleanup  // pre-subtraction x2 <= 64 (unsigned)
255*0f4c859eSApple OSS Distributions
256*0f4c859eSApple OSS DistributionsL_reverseCopyLoop:
//	Descending counterpart of L_forwardCopyLoop: store the 32 bytes held in
//	x8-x11 just below x3, move x3 down by 32, load the next 32 bytes from just
//	below x1, move x1 down by 32, and subtract 32 from x2.  x3 was rounded to
//	a 32-byte boundary by the set-up code and only ever moves by 32.
257*0f4c859eSApple OSS Distributions	stnp    x8, x9, [x3, #-16]
258*0f4c859eSApple OSS Distributions	stnp    x10,x11,[x3, #-32]
259*0f4c859eSApple OSS Distributions	sub     x3,      x3, #32
260*0f4c859eSApple OSS Distributions	ldnp    x8, x9, [x1, #-16]
261*0f4c859eSApple OSS Distributions	ldnp    x10,x11,[x1, #-32]
262*0f4c859eSApple OSS Distributions	sub     x1,      x1, #32
263*0f4c859eSApple OSS Distributions	subs    x2,      x2, #32
264*0f4c859eSApple OSS Distributions	b.hi    L_reverseCopyLoop // pre-subtraction x2 > 32 (unsigned)
265*0f4c859eSApple OSS Distributions
266*0f4c859eSApple OSS DistributionsL_reverseCleanup:
//	x8-x11 hold 32 bytes that still have to be stored to [x3-32, x3).  The
//	copy is finished with one more 32-byte store to [x0, x0+32), the start of
//	the destination, which may overlap that pending store.  Both pairs of
//	loads are issued before any of the stores.  x2 is zero or negative here
//	(both branches into this label are taken on an unsigned "lower or same" /
//	"not higher" result of a subtraction), so subtracting it moves x1 upward.
267*0f4c859eSApple OSS Distributions	sub     x1,      x1, x2
268*0f4c859eSApple OSS Distributions	ldp     x12,x13,[x1, #-16] // source bytes for [x0+16, x0+32)
269*0f4c859eSApple OSS Distributions	ldp     x14,x15,[x1, #-32] // source bytes for [x0, x0+16)
270*0f4c859eSApple OSS Distributions	stp     x8, x9, [x3, #-16]
271*0f4c859eSApple OSS Distributions	stp     x10,x11,[x3, #-32]
272*0f4c859eSApple OSS Distributions	stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
273*0f4c859eSApple OSS Distributions	stp     x14,x15,[x0]      // address of these stores, but here we already
274*0f4c859eSApple OSS Distributions	POP_FRAME       // have a pointer to the start of the buffer.
275*0f4c859eSApple OSS Distributions	ARM64_STACK_EPILOG // macro not defined in this file; presumably returns
276*0f4c859eSApple OSS Distributions
277*0f4c859eSApple OSS Distributions/*****************************************************************************
278*0f4c859eSApple OSS Distributions *  reverse small copy                                                       *
279*0f4c859eSApple OSS Distributions *****************************************************************************/
280*0f4c859eSApple OSS Distributions
//	Descending counterpart of L_forwardSmallCopy.  Entry is at the
//	L_reverseSmallCopy label with x1 and x4 pointing one byte past the end of
//	the source and destination buffers and x2 holding the length.  Copies 8
//	bytes at a time while at least 8 remain, then the last 0-7 bytes singly.
//	Both pointers are pre-decremented (the "!" write-back forms) before each
//	access.
281*0f4c859eSApple OSS Distributions0:	ldr     x6,     [x1,#-8]! // src -= 8, then load 8 bytes
282*0f4c859eSApple OSS Distributions	str     x6,     [x4,#-8]! // dst cursor -= 8, then store 8 bytes
283*0f4c859eSApple OSS DistributionsL_reverseSmallCopy:
284*0f4c859eSApple OSS Distributions	subs    x2,      x2, #8
285*0f4c859eSApple OSS Distributions	b.cs    0b                // no borrow: at least 8 bytes were left
286*0f4c859eSApple OSS Distributions	adds    x2,      x2, #8   // undo the last subtraction: x2 = bytes left (0..7)
287*0f4c859eSApple OSS Distributions	b.eq    2f                // nothing left to copy
288*0f4c859eSApple OSS Distributions1:	ldrb    w6,     [x1,#-1]! // byte-at-a-time tail
289*0f4c859eSApple OSS Distributions	strb    w6,     [x4,#-1]!
290*0f4c859eSApple OSS Distributions	subs    x2,      x2, #1
291*0f4c859eSApple OSS Distributions	b.ne    1b
292*0f4c859eSApple OSS Distributions2:	POP_FRAME                 // macro not defined in this file
293*0f4c859eSApple OSS Distributions	ARM64_STACK_EPILOG        // macro not defined in this file; presumably returns
294*0f4c859eSApple OSS Distributions
295*0f4c859eSApple OSS Distributions
//	Early-out exit: the only branch to this label in this file is the cbz in
//	L_reverse, taken when destination - source is zero.  No bytes are copied
//	on this path and x0 still holds the destination pointer.
296*0f4c859eSApple OSS DistributionsL_return:
297*0f4c859eSApple OSS Distributions	POP_FRAME                 // macro not defined in this file
298*0f4c859eSApple OSS Distributions	ARM64_STACK_EPILOG        // macro not defined in this file; presumably returns
299