/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 *  All copy n successive bytes from source to destination.  Memmove and memcpy
 *  return destination, whereas bcopy has no return value.  Copying takes place
 *  as if it were through a temporary buffer -- after return destination
 *  contains exactly the bytes from source, even if the buffers overlap (this is
 *  not required of memcpy by the C standard; its behavior is undefined if the
 *  buffers overlap, but we are holding ourselves to the historical behavior of
 *  this function on MacOS).
 */

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

// Copies shorter than kSmallCopy bytes are handled by the simple small-copy
// loops; copies of kSmallCopy bytes or more use the 32-byte block engines.
#define kSmallCopy 64

/*****************************************************************************
 *  Entrypoints                                                              *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
//  Translate bcopy into memcpy by swapping the first and second arguments.
//  Execution then falls through into _memcpy/_memmove below.
	mov     x3,      x0
	mov     x0,      x1
	mov     x1,      x3

.align 4
_memcpy:
_memmove:
//  On entry:
//
//      x0  destination pointer (left untouched, so it is also the return value)
//      x1  source pointer
//      x2  length in bytes
//
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics.  We detect this case with the
//  test:
//
//      destination - source < length    (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
	ARM64_STACK_PROLOG
	PUSH_FRAME
	sub     x3,      x0, x1
	cmp     x3,      x2
	b.cc    L_reverse
	mov     x3,      x0      // copy destination pointer
	cmp     x2,      #(kSmallCopy)
	b.cc    L_forwardSmallCopy

/*****************************************************************************
 *  Forward large copy                                                       *
 *****************************************************************************/

//  Load the first 32 bytes from src, and compute the number of bytes to the
//  first 32-byte aligned location in dst.  Even though we are going to copy
//  32 bytes, only those preceding that 32-byte location "count" towards
//  reducing the length of the buffer or advancing the pointers.  We will need
//  to issue the first load from the advanced src pointer BEFORE the store to
//  the unmodified dst pointer.
	add     x3,      x3, #32
	and     x3,      x3, #-32 // aligned dst
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	sub     x5,      x3, x0   // bytes between original dst and aligned dst
	add     x1,      x1, x5   // update src pointer

//  At this point, data in the following registers is in flight:
//
//      x0      original dst pointer
//      x1      corresponding location in src buffer.
//      x2      original length.  It is reduced by x5 below, after which it is
//              the length from the aligned location in dst to the end of the
//              buffer, guaranteed to be >= (64 - 32).
//      x3      aligned location in dst buffer.
//      x5      bytes between original dst and aligned dst (1 to 32).
//      x12:x15 first 32 bytes of src buffer.
//
//  We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0.  The
//  store *may* overlap the first 32 bytes of the load, so in order to get
//  correct memmove semantics, the first 32 byte load must occur before the
//  store.
//
//  After loading these 32 bytes, we advance x1, and decrement the length by
//  64.  If the remaining length of the buffer was 64 or less, then we jump
//  directly to the cleanup path.
	ldp     x8, x9, [x1]
	ldp     x10,x11,[x1, #16]
	add     x1,      x1, #32
	sub     x2,      x2, x5   // update length
	stp     x12,x13,[x0]      // initial unaligned store
	stp     x14,x15,[x0, #16] // initial unaligned store
	subs    x2,      x2, #64
	b.ls    L_forwardCleanup

L_forwardCopyLoop:
//  Main copy loop:
//
//      1. store the 32 bytes loaded in the previous loop iteration
//      2. advance the destination pointer
//      3. load the next 32 bytes
//      4. advance the source pointer
//      5. subtract 32 from the length
//
//  The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
//  trailing 1-32 bytes will be copied in the loop cleanup.
	stnp    x8, x9, [x3]
	stnp    x10,x11,[x3, #16]
	add     x3,      x3, #32
	ldnp    x8, x9, [x1]
	ldnp    x10,x11,[x1, #16]
	add     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_forwardCopyLoop

L_forwardCleanup:
//  There are 32 bytes in x8-x11 that were loaded in the previous loop
//  iteration, which need to be stored to [x3,x3+32).  In addition, between
//  0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
//  number of bytes to copy is x2 + 32.  Instead of using smaller conditional
//  copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2, so that
//  the copy finishes exactly at the end of the buffer (64+x3+x2).
//  This copy may overlap with the first store, so the loads must come before
//  the store of the data from the previous loop iteration.
	add     x1,      x1, x2
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	stp     x8, x9, [x3]
	stp     x10,x11,[x3, #16]
	add     x3,      x3, x2
	stp     x12,x13,[x3, #32]
	stp     x14,x15,[x3, #48]
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  forward small copy                                                       *
 *****************************************************************************/

//  Copy 8 bytes at a time until fewer than 8 bytes remain to be copied, then
//  copy any remaining bytes one at a time.
//  At the point of entry to L_forwardSmallCopy, the "calling convention"
//  is as follows:
//
//      x0      pointer to first byte of destination
//      x1      pointer to first byte of source
//      x2      length of buffers
//      x3      pointer to first byte of destination
0:	ldr     x6,     [x1],#8
	str     x6,     [x3],#8
L_forwardSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b               // at least 8 bytes were left: copy them
	adds    x2,      x2, #8  // undo the last subtraction; x2 = bytes left (0-7)
	b.eq    2f
1:	ldrb    w6,     [x1],#1
	strb    w6,     [x3],#1
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse copy engines                                                     *
 *****************************************************************************/

//  The reverse copy engines are identical in every way to the forward copy
//  engines, except in that they do everything backwards.  For this reason, they
//  are somewhat more sparsely commented than the forward copy loops.  I have
//  tried to only comment things that might be somewhat surprising in how they
//  differ from the forward implementation.
//
//  The one important thing to note is that (almost without fail), x1 and x3
//  will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
//  throughout these copy loops.  They are initially advanced to that position
//  in the L_reverse jump island.  Because of this, whereas the forward copy
//  loops generally follow a "copy data, then advance pointers" scheme, in the
//  reverse copy loops, we advance the pointers, then copy the data.

L_reverse:
//  As a minor optimization, we early out if dst == src.  (x3 still holds
//  dst - src from the direction test at the entry point.)
	cbz     x3,      L_return
//  advance both pointers to the ends of their respective buffers before
//  jumping into the appropriate reverse copy loop.
	add     x4,      x0, x2
	add     x1,      x1, x2
	cmp     x2,      #(kSmallCopy)
	b.cc    L_reverseSmallCopy

/*****************************************************************************
 *  Reverse large copy                                                       *
 *****************************************************************************/

	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	sub     x3,      x4, #1   // In the forward copy, we used dst+32 & -32
	and     x3,      x3, #-32 // to find an aligned location in the dest
	sub     x5,      x4, x3   // buffer.  Here we use dst-1 & -32 instead,
	sub     x1,      x1, x5   // because we are going backwards.
	sub     x2,      x2, x5
	ldp     x8, x9, [x1, #-16]
	ldp     x10,x11,[x1, #-32]
	stp     x12,x13,[x4, #-16]
	stp     x14,x15,[x4, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #64
	b.ls    L_reverseCleanup

L_reverseCopyLoop:
	stnp    x8, x9, [x3, #-16]
	stnp    x10,x11,[x3, #-32]
	sub     x3,      x3, #32
	ldnp    x8, x9, [x1, #-16]
	ldnp    x10,x11,[x1, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_reverseCopyLoop

L_reverseCleanup:
	sub     x1,      x1, x2
	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	stp     x8, x9, [x3, #-16]
	stp     x10,x11,[x3, #-32]
	stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
	stp     x14,x15,[x0]      // address of these stores, but here we already
	POP_FRAME                 // have a pointer to the start of the buffer.
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  reverse small copy                                                       *
 *****************************************************************************/

//  Mirror image of the forward small copy: x1 and x4 point one byte beyond the
//  end of the source and destination, and are pre-decremented on each access.
0:	ldr     x6,     [x1,#-8]!
	str     x6,     [x4,#-8]!
L_reverseSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b
	adds    x2,      x2, #8
	b.eq    2f
1:	ldrb    w6,     [x1,#-1]!
	strb    w6,     [x4,#-1]!
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG


//  Early-out target used by L_reverse when dst == src; nothing is copied.
L_return:
	POP_FRAME
	ARM64_STACK_EPILOG