/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * input :
 *	src  : source starting address
 *	dst  : destination starting address
 *	len  : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer;
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); the caller is
 *	responsible for folding the 32-bit sum into 16 bits and for
 *	performing the 1's complement if applicable (an illustrative
 *	caller-side sketch appears at the end of this file)
 */

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define	LITTLE_ENDIAN	1
#define	BYTE_ORDER	LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define	SAVE_REGISTERS	0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

#define	src		x0
#define	dst		x1
#define	len		x2
#define	sum		x3
#define	need_swap	x5
#define	t		x6
#define	partial		x7
#define	wpartial	w7

	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// needs_swap = 0;

	cbz	len, L_len_0

/*
 * Deal with an odd-addressed byte: use w7 to store the temporary sum,
 * depositing this byte into the high byte of the 16-bit word in w7.
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;
 *		*dst++ = *src++;
 *		--len;
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we always use v0-v3, and also v4-v7/v16-v19 if len >= 128,
	 * so allocate 12*16 bytes on the stack, store v0-v3 now, and
	 * keep x11 as the pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif
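
	/*
	 * The bulk of the copy below runs through the NEON registers.
	 * Since the checksum is a wrap-around sum, the 32-bit lanes can
	 * be accumulated into wider 64-bit lanes and folded down to
	 * 16 bits only once at the end.  Roughly, each "uadalp.2d vD, vS"
	 * step used below behaves like, per 64-bit lane i of vD:
	 *
	 *	vD.d[i] += (uint64_t)vS.s[2*i] + (uint64_t)vS.s[2*i + 1];
	 */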

	/*
	 * pre-decrement len by 8*16, and if there are fewer than 8*16
	 * bytes, try 4*16 bytes next.
	 * v0,v1 will store the temp result after we exit the L128 loop
	 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to 1st 64-bit lane in v0
	b.lt	L64_bytes

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load the 1st 4 vectors, and clear v0-v3
	 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to finish off if len < 128 */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

	/*
	 * loop for loading and accumulating 32 32-bit words into 8 8-byte
	 * accumulators per iteration
	 */
L128_loop:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop

L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16

	add.2d	v0, v0, v2
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes

	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

L16_bytes:
	add.2d	v0, v0, v1
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1, [src], #8
	str	d1, [dst], #8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2, [src], #4
	str	s2, [dst], #4

L2_bytes:
	uadalp.2d	v0, v1
	eor.16b	v1, v1, v1
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3, [src], #2
	str	h3, [dst], #2
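
	/*
	 * The final odd byte, if any, is copied and accumulated through v1
	 * below; on big-endian targets it is first shifted into the high
	 * byte, matching the position it would occupy in a full 16-bit load.
	 */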
L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1, [src], #1
	str	b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3
	uadalp.2d	v0, v1
	uadalp.2d	v0, v2

	addp.2d	d0, v0
	fmov	partial, d0

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * return (~final_acc & 0xffff);
	 *
	 * mvn	w0, w0
	 * and	w0, w0, #0xffff
	 */

	ret	lr
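
/*
 * For reference, a minimal caller-side sketch of the folding and 1's
 * complement that this routine leaves to its caller, as noted in the
 * header comment (the variable names below are illustrative, not taken
 * from any particular caller):
 *
 *	uint32_t sum = os_cpu_copy_in_cksum(src, dst, len, sum0);
 *
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold carries into 16 bits
 *	sum += (sum >> 16);			// absorb a possible carry
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */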