1/* 2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#ifdef KERNEL 30#include <arm64/asm.h> 31#endif /* KERNEL */ 32 33/* 34 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst, 35 * uint32_t len, uint32_t sum0); 36 * 37 * input : 38 * src : source starting address 39 * dst : destination starting address 40 * len : byte stream length 41 * sum0 : initial 32-bit sum 42 * 43 * output : 44 * the source byte stream is copied into the destination buffer 45 * the function returns the partial 16-bit checksum accumulated 46 * in a 32-bit variable (without 1's complement); caller is 47 * responsible for folding the 32-bit sum into 16-bit and 48 * performing the 1's complement if applicable 49 */ 50 51/* 52 * The following definitions default the implementation to little-endian 53 * architectures. 54 */ 55#define LITTLE_ENDIAN 1 56#define BYTE_ORDER LITTLE_ENDIAN 57 58/* 59 * ARM64 kernel mode -- just like user mode -- no longer requires saving 60 * the vector registers, since it's done by the exception handler code. 
 */
#define SAVE_REGISTERS 0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4

/*
 * uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0)
 *
 * Register roles (see #defines below):
 *   x0/src  - source pointer (consumed; x0 is reused for the return value)
 *   x1/dst  - destination pointer (consumed)
 *   x2/len  - bytes remaining
 *   x3/sum  - caller's initial 32-bit sum (folded in at the end)
 *   x5      - need_swap: set when src was odd-aligned, so the 16-bit
 *             accumulation ran byte-swapped and must be rotated back
 *   x6      - scratch for the fold sequences
 *   x7/w7   - 64-bit partial sum extracted from the vector accumulators
 *
 * Strategy: bulk data is copied and summed 128 bytes per iteration with
 * NEON, accumulating 32-bit lanes into four 2x64-bit accumulators
 * (v0-v3) via uadalp; 64-bit lanes cannot overflow for any 32-bit len.
 * Tails of 64/32/16 bytes use the same uadalp scheme, and the final
 * 0-15 bytes are moved through d/s/h/b vector loads so they enter the
 * same accumulators.  The result is folded 64->32->16 bits without the
 * final 1's complement (caller's responsibility, per the API comment).
 */
_os_cpu_copy_in_cksum:

#define src	x0
#define dst	x1
#define len	x2
#define sum	x3
#define need_swap x5
#define t	x6
#define partial	x7
#define wpartial w7

#ifdef KERNEL
	ARM64_PROLOG
#endif /* KERNEL */
	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// needs_swap = 0;

	cbz	len, L_len_0		// len == 0: fold sum0 alone and return

/*
 * Deal with odd-addressed byte, use w7 to store temporary sum, deposit this
 * byte to high byte of 16-bit in w7
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;
 *		*dst++ = *src++;
 *		--len;
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8	// odd byte belongs in the high half of the 16-bit word
#endif
	sub	len, len, #1
	mov	need_swap, #1		// remember to rotate the final sum back
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we will always use v0-v3, and v4-v7/v16-v19 if len>=128
	 * so allocate 12*16 bytes in the stack, and store v0-v3 now,
	 * keep x11 as the pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif

	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes, try
	 * 4*16 bytes next.
	 * v0,v1 will store temp result after we exit the L128 loop
	 */
	eor.16b	v0, v0, v0		// clear accumulator v0
	eor.16b	v1, v1, v1		// clear accumulator v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to 1st 64b lane in v0
	b.lt	L64_bytes

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load 1st 4 vectors, and clear v0-v3
	 * (first 128-byte group is loaded here, outside the loop, so the
	 * loop body can overlap this iteration's stores/accumulates with
	 * the next iteration's loads — software pipelining)
	 */
	ldr	q4, [src], #8*16	// post-advance src past the whole 128-byte group
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]	// remaining 7 vectors via negative offsets
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to finish off if len<128 */
	subs	len, len, #2*8*16	// account for the group in flight plus one more
	b.lt	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into 8 8-byte
	 * accumulators per iteration
	 * invariant at loop top: q4-q7/q16-q19 hold the current 128 bytes,
	 * already consumed from src; stores drain them to dst while uadalp
	 * folds their 32-bit lanes into v0-v3, then the next group is loaded
	 */
L128_loop:
	str	q4, [dst], #16*8	// post-advance dst past the 128-byte group
	uadalp.2d	v0, v4		// v0.2d += pairwise sums of v4.4s
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8	// begin loading the next group
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop

	/* drain the final in-flight 128-byte group: store and accumulate,
	 * but load nothing further */
L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16		// undo the extra pre-decrement; len = true remainder

	add.2d	v0, v0, v2		// collapse the four accumulators to two
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/* 64-byte tail: two 32-byte halves through v2/v3 */
L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes

	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

	/* 32-byte tail */
L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

	/* 16-byte tail; v1 is merged into v0 here so the sub-16 path below
	 * can reuse v1-v3 as zeroed staging registers */
L16_bytes:
	add.2d	v0, v0, v1
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

	/*
	 * 0-15 bytes remain; move them through narrow vector loads
	 * (d = 8 bytes into v1, s = 4 bytes into v2, h = 2 bytes into v3,
	 * b = 1 byte into v1 after v1 has been folded) — the unused upper
	 * lanes stay zero, so uadalp folds them in for free
	 */
L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1,[src],#8
	str	d1,[dst],#8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2,[src],#4
	str	s2,[dst],#4

L2_bytes:
	uadalp.2d	v0, v1		// fold the 8-byte piece now, freeing v1 for the last byte
	eor.16b	v1, v1, v1
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3,[src],#2
	str	h3,[dst],#2

L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1,[src],#1
	str	b1,[dst],#1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3		// merge 2-byte piece into 4-byte piece
	uadalp.2d	v0, v1		// merge final odd byte
	uadalp.2d	v0, v2		// merge remaining tail pieces
	addp.2d	d0, v0			// add the two 64-bit lanes of v0
	fmov	partial, d0		// move the 64-bit running sum to x7

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 * (byte-rotate: the odd starting byte made every 16-bit word of the
	 * accumulation byte-swapped; 1's-complement sums are rotation-safe)
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * return (~final_acc & 0xffff);
	 *
	 * NOTE: deliberately NOT done here — the caller performs the 1's
	 * complement (see the API comment at the top of the file):
	 *
	 *	mvn	w0, w0
	 *	and	w0, w0, #0xffff
	 */

	ret	lr