/*
 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
	This file provides an armv8+neon hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or memory

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"

.subsections_via_symbols
	.text

	.p2align	4

// SHA-256 round constants K[0..63] (FIPS 180-4), read 4-at-a-time as q-vectors.
K256:
	.long	0x428a2f98
	.long	0x71374491
	.long	0xb5c0fbcf
	.long	0xe9b5dba5
	.long	0x3956c25b
	.long	0x59f111f1
	.long	0x923f82a4
	.long	0xab1c5ed5
	.long	0xd807aa98
	.long	0x12835b01
	.long	0x243185be
	.long	0x550c7dc3
	.long	0x72be5d74
	.long	0x80deb1fe
	.long	0x9bdc06a7
	.long	0xc19bf174
	.long	0xe49b69c1
	.long	0xefbe4786
	.long	0x0fc19dc6
	.long	0x240ca1cc
	.long	0x2de92c6f
	.long	0x4a7484aa
	.long	0x5cb0a9dc
	.long	0x76f988da
	.long	0x983e5152
	.long	0xa831c66d
	.long	0xb00327c8
	.long	0xbf597fc7
	.long	0xc6e00bf3
	.long	0xd5a79147
	.long	0x06ca6351
	.long	0x14292967
	.long	0x27b70a85
	.long	0x2e1b2138
	.long	0x4d2c6dfc
	.long	0x53380d13
	.long	0x650a7354
	.long	0x766a0abb
	.long	0x81c2c92e
	.long	0x92722c85
	.long	0xa2bfe8a1
	.long	0xa81a664b
	.long	0xc24b8b70
	.long	0xc76c51a3
	.long	0xd192e819
	.long	0xd6990624
	.long	0xf40e3585
	.long	0x106aa070
	.long	0x19a4c116
	.long	0x1e376c08
	.long	0x2748774c
	.long	0x34b0bcb5
	.long	0x391c0cb3
	.long	0x4ed8aa4a
	.long	0x5b9cca4f
	.long	0x682e6ff3
	.long	0x748f82ee
	.long	0x78a5636f
	.long	0x84c87814
	.long	0x8cc70208
	.long	0x90befffa
	.long	0xa4506ceb
	.long	0xbef9a3f7
	.long	0xc67178f2


	.p2align	4

//-----------------------------------------------------------------------------
// AccelerateCrypto_SHA256_compress
//
// In (AAPCS64):
//   x0 (hashes)    = pointer to the 8 x 32-bit chaining state words (read at
//                    entry into q16/q17, updated and stored back at exit)
//   x1 (numblocks) = number of 64-byte input blocks to process (returns
//                    immediately if 0; under __ILP32__ it is zero-extended
//                    from 32 bits first)
//   x2 (data)      = pointer to the input blocks; 64 bytes consumed per block,
//                    byte-swapped to big-endian word order with rev32
//   x3 (ktable)    = scratch; holds the address of K256
//
// Uses the armv8 SHA-256 crypto instructions (SHA256H/SHA256H2/SHA256SU0/
// SHA256SU1, via macros from arm64_isa_compatibility.h).  Register roles:
//   v0-v3   : message-schedule window W(r:r+15)
//   v4-v7   : W+K for the 4 rounds in flight
//   v16/v17 : chaining value loaded from *hashes
//   v18/v19 : working digest state; v20 is a per-round copy needed because
//             SHA256H/SHA256H2 each overwrite one state register
//   v21-v24 : K256 constants for the current 16 rounds
// When BUILDKERNEL is set, all vector registers touched are saved/restored
// on the stack since the kernel does not preserve SIMD state for us.
//-----------------------------------------------------------------------------
	.globl	_AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:


	#define hashes		x0
	#define numblocks	x1
	#define data		x2
	#define ktable		x3

#ifdef	__ILP32__
	uxtw	numblocks, numblocks	// in arm64_32 size_t is 32-bit, so we need to extend it
#endif


	adrp	ktable, K256@page
	cbnz	numblocks, 1f		// if number of blocks is nonzero, go on for sha256 transform operation
	ret	lr			// otherwise, return
1:
	add	ktable, ktable, K256@pageoff

#if BUILDKERNEL
	// save q0-q7 and q16-q24 : 8+8+1 = 17 vector registers (17*16 bytes)
	sub	x4, sp, #17*16
	sub	sp, sp, #17*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
	st1.4s	{v24}, [x4], #16
#endif

	ld1.4s	{v0,v1,v2,v3}, [data], #64	// w0,w1,w2,w3 need to bswap into big-endian

	rev32.16b	v0, v0			// byte swap of 1st 4 ints
	ldr		q21, [ktable, #16*0]
	rev32.16b	v1, v1			// byte swap of 2nd 4 ints
	ldr		q16, [hashes, #0]
	rev32.16b	v2, v2			// byte swap of 3rd 4 ints
	ldr		q17, [hashes, #16]
	rev32.16b	v3, v3			// byte swap of 4th 4 ints
	ldr		q22, [ktable, #16*1]

	mov.16b	v18, v16			// working digest state := chaining value
	ldr		q23, [ktable, #16*2]
	add.4s	v4, v0, v21			// 1st 4 input + K256
	ldr		q24, [ktable, #16*3]
	add.4s	v5, v1, v22			// 2nd 4 input + K256
	mov.16b	v19, v17
	add.4s	v6, v2, v23			// 3rd 4 input + K256
	add.4s	v7, v3, v24			// 4th 4 input + K256
	add		ktable, ktable, #16*4


	// 4 digest-update rounds plus message-schedule update for a future
	// group of 4 rounds:
	//   $0,$1,$2,$3 = W window regs (SU0/SU1), $4 = W+K reg number for SHA256H,
	//   $5 = new W vector, $6 = new W+K destination, $7 = K constants
	.macro	sha256_round
	mov.16b	v20, v18
	SHA256SU0	$0, $1
	SHA256H		18, 19, $4
	SHA256SU1	$0, $2, $3
	SHA256H2	19, 20, $4
	add.4s		$6, $5, $7
	.endm

	// 4 vector hashes update and load next vector rounds
	.macro	sha256_hash_load_round
	mov.16b	v20, v18
	SHA256H		18, 19, $0
	rev32.16b	$1, $1
	SHA256H2	19, 20, $0
	add.4s		$2, $1, $3
	.endm

	// 4 digest-update rounds only (no schedule update; used for rounds 48-63
	// of the final block)
	.macro	sha256_hash_round
	mov.16b	v20, v18
	SHA256H		18, 19, $0
	SHA256H2	19, 20, $0
	.endm

	// 12 vector hash and sequence update rounds
	// (3 iterations x 16 rounds = rounds 0-47; the first 4-round group is
	// written out inline so the K-table loads can be interleaved)
	mov	w4, #3
L_i_loop:
	mov.16b	v20, v18
	ldr		q21, [ktable, #0]	// k0
	SHA256SU0	0, 1
	ldr		q22, [ktable, #16]	// k1
	SHA256H		18, 19, 4
	ldr		q23, [ktable, #32]	// k2
	SHA256SU1	0, 2, 3
	ldr		q24, [ktable, #48]	// k3
	SHA256H2	19, 20, 4
	add		ktable, ktable, #64
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	subs		w4, w4, #1
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
	b.gt		L_i_loop

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1
	b.le	L_wrapup			// last block: finish rounds 48-63 without loading more data

	sub	ktable, ktable, #256		// rewind ktable to K256 base (L_i_loop advanced it 16*16 bytes)

L_loop:

	// rounds 48-51 of the current block, interleaved with loading and
	// byte-swapping the NEXT block's W(0:3) and computing its W+K
	ldr		q0, [data, #0]
	mov.16b		v20, v18
	ldr		q21, [ktable,#0]
	SHA256H		18, 19, 4
	ldr		q1, [data, #16]
	rev32.16b	v0, v0
	ldr		q2, [data, #32]
	SHA256H2	19, 20, 4
	ldr		q3, [data, #48]
	add.4s		v4, v0, v21

	// rounds 52-55, next block's W(4:7)+K
	ldr		q22, [ktable,#16]
	mov.16b		v20, v18
	add		data, data, #64
	SHA256H		18, 19, 5
	ldr		q23, [ktable,#32]
	rev32.16b	v1, v1
	ldr		q24, [ktable,#48]
	SHA256H2	19, 20, 5
	add.4s		v5, v1, v22

	// rounds 56-63, next block's W(8:15)+K
	sha256_hash_load_round	6, v2, v6, v23
	sha256_hash_load_round	7, v3, v7, v24

	// chaining: state += digests, and the sum becomes both the new stored
	// chaining value (v16/v17) and the next block's working state (v18/v19)
	add.4s	v18, v16, v18
	add.4s	v19, v17, v19
	mov.16b	v16, v18
	mov.16b	v17, v19

	// 12 vector hash and sequence update rounds
	// (rounds 0-11 of the new block; unrolled with fixed K-table offsets)
	mov.16b	v20, v18
	ldr		q21, [ktable, #16*4]	// k0
	SHA256SU0	0, 1
	ldr		q22, [ktable, #16*5]	// k1
	SHA256H		18, 19, 4
	ldr		q23, [ktable, #16*6]	// k2
	SHA256SU1	0, 2, 3
	ldr		q24, [ktable, #16*7]	// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
	mov.16b	v20, v18
	ldr		q21, [ktable, #16*8]	// k0
	SHA256SU0	0, 1
	ldr		q22, [ktable, #16*9]	// k1
	SHA256H		18, 19, 4
	ldr		q23, [ktable, #16*10]	// k2
	SHA256SU1	0, 2, 3
	ldr		q24, [ktable, #16*11]	// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	mov.16b	v20, v18
	ldr		q21, [ktable, #16*12]	// k0
	SHA256SU0	0, 1
	ldr		q22, [ktable, #16*13]	// k1
	SHA256H		18, 19, 4
	ldr		q23, [ktable, #16*14]	// k2
	SHA256SU1	0, 2, 3
	ldr		q24, [ktable, #16*15]	// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1
	b.gt	L_loop

L_wrapup:

	// rounds 48-63 of the final block (W+K for these rounds already sits in v4-v7)
	sha256_hash_round	4
	sha256_hash_round	5
	sha256_hash_round	6
	sha256_hash_round	7

	add.4s	v16, v16, v18			// final chaining: state += digests
	add.4s	v17, v17, v19
	st1.4s	{v16,v17}, [hashes]		// hashes q16 : d,c,b,a  q17 : h,g,f,e

#if BUILDKERNEL
	// restore q0-q7 and q16-q24 (the 17 registers saved at entry)
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
	ld1.4s	{v24}, [sp], #16
#endif

	ret	lr


#endif	// arm64