// xref: /xnu-10002.1.13/osfmk/arm64/corecrypto/sha256_compress_arm64.s (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
/*
 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
	This file provides an armv8+neon hand implementation of the following function:

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	sha256 algorithm per-block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of the message schedule W(r:r+15) is updated and stored in q0-q3
		- its corresponding W+K(r:r+15) is updated and stored in a stack-space circular buffer
		- the 8 digests (a-h) will be stored in GPRs or memory

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre-calculate and store W+K(0:15) on the stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre-calculate and store W+K(0:15) on the stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0)	jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
		pre-calculate and store W+K([r:r+3]%16) on the stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"

.subsections_via_symbols
    .text

    .p2align  4

K256:
    // SHA-256 round constants K[0..63] (FIPS 180-4, sec. 4.2.2): first 32 bits
    // of the fractional parts of the cube roots of the first 64 primes.
    // Consumed 4-at-a-time as 128-bit vectors via `ldr qN, [ktable, ...]`.
    .long   0x428a2f98
    .long   0x71374491
    .long   0xb5c0fbcf
    .long   0xe9b5dba5
    .long   0x3956c25b
    .long   0x59f111f1
    .long   0x923f82a4
    .long   0xab1c5ed5
    .long   0xd807aa98
    .long   0x12835b01
    .long   0x243185be
    .long   0x550c7dc3
    .long   0x72be5d74
    .long   0x80deb1fe
    .long   0x9bdc06a7
    .long   0xc19bf174
    .long   0xe49b69c1
    .long   0xefbe4786
    .long   0x0fc19dc6
    .long   0x240ca1cc
    .long   0x2de92c6f
    .long   0x4a7484aa
    .long   0x5cb0a9dc
    .long   0x76f988da
    .long   0x983e5152
    .long   0xa831c66d
    .long   0xb00327c8
    .long   0xbf597fc7
    .long   0xc6e00bf3
    .long   0xd5a79147
    .long   0x06ca6351
    .long   0x14292967
    .long   0x27b70a85
    .long   0x2e1b2138
    .long   0x4d2c6dfc
    .long   0x53380d13
    .long   0x650a7354
    .long   0x766a0abb
    .long   0x81c2c92e
    .long   0x92722c85
    .long   0xa2bfe8a1
    .long   0xa81a664b
    .long   0xc24b8b70
    .long   0xc76c51a3
    .long   0xd192e819
    .long   0xd6990624
    .long   0xf40e3585
    .long   0x106aa070
    .long   0x19a4c116
    .long   0x1e376c08
    .long   0x2748774c
    .long   0x34b0bcb5
    .long   0x391c0cb3
    .long   0x4ed8aa4a
    .long   0x5b9cca4f
    .long   0x682e6ff3
    .long   0x748f82ee
    .long   0x78a5636f
    .long   0x84c87814
    .long   0x8cc70208
    .long   0x90befffa
    .long   0xa4506ceb
    .long   0xbef9a3f7
    .long   0xc67178f2


    .p2align  4

	.globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:

	// C-equivalent:
	//   void AccelerateCrypto_SHA256_compress(uint32_t state[8], size_t num_blocks, const void *data);
	//
	// Uses the ARMv8 SHA-256 crypto extension. SHA256H / SHA256H2 / SHA256SU0 /
	// SHA256SU1 are macros from arm64_isa_compatibility.h that take v-register
	// *indices* (bare numbers / macro args), not register names.
	//
	// Register roles:
	//   x0 hashes     : in/out hash state (8 x u32)
	//   x1 numblocks  : number of 64-byte blocks
	//   x2 data       : input message
	//   x3 ktable     : cursor into the K256 table
	//   x4            : scratch (save-area pointer, loop counter w4)
	//   v0-v3         : 16-word message-schedule window W
	//   v4-v7         : W+K vectors for the in-flight rounds
	//   v16,v17       : state at block entry (q16: d,c,b,a / q17: h,g,f,e)
	//   v18,v19       : working state updated by the rounds
	//   v20           : copy of v18 taken before SHA256H, consumed by SHA256H2
	//   v21-v24       : current 4 vectors of K256 constants
	#define	hashes		x0
	#define	numblocks	x1
	#define	data		x2
	#define	ktable		x3

#ifdef __ILP32__
    uxtw    numblocks, numblocks        // in arm64_32 size_t is 32-bit, so we need to extend it
#endif


	adrp	ktable, K256@page
	cbnz	numblocks, 1f						// if number of blocks is nonzero, go on for sha256 transform operation
	ret		lr							// otherwise, return
1:
	add		ktable, ktable, K256@pageoff

#if BUILDKERNEL
	// In kernel mode the SIMD registers are not saved for us: spill the ones we
	// clobber — q0-q7 and q16-q24 (8 + 9 = 17 registers, 17*16 bytes).
	sub		x4, sp, #17*16
	sub		sp, sp, #17*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
	st1.4s	{v24}, [x4], #16
#endif

	ld1.4s	{v0,v1,v2,v3}, [data], #64			// w0,w1,w2,w3 need to bswap into big-endian

	// Byte-swap the message words, interleaved with loading K[0..15] and the
	// current hash state to hide load latency.
    rev32.16b	v0, v0					// byte swap of 1st 4 ints
    ldr         q21, [ktable, #16*0]
    rev32.16b	v1, v1					// byte swap of 2nd 4 ints
    ldr         q16, [hashes, #0]
    rev32.16b	v2, v2					// byte swap of 3rd 4 ints
    ldr         q17, [hashes, #16]
    rev32.16b	v3, v3					// byte swap of 4th 4 ints
    ldr         q22, [ktable, #16*1]

	mov.16b		v18, v16				// v18/v19 = working copy of the state
    ldr         q23, [ktable, #16*2]
    add.4s		v4, v0, v21				// 1st 4 input + K256
    ldr         q24, [ktable, #16*3]
    add.4s		v5, v1, v22				// 2nd 4 input + K256
	mov.16b		v19, v17
    add.4s		v6, v2, v23				// 3rd 4 input + K256
    add.4s		v7, v3, v24				// 4th 4 input + K256
    add         ktable, ktable, #16*4	// advance past K[0..15]


	// 4 rounds of digest update + message-schedule expansion.
	//   $0,$1,$2,$3 : W window register indices fed to SHA256SU0/SHA256SU1
	//   $4          : register index of the W+K vector for these 4 rounds
	//   $5,$7 -> $6 : compute the next W+K vector ($6 = $5 + $7)
	.macro	sha256_round
	mov.16b		v20, v18				// SHA256H2 needs the pre-SHA256H state
	SHA256SU0	$0, $1
	SHA256H		18, 19, $4
	SHA256SU1	$0, $2, $3
	SHA256H2	19, 20, $4
	add.4s		$6, $5, $7
	.endm

	// 4 vector hashes update and load next vector rounds:
	//   $0      : register index of the W+K vector for these 4 rounds
	//   $1      : just-loaded W vector of the NEXT block, byte-swapped in place
	//   $2 = $1 + $3 : that vector's W+K (with $3 = K constants)
	.macro	sha256_hash_load_round
	mov.16b		v20, v18
	SHA256H		18, 19, $0
    rev32.16b	$1, $1
	SHA256H2	19, 20, $0
    add.4s		$2, $1, $3
	.endm

	// Plain 4 rounds of digest update; $0 = W+K register index.
	.macro	sha256_hash_round
	mov.16b		v20, v18
	SHA256H		18, 19, $0
	SHA256H2	19, 20, $0
	.endm

	// Rounds 0..47 of the first block: 3 iterations x 16 rounds, each iteration
	// also expanding the schedule and refreshing v4-v7 with the next W+K.
    mov         w4, #3
L_i_loop:
    mov.16b		v20, v18
	ldr         q21, [ktable, #0]		// k0
	SHA256SU0	0, 1
	ldr         q22, [ktable, #16]		// k1
	SHA256H		18, 19, 4
	ldr         q23, [ktable, #32]		// k2
	SHA256SU1	0, 2, 3
	ldr         q24, [ktable, #48]		// k3
	SHA256H2	19, 20, 4
    add         ktable, ktable, #64
	add.4s		v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
    subs            w4, w4, #1
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
    b.gt            L_i_loop

	subs 		numblocks, numblocks, #1	// pre-decrement num_blocks by 1
	b.le		L_wrapup					// single block: finish rounds 48..63 without prefetch

	sub			ktable, ktable, #256		// rewind cursor to the start of K256

L_loop:
	// Rounds 48..63 of the previous block, pipelined with loading and
	// byte-swapping the next block's 16 words and computing their W+K.
    ldr	        q0, [data, #0]
	mov.16b		v20, v18
    ldr         q21, [ktable,#0]
	SHA256H		18, 19, 4
	ldr	        q1, [data, #16]
    rev32.16b	v0, v0
	ldr	        q2, [data, #32]
	SHA256H2	19, 20, 4
	ldr	        q3, [data, #48]
    add.4s		v4, v0, v21

    ldr         q22, [ktable,#16]
	mov.16b		v20, v18
    add         data, data, #64
	SHA256H		18, 19, 5
    ldr         q23, [ktable,#32]
    rev32.16b	v1, v1
    ldr         q24, [ktable,#48]
	SHA256H2	19, 20, 5
    add.4s		v5, v1, v22

	sha256_hash_load_round	6, v2, v6, v23
	sha256_hash_load_round	7, v3, v7, v24

	// ctx->state += digests; the sum becomes both the stored state (v16/v17)
	// and the working state (v18/v19) for the block just loaded.
	add.4s		v18, v16, v18
	add.4s		v19, v17, v19
	mov.16b		v16, v18
	mov.16b		v17, v19

	// Rounds 0..47 of the current block: 3 unrolled groups of 16 rounds
	// (same body as L_i_loop, with explicit K256 offsets instead of a moving
	// cursor — ktable stays at the K256 base inside L_loop).
    mov.16b		v20, v18
	ldr         q21, [ktable, #16*4]		// k0
	SHA256SU0	0, 1
	ldr         q22, [ktable, #16*5]		// k1
	SHA256H		18, 19, 4
	ldr         q23, [ktable, #16*6]		// k2
	SHA256SU1	0, 2, 3
	ldr         q24, [ktable, #16*7]		// k3
	SHA256H2	19, 20, 4
	add.4s		v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
    mov.16b		v20, v18
	ldr         q21, [ktable, #16*8]		// k0
	SHA256SU0	0, 1
	ldr         q22, [ktable, #16*9]		// k1
	SHA256H		18, 19, 4
	ldr         q23, [ktable, #16*10]		// k2
	SHA256SU1	0, 2, 3
	ldr         q24, [ktable, #16*11]		// k3
	SHA256H2	19, 20, 4
	add.4s		v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

    mov.16b		v20, v18
	ldr         q21, [ktable, #16*12]		// k0
	SHA256SU0	0, 1
	ldr         q22, [ktable, #16*13]		// k1
	SHA256H		18, 19, 4
	ldr         q23, [ktable, #16*14]		// k2
	SHA256SU1	0, 2, 3
	ldr         q24, [ktable, #16*15]		// k3
	SHA256H2	19, 20, 4
	add.4s		v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	subs 		numblocks, numblocks, #1	// pre-decrement num_blocks by 1
	b.gt		L_loop

L_wrapup:
	// Rounds 48..63 of the final block; v4-v7 already hold the W+K values.
	sha256_hash_round	4
	sha256_hash_round	5
	sha256_hash_round	6
	sha256_hash_round	7

	add.4s		v16, v16, v18
	add.4s		v17, v17, v19
	st1.4s		{v16,v17}, [hashes]					// hashes q16 : d,c,b,a   q17 : h,g,f,e

#if BUILDKERNEL
	// restore q0-q7, q16-q24 (mirrors the save at function entry)
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
	ld1.4s	{v24}, [sp], #16
#endif

	ret		lr


#endif		// arm64
