1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from the OpenSSL project but the author (Andy Polyakov)
5# has relicensed it under the GPLv2. Therefore this program is free software;
6# you can redistribute it and/or modify it under the terms of the GNU General
7# Public License version 2 as published by the Free Software Foundation.
8#
9# The original headers, including the original license headers, are
10# included below for completeness.
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see http://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18
19# SHA256 block procedure for ARMv4. May 2007.
20
21# Performance is ~2x better than gcc 3.4 generated code and in "abso-
22# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23# byte [on single-issue Xscale PXA250 core].
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 22% improvement on
28# Cortex A8 core and ~20 cycles per processed byte.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35# September 2013.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process one
38# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40# code (meaning that latter performs sub-optimally, nothing was done
41# about it).
42
43# May 2014.
44#
45# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# Consume command-line arguments until one looks like an output file name
# ("name.ext") or the argument list is exhausted.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when a file name was actually found; with no name the
# generated assembly keeps going to the STDOUT inherited from the caller.
# A three-argument open is used and a failure is fatal, so an unwritable
# destination no longer goes unnoticed.
if (defined($output) && $output ne "") {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
49
# Register names used by the integer-only code path.  Several Perl names
# alias the same physical register: r0-r2 are read as $ctx/$inp/$len at the
# start of the generated function and, once those values have been saved on
# the stack, the same registers (plus r3) serve as scratch $t0/$t4/$t1/$t3.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
# The eight working variables a..h.  The round generators rotate this list
# after every round, so the register *names* move instead of the data.
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Shift amounts.  @Sigma0/@Sigma1 are applied as three rotates (to a and e
# respectively); @sigma0/@sigma1 are applied to message-schedule words as
# two rotates followed by one logical shift right (the third element).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
70
# Append the assembly for one SHA-256 round of the integer-only path to $code.
#   $i       round index.  For $i<16 the first template is emitted as well:
#            it assembles the big-endian input word in $t1 (a single ldr+rev
#            on ARMv7 and later, four byte loads otherwise) and starts
#            Sigma1(e).  For $i>=16 the caller (BODY_16_XX) has already left
#            X[i] in $t1 and the Sigma1(e) partial result in $t0.
#   $a..$h   names of the registers currently holding the working variables.
# Lines such as "#if $i==31" inside the templates are C preprocessor
# conditionals with the Perl value interpolated; they are resolved when the
# generated file is preprocessed, not here.
# Adding Maj(a,b,c) is deferred to the following round ("from the past"),
# which is why $t2 and $t3 swap roles at the end of every call.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Input-word load and start of Sigma1(e); only for the first 16 rounds.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
# Common part of every round: store X[i] in the 16-word stack window, add
# K256[i], Sigma1(e) and Ch(e,f,g) to h, compute Sigma0(a) and Maj(a,b,c),
# and pre-load what the next round needs.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# Swap the two temporaries: this round's a^b becomes next round's b^c,
	# and the Maj value just computed is what the next round adds first.
	($t2,$t3)=($t3,$t2);
}
138
# Append the assembly for one round with index 16 or above.
# Takes the same arguments as BODY_00_15 and ends by calling it.
# The template emitted here performs the message-schedule update on the
# 16-word stack window (all indices taken modulo 16):
#   X[i] = X[i] + sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
# X[i+1] and X[i+14] are expected in $t1 and $t4, loaded by the previous
# round (see the commented-out ldr lines).  The result is left in $t1 and
# the start of Sigma1(e) in $t0, which is what BODY_00_15 expects for $i>=16.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	# Emit the round proper with the unchanged argument list.
	&BODY_00_15(@_);
}
163
# Start of the generated file: architecture selection, the K256 round-constant
# table (followed by a zero terminator word), and the prologue of the
# integer-only sha256_block_data_order.  The prologue saves ctx, inp and the
# end-of-input pointer on the stack and reserves 16 words for X[].
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll rounds 0..15, then one 16-round group (indices 16..31) under the
# label .Lrounds_16_xx.  @V is rotated after each round so that the register
# names shift instead of the data.  The second group is executed repeatedly
# at run time: round 31 compares the low byte of the constant just loaded
# with 0xf2 (the low byte of the last K256 word) and the code that follows
# branches back to .Lrounds_16_xx until that constant has been reached.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue of the integer path: add the working variables to the context,
# store it back, loop over the remaining input blocks and return.
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
287######################################################################
288# NEON stuff
289#
290{{{
# NEON register assignment for this section.
my @X=map("q$_",(0..3));	# q0-q3: the four-word groups of the message block/schedule
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");	# scratch: four q and two d registers
my $Xfer=$t4;			# pointer through which X[i]+K[i] sums are stored to the stack
my $j=0;			# round counter, advanced by the snippets from body_00_15
295
# Map a NEON quad register name to the double register that is its low half,
# e.g. "q3" -> "d6".  Returns "" when the argument contains no q register.
# Declared without the former empty "()" prototype: the sub takes one
# argument, and that prototype turned a plain Dlo("q3") call into a compile
# error (only the prototype-bypassing &Dlo(...) form worked).
sub Dlo     { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
# Map a NEON quad register name to the double register that is its high half,
# e.g. "q3" -> "d7".  Returns "" when the argument contains no q register.
# Declared without the former empty "()" prototype: the sub takes one
# argument, and that prototype turned a plain Dhi("q3") call into a compile
# error (only the prototype-bypassing &Dhi(...) form worked).
sub Dhi     { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
298
# Catch-all for calls to undefined subs such as &vext_8(...) or &add(...):
# the sub name becomes the instruction mnemonic and the arguments become its
# operands, appended to $code as one tab-indented assembly line.
#   - the package qualifier is stripped and the first '_' becomes '.',
#     so vext_8 is emitted as "vext.8";
#   - a final operand that is a plain number gets a '#' prefix (immediate).
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
    $mnemonic =~ s/_/\./;

    my @operands = @_;
    my $final = pop(@operands);
    if ($final*1 eq $final) {
	$final = "#$final";
    }

    $code .= "\t" . $mnemonic . "\t" . join(',', @operands, $final) . "\n";
}
305
# Emit the NEON message-schedule update for one q register (four schedule
# words), interleaved with four integer rounds.
#   $body   code ref returning the Perl snippets for one round (body_00_15).
#           It is called four times; the resulting snippets are eval'ed a few
#           at a time between the NEON instructions, so the two instruction
#           streams end up mixed in the generated output.
# The updated words are also added to the next four K256 constants and the
# sums stored through $Xfer.  Finally @X is rotated so that @X[0] names the
# register to be updated next.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  # Lexicals assigned by the eval'ed snippets (each round starts by copying
  # @V into them).
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 # Flush all but the last two round snippets before the store.
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
406
# Emit code that prepares four freshly loaded input words held in @X[0]:
# byte-reverse them, add the next four K256 constants and store the sums
# through $Xfer.  These NEON instructions are interleaved with four integer
# rounds.
#   $body   code ref returning the Perl snippets for one round (body_00_15);
#           called four times, with the snippets eval'ed in between.
# @X is rotated afterwards so the next call works on the next register.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  # Lexicals assigned by the eval'ed snippets (each round starts by copying
  # @V into them).
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
433
# Return one integer round as a list of Perl source strings.  Each element is
# meant to be passed to eval by Xupdate/Xpreload and appends one or a few
# instructions to $code (the &add, &eor, ... calls resolve through AUTOLOAD).
# Strings joined with '.' form a single element, i.e. they are always
# eval'ed together.
# The snippets rely on the caller having lexicals $a..$h in scope; the first
# snippet fills them from @V.  The last snippet advances $j, rotates @V and
# swaps $t2/$t3 ready for the next round.
# X[i]+K[i] is expected in $t1 on entry; the ldr snippet fetches the value
# for the following round from the stack frame, except at $j==15 where the
# word at $Ktbl is loaded instead and at $j==31 where [sp,#64] is loaded.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
459
# Prologue of sha256_block_data_order_neon: build a 16-byte aligned stack
# frame, load and byte-swap the first input block, store X[0..15]+K[0..15]
# to the frame, save ctx/inp/end/original sp at [sp,#64..#76] and load the
# working variables.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	# Body of the .L_00_48 loop: four Xupdate calls emit 16 integer rounds
	# together with the schedule update for all four X registers.
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
# Loop control: the last round of each pass loaded the word at $Ktbl into
# $t1; the loop repeats until that word is the zero terminator after K256.
# Then the next input block is loaded (re-reading the last block instead of
# running past the end of the input).
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	# Final 16 rounds, interleaved with preparing X+K for the block that
	# was just loaded.
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
# Accumulate the working variables into the context ($t1 holds the context
# pointer loaded from [sp,#64] by the last round), then either start the
# next block or restore the original sp and return.
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
580}}}
581######################################################################
582# ARMv8 stuff
583#
584{{{
# Register assignment for the ARMv8 (SHA-256 instruction) code path.
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));	# state halves and a scratch copy of $ABCD
my @MSG=map("q$_",(8..11));			# the 16 message words, four per register
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));	# X+K sums and the saved input state
my $Ktbl="r3";					# shadows the outer \$Ktbl (r14) within this section

# Function prologue and per-block setup: load the state, locate K256, load
# and byte-swap one input block and keep a copy of the incoming state.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve groups that both consume four message words (sha256h/sha256h2) and
# extend the schedule (sha256su0/sha256su1).  $W0/$W1 alternate so the next
# constants load while the current sum is in use, and @MSG is rotated so
# @MSG[0] always names the register processed next.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# The last four groups need no further schedule extension.  Afterwards the
# saved state is added back, the loop continues while input remains, and the
# state is stored to the context.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
672}}}
# Trailer of the generated file: an identification string and, outside the
# kernel build, the common symbol OPENSSL_armcap_P that the entry code reads.
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___
680
# Copy this script's leading comment block (license and history notes) to the
# output, turning Perl '#' comment markers into assembler '@' markers.  The
# shebang line is skipped, and copying stops at the first line that is
# neither a comment nor empty.
# A lexical handle and three-argument open are used so that an unusual $0 can
# never be interpreted as an open mode.  Failing to re-read the script only
# costs the header, so that case is reported on STDERR but is not fatal.
if (open(my $self,'<',$0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self;
} else {
	warn "can't open $0 to copy the header comments: $!\n";
}
688
{   # Base encodings of the four SHA-256 instructions, before the register
    # numbers are merged in.  Private to unsha256 via the enclosing block.
    my  %opcode = (
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40,
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40	);

    # Translate one SHA-256 instruction into an INST(...) macro invocation
    # carrying its four encoded bytes, followed by the original text as an
    # assembler comment.
    #   $mnemonic   instruction name, a key of %opcode
    #   $arg        operand text: two or three comma-separated q registers
    # When the operand text does not match, the (false) result of the
    # pattern match is what the sub returns.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    # $qb is undefined for the two-operand form and then contributes
	    # no bits.
	    my ($qa,$qb,$qc) = ($1,$2,$3);

	    # Bits 0-2 and bit 3 of every register number are placed in
	    # separate positions of the instruction word.
	    my $word = $opcode{$mnemonic};
	    $word |= (($qa&7)<<13) | (($qa&8)<<19);
	    $word |= (($qb&7)<<17) | (($qb&8)<<4);
	    $word |= (($qc&7)<<1)  | (($qc&8)<<2);

	    # The bytes are listed least significant first, since ARMv7
	    # instructions are always encoded little-endian.  The correct
	    # solution would be the .inst directive, but older assemblers
	    # don't implement it.
	    my @bytes = map { ($word>>(8*$_))&0xff } (0..3);
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			@bytes,$mnemonic,$arg;
	}
    }
}
710
# Post-process the accumulated template one line at a time and print it:
#   1. every `...` span is replaced by the result of eval'ing its contents
#      (this resolves the arithmetic embedded in the templates);
#   2. SHA-256 instructions with q-register operands are replaced by their
#      INST(...) byte encoding via unsha256;
#   3. "ret" becomes "bx lr"; on lines without "ret", an existing "bx lr" is
#      replaced by its literal encoding instead.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing flushes the buffered output; a failure here (for example a full
# disk) would otherwise leave a truncated file unnoticed, so it is fatal.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
724