sha512-armv8.pl - OpenGrok cross reference for /Linux-v5.15/arch/arm64/crypto/sha512-armv8.pl

Lines Matching +full:cortex +full:- +full:m
2 # SPDX-License-Identifier: GPL-2.0
12 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
31 #		SHA256-hw	SHA256(*)	SHA512
32 # Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
33 # Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
34 # Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
36 # X-Gene			20.0 (+100%)	12.8 (+300%(***))
41 # (**)	The result is a trade-off: it's possible to improve it by
43 #	on Cortex-A53 (or by 4 cycles per round).
44 # (***)	Super-impressive coefficients over gcc-generated code are
46 #	generated with -mgeneral-regs-only is significantly faster
47 #	and the gap is only 40-90%.
52 # version of SHA256 for 64-bit processors. This is because performance
53 # improvement on most wide-spread Cortex-A5x processors was observed
54 # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
55 # observed that 32-bit NEON SHA256 performs significantly better than
56 # 64-bit scalar version on *some* of the more recent processors. As a
57 # result 64-bit NEON version of SHA256 was added to provide best
58 # all-round performance. For example it executes ~30% faster on X-Gene
60 # deliver much less improvement, likely *negative* on Cortex-A5x.
67     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
69     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
70     die "can't locate arm-xlate.pl";
109 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
124 	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
130 	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
132 # While ARMv8 specifies merged rotate-n-logical operation such as
137 # Cortex-A5x handles merged instructions much better than disjoint
142 	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
151 	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
225 	stp	x29,x30,[sp,#-128]!
287 .size	$func,.-$func
356 .size	.LK$BITS,.-.LK$BITS
361 	.long	OPENSSL_armcap_P-.
363 	.quad	OPENSSL_armcap_P-.
384 	stp		x29,x30,[sp,#-16]!
391 	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
428 	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
447 .size	sha256_block_armv8,.-sha256_block_armv8
453 # You'll surely note a lot of similarities with sha256-armv4 module,
454 # and of course it's not a coincidence. sha256-armv4 was used as
456 # extensively re-tuned for all-round performance.
466 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
473 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
474 sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
475 sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
499 	&sli_32		($T2,$T0,32-$sigma0[0]);
508 	&sli_32		($T3,$T0,32-$sigma0[1]);
517 	  &sli_32	($T4,$T7,32-$sigma1[0]);
529 	  &sli_u32	($T3,$T7,32-$sigma1[1]);
549 	  &sli_32	($T6,@X[0],32-$sigma1[0]);
557 	  &sli_32	($T5,@X[0],32-$sigma1[1]);
619 	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
622 	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
623 	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
627 	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
647 	stp	x29, x30, [sp, #-16]!
663 	rev32	@X[1],@X[1]		// big-endian
670 	st1.32	{$T0-$T1},[$Xfer], #32
672 	st1.32	{$T2-$T3},[$Xfer]
738 .size	sha256_block_neon,.-sha256_block_neon
755 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
777 	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
781 	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;