Lines Matching +full:cortex +full:- +full:m

2 # SPDX-License-Identifier: GPL-2.0
12 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
31 # SHA256-hw SHA256(*) SHA512
32 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
33 # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
34 # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
36 # X-Gene 20.0 (+100%) 12.8 (+300%(***))
41 # (**) The result is a trade-off: it's possible to improve it by
43 # on Cortex-A53 (or by 4 cycles per round).
44 # (***) Super-impressive coefficients over gcc-generated code are
46 # generated with -mgeneral-regs-only is significantly faster
47 # and the gap is only 40-90%.
52 # version of SHA256 for 64-bit processors. This is because performance
53 # improvement on most wide-spread Cortex-A5x processors was observed
54 # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
55 # observed that 32-bit NEON SHA256 performs significantly better than
56 # 64-bit scalar version on *some* of the more recent processors. As a
57 # result, a 64-bit NEON version of SHA256 was added to provide best
58 # all-round performance. For example it executes ~30% faster on X-Gene
60 # deliver much less improvement, likely *negative* on Cortex-A5x.
67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
68 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
69 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
70 die "can't locate arm-xlate.pl";
109 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
124 ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
130 str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
132 # While ARMv8 specifies merged rotate-n-logical operation such as
137 # Cortex-A5x handles merged instructions much better than disjoint
142 eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
151 eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
225 stp x29,x30,[sp,#-128]!
287 .size $func,.-$func
356 .size .LK$BITS,.-.LK$BITS
361 .long OPENSSL_armcap_P-.
363 .quad OPENSSL_armcap_P-.
384 stp x29,x30,[sp,#-16]!
391 ld1 {@MSG[0]-@MSG[3]},[$inp],#64
428 sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
447 .size sha256_block_armv8,.-sha256_block_armv8
453 # You'll surely note a lot of similarities with sha256-armv4 module,
454 # and of course it's not a coincidence. sha256-armv4 was used as
456 # extensively re-tuned for all-round performance.
466 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
473 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
474 sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
475 sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
499 &sli_32 ($T2,$T0,32-$sigma0[0]);
508 &sli_32 ($T3,$T0,32-$sigma0[1]);
517 &sli_32 ($T4,$T7,32-$sigma1[0]);
529 &sli_u32 ($T3,$T7,32-$sigma1[1]);
549 &sli_32 ($T6,@X[0],32-$sigma1[0]);
557 &sli_32 ($T5,@X[0],32-$sigma1[1]);
619 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
622 '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
623 '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
627 '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
647 stp x29, x30, [sp, #-16]!
663 rev32 @X[1],@X[1] // big-endian
670 st1.32 {$T0-$T1},[$Xfer], #32
672 st1.32 {$T2-$T3},[$Xfer]
738 .size sha256_block_neon,.-sha256_block_neon
755 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
777 s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
781 m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;