Lines Matching +full:cortex +full:- +full:a8
2 # SPDX-License-Identifier: GPL-2.0
22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
28 # Cortex A8 core and ~40 cycles per processed byte.
32 # Profiler-assisted and platform-specific optimization resulted in 7%
33 # improvement on Cortex A8 core and ~38 cycles per byte.
37 # Add NEON implementation. On Cortex A8 it was measured to process
38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
44 # Technical writers asserted that the 3-way S4 pipeline can sustain
46 # not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
47 # for further details. On a side note, Cortex-A15 processes one byte in
53 # h[0-7], namely with most significant dword at *lower* address, which
55 # expected to maintain native byte order for whole 64-bit values.
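The comment at lines 53-55 documents a calling-convention change: originally each 64-bit state word h[0-7] had to be stored with its most significant 32-bit dword at the lower address, whereas the current code expects plain native byte order. A minimal C sketch of the two layouts, with hypothetical helper names:

    #include <stdint.h>

    /* Legacy convention: most significant dword of each 64-bit state
     * word at the *lower* address, regardless of host endianness. */
    static void store_msw_first(uint32_t out[2], uint64_t h)
    {
        out[0] = (uint32_t)(h >> 32);   /* MSW at the lower address */
        out[1] = (uint32_t)h;           /* LSW follows */
    }

    /* Current convention: the whole 64-bit value in native byte
     * order, i.e. an ordinary store. */
    static void store_native(uint64_t *out, uint64_t h)
    {
        *out = h;
    }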
60 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
190 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
191 # define VFP_ABI_POP vldmia sp!,{d8-d15}
264 .size K512,.-K512
267 .word OPENSSL_armcap_P-sha512_block_data_order
268 .skip 32-4
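Lines 267-268 embed the link-time offset OPENSSL_armcap_P-sha512_block_data_order in the text section, so the entry point can locate the capability word PC-relatively and branch to the NEON body when NEON is advertised. A C-level sketch of that dispatch decision; the flag's bit position and the two body names are assumptions here:

    #include <stdint.h>
    #include <stddef.h>

    extern unsigned int OPENSSL_armcap_P;   /* runtime CPU capability bits */
    #define ARMV7_NEON (1 << 0)             /* assumed bit position */

    /* Hypothetical prototypes standing in for the two assembly bodies. */
    void sha512_block_integer(uint64_t *ctx, const void *inp, size_t num);
    void sha512_block_neon(uint64_t *ctx, const void *inp, size_t num);

    /* What the capability test and branch at the entry point amount to. */
    void sha512_block_data_order(uint64_t *ctx, const void *inp, size_t num)
    {
        if (OPENSSL_armcap_P & ARMV7_NEON)
            sha512_block_neon(ctx, inp, num);
        else
            sha512_block_integer(ctx, inp, num);
    }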
289 stmdb sp!,{r4-r12,lr}
352 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
353 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
360 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
362 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
386 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
389 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
405 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
406 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
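The loads at lines 352-406 walk a 16-entry ring of 64-bit schedule words kept on the stack, one 32-bit ldr per half (the +0/+4 pairs). The 16-k notation makes the distances back into the window explicit: 8*(16-1) reaches W[t-15], 8*(16-14) reaches W[t-2], and 8*(16-9) reaches W[t-7], while the slot about to be overwritten still holds W[t-16]. A self-contained C sketch of that recurrence:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* SHA-512 message schedule over the same 16-entry circular window
     * the assembly keeps on the stack.  Call with t >= 16. */
    static uint64_t next_W(uint64_t X[16], unsigned t)
    {
        uint64_t w15 = X[(t + 1) % 16];   /* W[t-15] */
        uint64_t w2  = X[(t + 14) % 16];  /* W[t-2]  */
        uint64_t s0  = ror64(w15, 1) ^ ror64(w15, 8) ^ (w15 >> 7);
        uint64_t s1  = ror64(w2, 19) ^ ror64(w2, 61) ^ (w2 >> 6);

        /* X[t % 16] still holds W[t-16]; X[(t+9) % 16] is W[t-7]. */
        X[t % 16] += s1 + X[(t + 9) % 16] + s0;
        return X[t % 16];
    }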
482 ldmia sp!,{r4-r12,pc}
484 ldmia sp!,{r4-r12,lr}
487 bx lr @ interoperable with Thumb ISA:-)
489 .size sha512_block_data_order,.-sha512_block_data_order
499 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
522 vsli.64 $t0,$e,#`64-@Sigma1[0]`
523 vsli.64 $t1,$e,#`64-@Sigma1[1]`
525 vsli.64 $t2,$e,#`64-@Sigma1[2]`
535 vsli.64 $t0,$a,#`64-@Sigma0[0]`
539 vsli.64 $t1,$a,#`64-@Sigma0[1]`
541 vsli.64 $t2,$a,#`64-@Sigma0[2]`
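NEON has no 64-bit rotate instruction, so each rotation in the big Sigma functions is synthesized from a pair: vshr.u64 by n supplies the low bits and vsli.64 by 64-n inserts the wrapped-around high bits into the same register. The amounts behind the backtick expressions are the standard SHA-512 rotations, @Sigma1=(14,18,41) and @Sigma0=(28,34,39). The scalar equivalent in C:

    #include <stdint.h>

    /* What one vshr.u64/vsli.64 pair composes: a 64-bit rotate right. */
    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    static inline uint64_t Sigma1(uint64_t e)
    {
        return rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
    }

    static inline uint64_t Sigma0(uint64_t a)
    {
        return rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
    }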
557 # 2x-vectorized, therefore runs every 2nd round
558 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
568 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
570 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
577 vsli.64 $t0,$s0,#`64-@sigma0[0]`
578 vsli.64 $t1,$s0,#`64-@sigma0[1]`
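The schedule's small sigmas get the same vshr/vsli treatment, but here the code is 2x-vectorized: @X is viewed as eight 128-bit q registers, each carrying two 64-bit schedule words, so the update runs every second round and produces two W values at once. Only two vsli pairs appear per sigma because the third term of each small sigma is a plain shift, not a rotation. In scalar C, under the standard SHA-512 constants:

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* Two rotations (the vshr/vsli pairs above) plus one plain right
     * shift, which needs no vsli at all. */
    static inline uint64_t sigma0(uint64_t x)  /* @sigma0=(1,8,7) */
    {
        return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
    }

    static inline uint64_t sigma1(uint64_t x)  /* @sigma1=(19,61,6) */
    {
        return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
    }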
593 .arch armv7-a
601 dmb @ errata #451034 on early Cortex A8
605 sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
606 vldmia $ctx,{$A-$H} @ load context
620 vldmia $ctx,{d24-d31} @ load context to temp
625 vstmia $ctx,{$A-$H} @ save context
632 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
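The vldmia/vstmia lines show the per-block feed-forward: the working variables start from the context (line 606), and after the 80 rounds the previous state is reloaded into d24-d31 (line 620), added in, and the sums are written back (line 625). A C sketch of that pattern, with the round function left as an assumed extern:

    #include <stdint.h>
    #include <string.h>

    /* Assumed prototype; the real 80 rounds live in the assembly. */
    void sha512_do_rounds(uint64_t v[8], const uint64_t W[80]);

    static void sha512_block(uint64_t ctx[8], const uint64_t W[80])
    {
        uint64_t v[8];

        memcpy(v, ctx, sizeof(v));   /* vldmia ctx,{A-H}: load context */
        sha512_do_rounds(v, W);      /* 80 compression rounds */
        for (int i = 0; i < 8; i++)
            ctx[i] += v[i];          /* reload old state, vadd, vstmia */
    }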
645 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 (0xe12fff1e is the encoding of "bx lr")