Lines Matching +full:v1 +full:- +full:v6
4 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel,
46 add v0.4s, v0.4s, v1.4s
52 eor v4.16b, v1.16b, v2.16b
53 shl v1.4s, v4.4s, #12
54 sri v1.4s, v4.4s, #20
57 add v0.4s, v0.4s, v1.4s
63 eor v4.16b, v1.16b, v2.16b
64 shl v1.4s, v4.4s, #7
65 sri v1.4s, v4.4s, #25
68 ext v1.16b, v1.16b, v1.16b, #4
75 add v0.4s, v0.4s, v1.4s
81 eor v4.16b, v1.16b, v2.16b
82 shl v1.4s, v4.4s, #12
83 sri v1.4s, v4.4s, #20
86 add v0.4s, v0.4s, v1.4s
92 eor v4.16b, v1.16b, v2.16b
93 shl v1.4s, v4.4s, #7
94 sri v1.4s, v4.4s, #25
97 ext v1.16b, v1.16b, v1.16b, #12
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
131 add v1.4s, v1.4s, v9.4s
132 eor v1.16b, v1.16b, v5.16b
136 eor v2.16b, v2.16b, v6.16b
142 st1 {v0.16b-v3.16b}, [x1]
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
204 // matrix by interleaving 32- and then 64-bit words, which allows us to
211 ld1 {v30.4s-v31.4s}, [x9]
213 // x0..15[0-3] = s0..3[0..3]
215 ld4r { v0.4s- v3.4s}, [x0]
216 ld4r { v4.4s- v7.4s}, [x8], #16
217 ld4r { v8.4s-v11.4s}, [x8], #16
218 ld4r {v12.4s-v15.4s}, [x8]
221 mov a1, v1.s[0]
226 mov a6, v6.s[0]
237 // x12 += counter values 1-4
247 add v1.4s, v1.4s, v5.4s
249 add v2.4s, v2.4s, v6.4s
256 eor v13.16b, v13.16b, v1.16b
289 eor v18.16b, v6.16b, v10.16b
296 shl v6.4s, v18.4s, #12
303 sri v6.4s, v18.4s, #20
314 add v1.4s, v1.4s, v5.4s
316 add v2.4s, v2.4s, v6.4s
323 eor v13.16b, v13.16b, v1.16b
356 eor v18.16b, v6.16b, v10.16b
363 shl v6.4s, v18.4s, #7
370 sri v6.4s, v18.4s, #25
381 add v1.4s, v1.4s, v6.4s
390 eor v12.16b, v12.16b, v1.16b
421 eor v17.16b, v6.16b, v11.16b
429 shl v6.4s, v17.4s, #12
435 sri v6.4s, v17.4s, #20
448 add v1.4s, v1.4s, v6.4s
457 eor v12.16b, v12.16b, v1.16b
488 eor v17.16b, v6.16b, v11.16b
496 shl v6.4s, v17.4s, #7
502 sri v6.4s, v17.4s, #25
512 ld4r {v16.4s-v19.4s}, [x0], #16
513 ld4r {v20.4s-v23.4s}, [x0], #16
515 // x12 += counter values 0-3
518 // x0[0-3] += s0[0]
519 // x1[0-3] += s0[1]
520 // x2[0-3] += s0[2]
521 // x3[0-3] += s0[3]
525 add v1.4s, v1.4s, v17.4s
539 ld4r {v24.4s-v27.4s}, [x0], #16
540 ld4r {v28.4s-v31.4s}, [x0]
542 // x4[0-3] += s1[0]
543 // x5[0-3] += s1[1]
544 // x6[0-3] += s1[2]
545 // x7[0-3] += s1[3]
552 add v6.4s, v6.4s, v22.4s
563 // x8[0-3] += s2[0]
564 // x9[0-3] += s2[1]
565 // x10[0-3] += s2[2]
566 // x11[0-3] += s2[3]
584 // x12[0-3] += s3[0]
585 // x13[0-3] += s3[1]
586 // x14[0-3] += s3[2]
587 // x15[0-3] += s3[3]
605 // interleave 32-bit words in state n, n+1
607 zip1 v16.4s, v0.4s, v1.4s
608 ldp w8, w9, [x2, #-56]
610 zip2 v17.4s, v0.4s, v1.4s
616 ldp w6, w7, [x2, #-48]
618 ldp w8, w9, [x2, #-40]
622 zip1 v22.4s, v6.4s, v7.4s
624 zip2 v23.4s, v6.4s, v7.4s
626 ldp w6, w7, [x2, #-32]
628 ldp w8, w9, [x2, #-24]
636 ldp w6, w7, [x2, #-16]
638 ldp w8, w9, [x2, #-8]
653 // interleave 64-bit words in state n, n+2
659 stp a2, a3, [x1, #-56]
662 ld1 {v16.16b-v19.16b}, [x2], #64
665 zip1 v1.2d, v20.2d, v22.2d
667 stp a4, a5, [x1, #-48]
670 stp a6, a7, [x1, #-40]
673 ld1 {v20.16b-v23.16b}, [x2], #64
677 zip2 v6.2d, v24.2d, v26.2d
678 stp a8, a9, [x1, #-32]
681 stp a10, a11, [x1, #-24]
684 ld1 {v24.16b-v27.16b}, [x2], #64
689 stp a12, a13, [x1, #-16]
692 stp a14, a15, [x1, #-8]
695 ld1 {v28.16b-v31.16b}, [x2]
699 eor v17.16b, v17.16b, v1.16b
707 eor v22.16b, v22.16b, v6.16b
710 st1 {v16.16b-v19.16b}, [x1], #64
718 st1 {v20.16b-v23.16b}, [x1], #64
726 st1 {v24.16b-v27.16b}, [x1], #64
727 st1 {v28.16b-v31.16b}, [x1]
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
750 .Lt128: ld1 {v28.16b-v31.16b}, [x10]
753 tbl v28.16b, {v0.16b-v3.16b}, v28.16b
754 tbl v29.16b, {v0.16b-v3.16b}, v29.16b
755 tbl v30.16b, {v0.16b-v3.16b}, v30.16b
756 tbl v31.16b, {v0.16b-v3.16b}, v31.16b
757 ld1 {v16.16b-v19.16b}, [x1] // reload first output block
762 ld1 {v4.16b-v7.16b}, [x10]
764 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
765 tbl v1.16b, {v8.16b-v11.16b}, v5.16b
766 tbl v2.16b, {v8.16b-v11.16b}, v6.16b
767 tbl v3.16b, {v8.16b-v11.16b}, v7.16b
770 eor v29.16b, v29.16b, v1.16b
773 st1 {v28.16b-v31.16b}, [x6] // overlapping stores
774 2: st1 {v20.16b-v23.16b}, [x1]
779 ld1 {v4.16b-v7.16b}, [x10]
781 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
782 tbl v1.16b, {v12.16b-v15.16b}, v5.16b
783 tbl v2.16b, {v12.16b-v15.16b}, v6.16b
784 tbl v3.16b, {v12.16b-v15.16b}, v7.16b
787 eor v29.16b, v29.16b, v1.16b
790 st1 {v28.16b-v31.16b}, [x7] // overlapping stores
791 3: st1 {v24.16b-v27.16b}, [x1]
800 .byte (.Li - 64)