Lines Matching +full:256 +full:- +full:byte
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
26 * (c) vrev32.16 (16-bit rotations only)
30 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
31 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
32 * cycles of (b) on both Cortex-A7 and Cortex-A53.
34 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
37 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
42 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
46 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
57 * chacha_permute - permute one block
59 * Permute one 64-byte block where the state matrix is stored in the four NEON
60 * registers q0-q3. It performs matrix operations on four words in parallel,
65 * Clobbers: r3, ip, q4-q5
148 vld1.32 {q0-q1}, [r0]
149 vld1.32 {q2-q3}, [ip]
159 vld1.8 {q4-q5}, [r2]
160 vld1.8 {q6-q7}, [ip]
179 vst1.8 {q0-q1}, [r1]
180 vst1.8 {q2-q3}, [ip]
187 // r1: output (8 32-bit words)
191 vld1.32 {q0-q1}, [r0]!
192 vld1.32 {q2-q3}, [r0]
205 .Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 // 8 byte indices covering two 32-bit words: as a vtbl.8 index vector, result byte i = source byte table[i] (byte 3 -> 0, 0 -> 1, 1 -> 2, 2 -> 3 per word); presumably the shuffle for the 8-bit rotation, going by the label name -- confirm at the use site
211 sub ip, sp, #0x20 // allocate a 32 byte buffer
224 // requires no word shuffling. The words are re-interleaved before the
228 // x0..15[0-3] = s0..15[0-3]
230 vld1.32 {q0-q1}, [r0]
231 vld1.32 {q2-q3}, [ip]
241 vadd.u32 q12, q12, q4 // x12 += counter values 0-3
257 vld1.32 {q8-q9}, [sp, :256]
287 vst1.32 {q8-q9}, [sp, :256]
327 vld1.32 {q8-q9}, [sp, :256]
338 vst1.32 {q8-q9}, [sp, :256]
354 vld1.32 {q8-q9}, [sp, :256]
384 vst1.32 {q8-q9}, [sp, :256]
424 vld1.32 {q8-q9}, [sp, :256]
435 vst1.32 {q8-q9}, [sp, :256]
454 // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455 // x8..9[0-3] are on the stack.
457 // Re-interleave the words in the first two rows of each block (x0..7).
458 // Also add the counter values 0-3 to x12[0-3].
459 vld1.32 {q8}, [lr, :128] // load counter values 0-3
464 vadd.u32 q12, q8 // x12 += counter values 0-3
467 vld1.32 {q8-q9}, [r0]! // load s0..7
471 // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
477 // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
483 // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
490 vld1.8 {q8-q9}, [r2]!
493 vst1.8 {q8-q9}, [r1]!
495 // Re-interleave the words in the last two rows of each block (x8..15).
496 vld1.32 {q8-q9}, [sp, :256]
503 vld1.32 {q0-q1}, [r0] // load s8..15
511 // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
517 // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
525 vld1.8 {q0-q1}, [r2]!
530 vst1.8 {q0-q1}, [r1]!
532 vld1.8 {q0-q1}, [r2]!
537 vst1.8 {q0-q1}, [r1]!
539 vld1.8 {q0-q1}, [r2]!
544 vst1.8 {q0-q1}, [r1]!
546 vld1.8 {q0-q1}, [r2]!
551 vst1.8 {q0-q1}, [r1]!
553 vld1.8 {q0-q1}, [r2]!
558 vst1.8 {q0-q1}, [r1]!
560 vld1.8 {q0-q1}, [r2]!
566 vst1.8 {q0-q1}, [r1]!
568 vld1.8 {q0-q1}, [r2]
571 vst1.8 {q0-q1}, [r1]
584 // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
585 // previous 32-byte output block that still needs to be written at
586 // [r1] in q0-q1.
595 vld1.8 {q2-q3}, [lr]
596 vld1.8 {q6-q7}, [r2]
600 vtbl.8 d4, {q4-q5}, d4
601 vtbl.8 d5, {q4-q5}, d5
602 vtbl.8 d6, {q4-q5}, d6
603 vtbl.8 d7, {q4-q5}, d7
608 vst1.8 {q6-q7}, [r4] // overlapping stores
609 vst1.8 {q0-q1}, [r1]
636 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
637 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
638 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
639 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
640 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
641 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
642 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
643 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f