chacha-neon-core.S - OpenGrok cross reference for /Linux-v6.1/arch/arm/crypto/chacha-neon-core.S

Lines Matching +full:256 +full:- +full:byte
11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
26   * (c)  vrev32.16			(16-bit rotations only)
30   * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
31   * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
32   * cycles of (b) on both Cortex-A7 and Cortex-A53.
34   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
37   * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
42   * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
46   * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
57  * chacha_permute - permute one block
59  * Permute one 64-byte block where the state matrix is stored in the four NEON
60  * registers q0-q3.  It performs matrix operations on four words in parallel,
65  * Clobbers: r3, ip, q4-q5
148 	vld1.32		{q0-q1}, [r0]
149 	vld1.32		{q2-q3}, [ip]
159 	vld1.8		{q4-q5}, [r2]
160 	vld1.8		{q6-q7}, [ip]
179 	vst1.8		{q0-q1}, [r1]
180 	vst1.8		{q2-q3}, [ip]
187 	// r1: output (8 32-bit words)
191 	vld1.32		{q0-q1}, [r0]!
192 	vld1.32		{q2-q3}, [r0]
205 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
211 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
224 	// requires no word shuffling. The words are re-interleaved before the
228 	// x0..15[0-3] = s0..15[0-3]
230 	vld1.32		{q0-q1}, [r0]
231 	vld1.32		{q2-q3}, [ip]
241 	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
257 	vld1.32		{q8-q9}, [sp, :256]
287 	vst1.32		{q8-q9}, [sp, :256]
327 	vld1.32		{q8-q9}, [sp, :256]
338 	vst1.32		{q8-q9}, [sp, :256]
354 	vld1.32		{q8-q9}, [sp, :256]
384 	vst1.32		{q8-q9}, [sp, :256]
424 	vld1.32		{q8-q9}, [sp, :256]
435 	vst1.32		{q8-q9}, [sp, :256]
454 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455 	// x8..9[0-3] are on the stack.
457 	// Re-interleave the words in the first two rows of each block (x0..7).
458 	// Also add the counter values 0-3 to x12[0-3].
459 	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
464 	  vadd.u32	q12, q8			// x12 += counter values 0-3
467 	  vld1.32	{q8-q9}, [r0]!		// load s0..7
471 	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
477 	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
483 	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
490 	vld1.8		{q8-q9}, [r2]!
493 	vst1.8		{q8-q9}, [r1]!
495 	// Re-interleave the words in the last two rows of each block (x8..15).
496 	vld1.32		{q8-q9}, [sp, :256]
503 	  vld1.32	{q0-q1}, [r0]	// load s8..15
511 	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
517 	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
525 	vld1.8		{q0-q1}, [r2]!
530 	vst1.8		{q0-q1}, [r1]!
532 	vld1.8		{q0-q1}, [r2]!
537 	vst1.8		{q0-q1}, [r1]!
539 	vld1.8		{q0-q1}, [r2]!
544 	vst1.8		{q0-q1}, [r1]!
546 	vld1.8		{q0-q1}, [r2]!
551 	vst1.8		{q0-q1}, [r1]!
553 	vld1.8		{q0-q1}, [r2]!
558 	vst1.8		{q0-q1}, [r1]!
560 	vld1.8		{q0-q1}, [r2]!
566 	vst1.8		{q0-q1}, [r1]!
568 	vld1.8		{q0-q1}, [r2]
571 	vst1.8		{q0-q1}, [r1]
584 	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
585 	// previous 32 byte output block that still needs to be written at
586 	// [r1] in q0-q1.
595 	vld1.8		{q2-q3}, [lr]
596 	vld1.8		{q6-q7}, [r2]
600 	vtbl.8		d4, {q4-q5}, d4
601 	vtbl.8		d5, {q4-q5}, d5
602 	vtbl.8		d6, {q4-q5}, d6
603 	vtbl.8		d7, {q4-q5}, d7
608 	vst1.8		{q6-q7}, [r4]	// overlapping stores
609 	vst1.8		{q0-q1}, [r1]
636 	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
637 	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
638 	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
639 	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
640 	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
641 	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
642 	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
643 	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f