Lines Matching +full:0 +full:- +full:32

1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * x86_64/AVX2/AES-NI assembler implementation of Camellia
14 #define key_table 0
51 32-way camellia
56 * x0..x7: byte-sliced AB state
60 * x0..x7: new byte-sliced CD state
65 * S-function with AES subbytes \
147 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
159 /* P-function */ \
193 vpxor 5 * 32(mem_cd), x1, x1; \
200 vpxor 4 * 32(mem_cd), x0, x0; \
203 vpxor 6 * 32(mem_cd), x2, x2; \
206 vpxor 7 * 32(mem_cd), x3, x3; \
209 vpxor 0 * 32(mem_cd), x4, x4; \
212 vpxor 1 * 32(mem_cd), x5, x5; \
215 vpxor 2 * 32(mem_cd), x6, x6; \
218 vpxor 3 * 32(mem_cd), x7, x7;
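
The roundsm32 lines above compute Camellia's F-function on 32 byte-sliced blocks at once: the vpbroadcastq loads the 64-bit round subkey, the AES-SubBytes step plus the pre/post lookup filters form the S-layer, the byte-slice XORs form the P-layer, and the trailing vpxor ...(mem_cd) instructions are the Feistel XOR of the result into the other half. For reference, a scalar C sketch of the same per-block F-function as specified in RFC 3713 (function names are illustrative; the 256-entry SBOX1 table is omitted and passed in by the caller):

#include <stdint.h>

static inline uint8_t rol8(uint8_t x, int n)
{
	return (uint8_t)((x << n) | (x >> (8 - n)));
}

/*
 * Scalar Camellia F-function: F(in, k) = P(S(in ^ k)).
 * SBOX2/3/4 are derived from SBOX1 by byte rotations (RFC 3713), which is
 * what lets the vector code reuse one AES-based s-box core with different
 * pre/post affine filters.
 */
uint64_t camellia_f(uint64_t in, uint64_t k, const uint8_t sbox1[256])
{
	uint64_t x = in ^ k;
	uint8_t t[8], y[8];
	int i;

	for (i = 0; i < 8; i++)
		t[i] = (uint8_t)(x >> (56 - 8 * i));

	/* S-layer: s-boxes applied in the order 1, 2, 3, 4, 2, 3, 4, 1 */
	t[0] = sbox1[t[0]];
	t[1] = rol8(sbox1[t[1]], 1);	/* SBOX2[x] = SBOX1[x] <<< 1 */
	t[2] = rol8(sbox1[t[2]], 7);	/* SBOX3[x] = SBOX1[x] <<< 7 */
	t[3] = sbox1[rol8(t[3], 1)];	/* SBOX4[x] = SBOX1[x <<< 1] */
	t[4] = rol8(sbox1[t[4]], 1);
	t[5] = rol8(sbox1[t[5]], 7);
	t[6] = sbox1[rol8(t[6], 1)];
	t[7] = sbox1[t[7]];

	/* P-layer: byte-wise XOR diffusion */
	y[0] = t[0] ^ t[2] ^ t[3] ^ t[5] ^ t[6] ^ t[7];
	y[1] = t[0] ^ t[1] ^ t[3] ^ t[4] ^ t[6] ^ t[7];
	y[2] = t[0] ^ t[1] ^ t[2] ^ t[4] ^ t[5] ^ t[7];
	y[3] = t[1] ^ t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[6];
	y[4] = t[0] ^ t[1] ^ t[5] ^ t[6] ^ t[7];
	y[5] = t[1] ^ t[2] ^ t[4] ^ t[6] ^ t[7];
	y[6] = t[2] ^ t[3] ^ t[4] ^ t[5] ^ t[7];
	y[7] = t[0] ^ t[3] ^ t[4] ^ t[5] ^ t[6];

	x = 0;
	for (i = 0; i < 8; i++)
		x = (x << 8) | y[i];
	return x;
}
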
242 * x0..x7: byte-sliced AB state preloaded
243 * mem_ab: byte-sliced AB state in memory
244 * mem_cd: byte-sliced CD state in memory
251 vmovdqu x0, 4 * 32(mem_cd); \
252 vmovdqu x1, 5 * 32(mem_cd); \
253 vmovdqu x2, 6 * 32(mem_cd); \
254 vmovdqu x3, 7 * 32(mem_cd); \
255 vmovdqu x4, 0 * 32(mem_cd); \
256 vmovdqu x5, 1 * 32(mem_cd); \
257 vmovdqu x6, 2 * 32(mem_cd); \
258 vmovdqu x7, 3 * 32(mem_cd); \
269 vmovdqu x4, 4 * 32(mem_ab); \
270 vmovdqu x5, 5 * 32(mem_ab); \
271 vmovdqu x6, 6 * 32(mem_ab); \
272 vmovdqu x7, 7 * 32(mem_ab); \
273 vmovdqu x0, 0 * 32(mem_ab); \
274 vmovdqu x1, 1 * 32(mem_ab); \
275 vmovdqu x2, 2 * 32(mem_ab); \
276 vmovdqu x3, 3 * 32(mem_ab);
290 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
292 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
294 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
298 * v0..3: byte-sliced 32-bit integers
327 * r: byte-sliced AB state in memory
328 * l: byte-sliced CD state in memory
330 * x0..x7: new byte-sliced CD state
339 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
357 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
358 vmovdqu l4, 4 * 32(l); \
360 vmovdqu l5, 5 * 32(l); \
362 vmovdqu l6, 6 * 32(l); \
364 vmovdqu l7, 7 * 32(l); \
380 vpor 4 * 32(r), t0, t0; \
381 vpor 5 * 32(r), t1, t1; \
382 vpor 6 * 32(r), t2, t2; \
383 vpor 7 * 32(r), t3, t3; \
385 vpxor 0 * 32(r), t0, t0; \
386 vpxor 1 * 32(r), t1, t1; \
387 vpxor 2 * 32(r), t2, t2; \
388 vpxor 3 * 32(r), t3, t3; \
389 vmovdqu t0, 0 * 32(r); \
390 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
391 vmovdqu t1, 1 * 32(r); \
392 vmovdqu t2, 2 * 32(r); \
393 vmovdqu t3, 3 * 32(r); \
408 vpand 0 * 32(r), t0, t0; \
409 vpand 1 * 32(r), t1, t1; \
410 vpand 2 * 32(r), t2, t2; \
411 vpand 3 * 32(r), t3, t3; \
415 vpxor 4 * 32(r), t0, t0; \
416 vpxor 5 * 32(r), t1, t1; \
417 vpxor 6 * 32(r), t2, t2; \
418 vpxor 7 * 32(r), t3, t3; \
419 vmovdqu t0, 4 * 32(r); \
420 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
421 vmovdqu t1, 5 * 32(r); \
422 vmovdqu t2, 6 * 32(r); \
423 vmovdqu t3, 7 * 32(r); \
445 vmovdqu l0, 0 * 32(l); \
447 vmovdqu l1, 1 * 32(l); \
449 vmovdqu l2, 2 * 32(l); \
451 vmovdqu l3, 3 * 32(l);
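
The fls32 lines above apply Camellia's FL/FL⁻¹ layer to 32 blocks: kll/klr and krl/krr are the broadcast 32-bit halves of the FL subkey (applied to the half held at l) and of the FL⁻¹ subkey (applied to the half held at r), and the vpand/vpor sequences combined with a rotate-by-one carry out the AND-rotate-XOR and OR-XOR steps on the byte-sliced words. A scalar sketch of the two per-block functions as defined in RFC 3713 (names illustrative):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* FL, applied to one 64-bit half; kl = kll||klr */
uint64_t camellia_fl(uint64_t x, uint64_t kl)
{
	uint32_t xl = (uint32_t)(x >> 32), xr = (uint32_t)x;
	uint32_t kll = (uint32_t)(kl >> 32), klr = (uint32_t)kl;

	xr ^= rol32(xl & kll, 1);
	xl ^= (xr | klr);
	return ((uint64_t)xl << 32) | xr;
}

/* FL^-1, applied to the other half; kr = krl||krr */
uint64_t camellia_fl_inv(uint64_t y, uint64_t kr)
{
	uint32_t yl = (uint32_t)(y >> 32), yr = (uint32_t)y;
	uint32_t krl = (uint32_t)(kr >> 32), krr = (uint32_t)kr;

	yl ^= (yr | krr);
	yr ^= rol32(yl & krl, 1);
	return ((uint64_t)yl << 32) | yr;
}
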
515 /* load blocks to registers and apply pre-whitening */
521 vpxor 0 * 32(rio), x0, y7; \
522 vpxor 1 * 32(rio), x0, y6; \
523 vpxor 2 * 32(rio), x0, y5; \
524 vpxor 3 * 32(rio), x0, y4; \
525 vpxor 4 * 32(rio), x0, y3; \
526 vpxor 5 * 32(rio), x0, y2; \
527 vpxor 6 * 32(rio), x0, y1; \
528 vpxor 7 * 32(rio), x0, y0; \
529 vpxor 8 * 32(rio), x0, x7; \
530 vpxor 9 * 32(rio), x0, x6; \
531 vpxor 10 * 32(rio), x0, x5; \
532 vpxor 11 * 32(rio), x0, x4; \
533 vpxor 12 * 32(rio), x0, x3; \
534 vpxor 13 * 32(rio), x0, x2; \
535 vpxor 14 * 32(rio), x0, x1; \
536 vpxor 15 * 32(rio), x0, x0;
538 /* byteslice pre-whitened blocks and store to temporary memory */
544 vmovdqu x0, 0 * 32(mem_ab); \
545 vmovdqu x1, 1 * 32(mem_ab); \
546 vmovdqu x2, 2 * 32(mem_ab); \
547 vmovdqu x3, 3 * 32(mem_ab); \
548 vmovdqu x4, 4 * 32(mem_ab); \
549 vmovdqu x5, 5 * 32(mem_ab); \
550 vmovdqu x6, 6 * 32(mem_ab); \
551 vmovdqu x7, 7 * 32(mem_ab); \
552 vmovdqu y0, 0 * 32(mem_cd); \
553 vmovdqu y1, 1 * 32(mem_cd); \
554 vmovdqu y2, 2 * 32(mem_cd); \
555 vmovdqu y3, 3 * 32(mem_cd); \
556 vmovdqu y4, 4 * 32(mem_cd); \
557 vmovdqu y5, 5 * 32(mem_cd); \
558 vmovdqu y6, 6 * 32(mem_cd); \
559 vmovdqu y7, 7 * 32(mem_cd);
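
Byteslicing here means transposing the 32 loaded blocks so that each 32-byte row holds the same byte position of all 32 blocks; the eight rows for one 64-bit half (AB) go to mem_ab and the other eight (CD) to mem_cd, which is the layout the stores above produce. A plain C model of that layout, ignoring the exact in-register byte order produced by the vpshufb/vpunpck transpose network (helper names are illustrative):

#include <stdint.h>

#define NBLOCKS 32	/* 32 Camellia blocks of 16 bytes each */

/* slice j collects byte j of every block; one group of eight slices is the
 * AB half (mem_ab), the other the CD half (mem_cd) */
static void byteslice(uint8_t slices[16][NBLOCKS],
		      const uint8_t blocks[NBLOCKS][16])
{
	for (int j = 0; j < 16; j++)
		for (int b = 0; b < NBLOCKS; b++)
			slices[j][b] = blocks[b][j];
}

/* inverse transform, conceptually what happens before write_output */
static void unbyteslice(uint8_t blocks[NBLOCKS][16],
			const uint8_t slices[16][NBLOCKS])
{
	for (int b = 0; b < NBLOCKS; b++)
		for (int j = 0; j < 16; j++)
			blocks[b][j] = slices[j][b];
}
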
561 /* de-byteslice, apply post-whitening and store blocks */
591 vmovdqu x0, 0 * 32(rio); \
592 vmovdqu x1, 1 * 32(rio); \
593 vmovdqu x2, 2 * 32(rio); \
594 vmovdqu x3, 3 * 32(rio); \
595 vmovdqu x4, 4 * 32(rio); \
596 vmovdqu x5, 5 * 32(rio); \
597 vmovdqu x6, 6 * 32(rio); \
598 vmovdqu x7, 7 * 32(rio); \
599 vmovdqu y0, 8 * 32(rio); \
600 vmovdqu y1, 9 * 32(rio); \
601 vmovdqu y2, 10 * 32(rio); \
602 vmovdqu y3, 11 * 32(rio); \
603 vmovdqu y4, 12 * 32(rio); \
604 vmovdqu y5, 13 * 32(rio); \
605 vmovdqu y6, 14 * 32(rio); \
606 vmovdqu y7, 15 * 32(rio);
609 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
610 .align 32
612 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
614 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
615 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
617 .section .rodata.cst32.pack_bswap, "aM", @progbits, 32
618 .align 32
620 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
621 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
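
Both constants above are (v)pshufb selector masks. pshufb picks each destination byte from the source byte indexed by the selector's low four bits, within each 128-bit lane, and writes zero whenever the selector byte has its top bit set. So shufb_16x16b gathers byte k of the four 32-bit words of a lane together (one step of the byteslicing transpose), while pack_bswap byte-reverses each 32-bit word of the low quadword and clears the high quadword. A scalar model of one lane, with the two masks written out as byte arrays:

#include <stdint.h>

/* scalar model of (v)pshufb within one 16-byte lane */
static void pshufb_lane(uint8_t dst[16], const uint8_t src[16],
			const uint8_t sel[16])
{
	for (int i = 0; i < 16; i++)
		dst[i] = (sel[i] & 0x80) ? 0 : src[sel[i] & 0x0f];
}

/* byte view of shufb_16x16b: dst = { b0,b4,b8,b12, b1,b5,b9,b13, ... },
 * i.e. byte k of each 32-bit word grouped together */
static const uint8_t shufb_16x16b[16] = {
	0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15
};

/* byte view of pack_bswap: byte-swap each 32-bit word of the low quadword;
 * the 0x80 selectors zero the upper eight bytes */
static const uint8_t pack_bswap[16] = {
	3, 2, 1, 0,  7, 6, 5, 4,
	0x80, 0x80, 0x80, 0x80,  0x80, 0x80, 0x80, 0x80
};
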
623 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
628 * pre-SubByte transform
630 * pre-lookup for sbox1, sbox2, sbox3:
639 * (note: '⊕ 0xc5' inside camellia_f())
642 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
643 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
645 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
646 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
649 * pre-SubByte transform
651 * pre-lookup for sbox4:
660 * (note: '⊕ 0xc5' inside camellia_f())
663 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
664 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
666 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
667 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
670 * post-SubByte transform
672 * post-lookup for sbox1, sbox4:
683 * (note: '⊕ 0x6e' inside camellia_h())
686 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
687 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
689 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
690 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
693 * post-SubByte transform
695 * post-lookup for sbox2:
706 * (note: '⊕ 0x6e' inside camellia_h())
709 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
710 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
712 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
713 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
716 * post-SubByte transform
718 * post-lookup for sbox3:
729 * (note: '⊕ 0x6e' inside camellia_h())
732 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
733 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
735 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
736 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
740 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
741 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
745 /* 4-bit mask */
747 .long 0x0f0f0f0f
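
The pre/post tables above, together with this 4-bit mask, implement affine 8-bit transforms using two in-register lookups: each byte is split into nibbles, the low nibble indexes the *_lo table and the high nibble the *_hi table, and the two results are XORed. Wrapped around AES SubBytes (the "S-function with AES subbytes" step above), these transforms realize Camellia's four s-boxes; the '⊕ 0xc5' and '⊕ 0x6e' constants noted in the table comments are part of the s-box definitions the transforms account for. A scalar model of the nibble-lookup filter (names illustrative):

#include <stdint.h>

/*
 * Scalar model of the vpshufb-based byte filter used with the 4-bit mask:
 * the transform is affine over GF(2), so it is fully described by two
 * 16-entry tables.  The AVX2 code applies it to all 32 bytes of a register
 * with two vpshufb lookups and one vpxor.
 */
static uint8_t filter_8bit(uint8_t x, const uint8_t lo[16],
			   const uint8_t hi[16])
{
	return lo[x & 0x0f] ^ hi[x >> 4];
}
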
756 * %ymm0..%ymm15: 32 plaintext blocks
758 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
759 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
763 leaq 8 * 32(%rax), %rcx;
771 %ymm15, %rax, %rcx, 0);
776 ((key_table + (8) * 8) + 0)(CTX),
788 ((key_table + (16) * 8) + 0)(CTX),
803 vmovdqu 0 * 32(%rcx), %ymm8;
804 vmovdqu 1 * 32(%rcx), %ymm9;
805 vmovdqu 2 * 32(%rcx), %ymm10;
806 vmovdqu 3 * 32(%rcx), %ymm11;
807 vmovdqu 4 * 32(%rcx), %ymm12;
808 vmovdqu 5 * 32(%rcx), %ymm13;
809 vmovdqu 6 * 32(%rcx), %ymm14;
810 vmovdqu 7 * 32(%rcx), %ymm15;
814 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
821 movl $32, %r8d;
826 ((key_table + (24) * 8) + 0)(CTX),
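
The key_table offsets visible above step in units of 8 bytes per subkey and in groups of six rounds: the round macros chain three two-round steps, an FL/FL⁻¹ subkey pair is read at key_table + 8*8 and + 16*8 (and + 24*8 on the longer-key path), and the final whitening key is indexed by %r8 (24 for 16-byte keys, 32 for larger), as in the (key_table)(CTX, %r8, 8) operand. For orientation, a scalar sketch of that round grouping for one block, assuming the camellia_f / camellia_fl / camellia_fl_inv helpers sketched earlier and RFC 3713's subkey names (kw = whitening, k = round, ke = FL/FL⁻¹ keys; the function and array names are illustrative, not the kernel's key_table layout):

#include <stdint.h>

uint64_t camellia_f(uint64_t in, uint64_t k, const uint8_t sbox1[256]);
uint64_t camellia_fl(uint64_t x, uint64_t kl);
uint64_t camellia_fl_inv(uint64_t y, uint64_t kr);

/* 18 rounds for 128-bit keys, 24 rounds for 192/256-bit keys, with an
 * FL/FL^-1 layer after every six rounds except the last group. */
static void camellia_encrypt_block(uint64_t *d1, uint64_t *d2,
				   const uint64_t kw[4],
				   const uint64_t k[24],
				   const uint64_t ke[6],
				   int rounds, const uint8_t sbox1[256])
{
	int i;

	*d1 ^= kw[0];				/* pre-whitening */
	*d2 ^= kw[1];

	for (i = 0; i < rounds; i += 2) {
		*d2 ^= camellia_f(*d1, k[i], sbox1);
		*d1 ^= camellia_f(*d2, k[i + 1], sbox1);

		if ((i + 2) % 6 == 0 && (i + 2) < rounds) {
			int fl = (i + 2) / 6 - 1;	/* 0, 1, 2 */
			*d1 = camellia_fl(*d1, ke[2 * fl]);
			*d2 = camellia_fl_inv(*d2, ke[2 * fl + 1]);
		}
	}

	*d2 ^= kw[2];				/* post-whitening; the output is */
	*d1 ^= kw[3];				/* the swapped pair (d2, d1)      */
}
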
843 * %r8d: 24 for 16 byte key, 32 for larger
847 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
851 leaq 8 * 32(%rax), %rcx;
857 cmpl $32, %r8d;
870 ((key_table + (16) * 8) + 0)(CTX),
882 ((key_table + (8) * 8) + 0)(CTX),
887 %ymm15, %rax, %rcx, 0);
890 vmovdqu 0 * 32(%rcx), %ymm8;
891 vmovdqu 1 * 32(%rcx), %ymm9;
892 vmovdqu 2 * 32(%rcx), %ymm10;
893 vmovdqu 3 * 32(%rcx), %ymm11;
894 vmovdqu 4 * 32(%rcx), %ymm12;
895 vmovdqu 5 * 32(%rcx), %ymm13;
896 vmovdqu 6 * 32(%rcx), %ymm14;
897 vmovdqu 7 * 32(%rcx), %ymm15;
901 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
917 ((key_table + (24) * 8) + 0)(CTX),
926 * %rsi: dst (32 blocks)
927 * %rdx: src (32 blocks)
955 * %rsi: dst (32 blocks)
956 * %rdx: src (32 blocks)
963 movl $32, %r8d;
989 * %rsi: dst (32 blocks)
990 * %rdx: src (32 blocks)
993 subq $(16 * 32), %rsp;
998 movl $32, %r8d;
1015 * dst still in-use (because dst == src), so use stack for temporary
1027 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1028 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1029 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1030 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1031 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1032 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1033 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1034 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1035 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1036 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1037 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1038 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1039 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1040 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1041 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1048 addq $(16 * 32), %rsp;
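
The vpxor block above finishes CBC decryption: after the 32-way decryption, each output block is XORed with the ciphertext block that precedes it in the source buffer, which is why every operand sits 16 bytes (one block) behind the corresponding output; the first block's XOR against the IV is not performed here and is left to the caller, and the stack area reserved at the top serves as scratch when dst == src so ciphertext that is still needed is not clobbered. A scalar sketch of the same CBC-decrypt flow (decrypt_block and the other names are illustrative stand-ins, not the kernel's API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*decrypt_block_fn)(uint8_t out[16], const uint8_t in[16],
				 const void *key);

/* P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV; copying the ciphertext block
 * before writing the plaintext keeps in-place (dst == src) operation safe,
 * the same problem the assembly solves with its stack buffer. */
static void cbc_decrypt(uint8_t *dst, const uint8_t *src, size_t nblocks,
			const uint8_t iv[16], const void *key,
			decrypt_block_fn decrypt_block)
{
	uint8_t prev[16], cur[16];
	size_t i;
	int j;

	memcpy(prev, iv, 16);
	for (i = 0; i < nblocks; i++) {
		memcpy(cur, src + 16 * i, 16);
		decrypt_block(dst + 16 * i, cur, key);
		for (j = 0; j < 16; j++)
			dst[16 * i + j] ^= prev[j];
		memcpy(prev, cur, 16);
	}
}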