Lines Matching +full:4 +full:- +full:16
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
6 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
14 .file "cast5-avx-x86_64-asm_64.S"
23 #define kr (16*4) /* byte offset 64 = 16 entries * 4 bytes; presumably the key-rotation field that follows the 4-byte km entries in the context -- TODO confirm against the context struct */
24 #define rr ((16*4)+16) /* byte offset 80, i.e. 16 bytes past kr; field meaning is not visible in this excerpt -- TODO confirm */
26 /* s-boxes */
33 16-way AVX cast5
88 shrq $16, src; \
89 movl s1(, RID1, 4), dst ## d; \
90 op1 s2(, RID2, 4), dst ## d; \
94 op2 s3(, RID1, 4), dst ## d; \
95 op3 s4(, RID2, 4), dst ## d;
100 shrq $16, reg;
146 vbroadcastss (km+(4*n))(CTX), RKM; \
151 subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
155 /* add 16-bit rotation to key rotations (mod 32) */ \
160 /* add 16-bit rotation to key rotations (mod 32) */ \
183 .section .rodata.cst16.bswap_mask, "aM", @progbits, 16
184 .align 16
186 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
187 .section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
188 .align 16
190 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
191 .section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
192 .align 16
194 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
196 .section .rodata.cst4.16_mask, "aM", @progbits, 4
197 .align 4
199 .byte 16, 16, 16, 16
200 .section .rodata.cst4.32_mask, "aM", @progbits, 4
201 .align 4
204 .section .rodata.cst4.first_mask, "aM", @progbits, 4
205 .align 4
211 .align 16
216 * RR1: blocks 3 and 4
222 * RR4: blocks 15 and 16
225 * RR1: encrypted blocks 3 and 4
231 * RR4: encrypted blocks 15 and 16
253 round(RL, RR, 4, 2);
285 .align 16
290 * RR1: encrypted blocks 3 and 4
296 * RR4: encrypted blocks 15 and 16
299 * RR1: decrypted blocks 3 and 4
305 * RR4: decrypted blocks 15 and 16
340 round(RR, RL, 4, 2);
358 vpsrldq $4, RKR, RKR;
374 vmovdqu (0*4*4)(%rdx), RL1;
375 vmovdqu (1*4*4)(%rdx), RR1;
376 vmovdqu (2*4*4)(%rdx), RL2;
377 vmovdqu (3*4*4)(%rdx), RR2;
378 vmovdqu (4*4*4)(%rdx), RL3;
379 vmovdqu (5*4*4)(%rdx), RR3;
380 vmovdqu (6*4*4)(%rdx), RL4;
381 vmovdqu (7*4*4)(%rdx), RR4;
385 vmovdqu RR1, (0*4*4)(%r11);
386 vmovdqu RL1, (1*4*4)(%r11);
387 vmovdqu RR2, (2*4*4)(%r11);
388 vmovdqu RL2, (3*4*4)(%r11);
389 vmovdqu RR3, (4*4*4)(%r11);
390 vmovdqu RL3, (5*4*4)(%r11);
391 vmovdqu RR4, (6*4*4)(%r11);
392 vmovdqu RL4, (7*4*4)(%r11);
412 vmovdqu (0*4*4)(%rdx), RL1;
413 vmovdqu (1*4*4)(%rdx), RR1;
414 vmovdqu (2*4*4)(%rdx), RL2;
415 vmovdqu (3*4*4)(%rdx), RR2;
416 vmovdqu (4*4*4)(%rdx), RL3;
417 vmovdqu (5*4*4)(%rdx), RR3;
418 vmovdqu (6*4*4)(%rdx), RL4;
419 vmovdqu (7*4*4)(%rdx), RR4;
423 vmovdqu RR1, (0*4*4)(%r11);
424 vmovdqu RL1, (1*4*4)(%r11);
425 vmovdqu RR2, (2*4*4)(%r11);
426 vmovdqu RL2, (3*4*4)(%r11);
427 vmovdqu RR3, (4*4*4)(%r11);
428 vmovdqu RL3, (5*4*4)(%r11);
429 vmovdqu RR4, (6*4*4)(%r11);
430 vmovdqu RL4, (7*4*4)(%r11);
451 vmovdqu (0*16)(%rdx), RL1;
452 vmovdqu (1*16)(%rdx), RR1;
453 vmovdqu (2*16)(%rdx), RL2;
454 vmovdqu (3*16)(%rdx), RR2;
455 vmovdqu (4*16)(%rdx), RL3;
456 vmovdqu (5*16)(%rdx), RR3;
457 vmovdqu (6*16)(%rdx), RL4;
458 vmovdqu (7*16)(%rdx), RR4;
466 vpxor 0*16+8(%r12), RL1, RL1;
467 vpxor 1*16+8(%r12), RR2, RR2;
468 vpxor 2*16+8(%r12), RL2, RL2;
469 vpxor 3*16+8(%r12), RR3, RR3;
470 vpxor 4*16+8(%r12), RL3, RL3;
471 vpxor 5*16+8(%r12), RR4, RR4;
472 vpxor 6*16+8(%r12), RL4, RL4;
474 vmovdqu RR1, (0*16)(%r11);
475 vmovdqu RL1, (1*16)(%r11);
476 vmovdqu RR2, (2*16)(%r11);
477 vmovdqu RL2, (3*16)(%r11);
478 vmovdqu RR3, (4*16)(%r11);
479 vmovdqu RL3, (5*16)(%r11);
480 vmovdqu RR4, (6*16)(%r11);
481 vmovdqu RL4, (7*16)(%r11);
505 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
508 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
542 vpxor (0*16)(%r12), RR1, RR1;
543 vpxor (1*16)(%r12), RL1, RL1;
544 vpxor (2*16)(%r12), RR2, RR2;
545 vpxor (3*16)(%r12), RL2, RL2;
546 vpxor (4*16)(%r12), RR3, RR3;
547 vpxor (5*16)(%r12), RL3, RL3;
548 vpxor (6*16)(%r12), RR4, RR4;
549 vpxor (7*16)(%r12), RL4, RL4;
550 vmovdqu RR1, (0*16)(%r11);
551 vmovdqu RL1, (1*16)(%r11);
552 vmovdqu RR2, (2*16)(%r11);
553 vmovdqu RL2, (3*16)(%r11);
554 vmovdqu RR3, (4*16)(%r11);
555 vmovdqu RL3, (5*16)(%r11);
556 vmovdqu RR4, (6*16)(%r11);
557 vmovdqu RL4, (7*16)(%r11);