Lines Matching +full:phase +full:- +full:shift
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
300 and $-16, %r13 # r13 = r13 - (r13 mod 16)
377 cmp $(255-8), %r15d
440 # able to shift 16-r13 bytes (r13 is the
456 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
461 # shift right 16-r13 bytes
468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
469 # mask out top 16-r13 bytes of xmm9
470 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
515 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
647 /* finalize: shift out the extra bytes we read, and align
648 left. since pslldq can only shift by an immediate, we use
727 mov -1(\DPTR, \DLEN, 1), %al
738 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
764 # adjust the shuffle mask pointer to be able to shift r13 bytes
765 # (r13 is the number of bytes in plaintext mod 16)
768 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
779 # shift mask accordingly
784 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
815 # shift mask accordingly
820 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
880 # Input: A and B (128-bits each, bit-reflected)
898 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
899 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
903 #first phase of the reduction
905 vpslld $30, \GH, \T3 # packed right shifting << 30
906 vpslld $25, \GH, \T4 # packed right shifting << 25
911 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
913 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
914 vpxor \T2, \GH, \GH # first phase of the reduction complete
916 #second phase of the reduction
992 i = (8-\num_initial_blocks)
999 i = (9-\num_initial_blocks)
1010 i = (9-\num_initial_blocks)
1022 i = (9-\num_initial_blocks)
1035 i = (9-\num_initial_blocks)
1043 i = (9-\num_initial_blocks)
1059 i = (8-\num_initial_blocks)
1060 j = (9-\num_initial_blocks)
1494 .rep (\REP-9)
1529 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1530 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1537 #first phase of the reduction
1540 vpslld $30, \T7, \T3 # packed right shifting << 30
1541 vpslld $25, \T7, \T4 # packed right shifting << 25
1546 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1548 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1549 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1563 #second phase of the reduction
1723 # the accumulated carry-less multiplications
1726 #first phase of the reduction
1728 vpslld $30, \T7, \T3 # packed right shifting << 30
1729 vpslld $25, \T7, \T4 # packed right shifting << 25
1734 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1736 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1737 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1741 #second phase of the reduction
1758 # u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1759 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1762 # concatenated with 0x00000001. 16-byte aligned pointer. */
1777 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1806 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1862 # Input: A and B (128-bits each, bit-reflected)
1876 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1877 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1883 #first phase of the reduction
1887 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1889 vpxor \T2, \GH, \GH # first phase of the reduction complete
1891 #second phase of the reduction
1893 …vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-…
1896 …vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no sh…
1898 vpxor \T2, \GH, \GH # second phase of the reduction complete
1940 i = (8-\num_initial_blocks)
1947 i = (9-\num_initial_blocks)
1958 i = (9-\num_initial_blocks)
1970 i = (9-\num_initial_blocks)
1984 i = (9-\num_initial_blocks)
1992 i = (9-\num_initial_blocks)
2009 i = (8-\num_initial_blocks)
2010 j = (9-\num_initial_blocks)
2452 .rep (\REP-9)
2486 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2487 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2494 #first phase of the reduction
2498 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2500 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2514 #second phase of the reduction
2516 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2519 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2521 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2695 # accumulated carry-less multiplications
2698 #first phase of the reduction
2702 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2704 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2708 #second phase of the reduction
2710 …vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs sh…
2713 …vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with n…
2715 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2726 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2729 # concatenated with 0x00000001. 16-byte aligned pointer. */
2730 # u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2745 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2774 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */