1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses.  You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15#   notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18#   notice, this list of conditions and the following disclaimer in the
19#   documentation and/or other materials provided with the
20#   distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23#   contributors may be used to endorse or promote products derived from
24#   this software without specific prior written permission.
25#
26#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41##	Erdinc Ozturk <erdinc.ozturk@intel.com>
42##	Vinodh Gopal <vinodh.gopal@intel.com>
43##	James Guilford <james.guilford@intel.com>
44##	Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
##       This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##			on Intel Architecture Processors. August, 2010.
##       The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##			on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59##       0                   1                   2                   3
60##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62##       |                             Salt  (From the SA)               |
63##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64##       |                     Initialization Vector                     |
65##       |         (This is the sequence number from IPSec header)       |
66##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67##       |                              0x1                              |
68##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
69##
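##       For illustration only (hypothetical struct and field names, using the
##       same kernel-style types as the gcm_ctx description further below), the
##       counter block above corresponds to a layout like:
##
##       struct gcm_j0 {
##               u32 salt;          /* from the SA */
##               u64 iv;            /* sequence number from the IPsec header */
##               u32 block_counter; /* 0x1, big-endian */
##       } __packed;                /* 16 bytes; passed in as *Y0 (arg5) */
##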
70##
71##
72## AAD:
73##       AAD padded to 128 bits with 0
74##       for example, assume AAD is a u32 vector
75##
76##       if AAD is 8 bytes:
##       AAD[2] = {A0, A1}
78##       padded AAD in xmm register = {A1 A0 0 0}
79##
80##       0                   1                   2                   3
81##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83##       |                               SPI (A1)                        |
84##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85##       |                     32-bit Sequence Number (A0)               |
86##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87##       |                              0x0                              |
88##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90##                                       AAD Format with 32-bit Sequence Number
91##
92##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2}
94##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96##       0                   1                   2                   3
97##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99##       |                               SPI (A2)                        |
100##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101##       |                 64-bit Extended Sequence Number {A1,A0}       |
102##       |                                                               |
103##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104##       |                              0x0                              |
105##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107##        AAD Format with 64-bit Extended Sequence Number
108##
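##       A minimal C sketch of the zero-padding described above (illustration
##       only; gcm_pad_aad is a hypothetical helper, not part of this file):
##
##       #include <string.h>
##
##       static void gcm_pad_aad(u8 block[16], const u8 *aad, unsigned int aad_len)
##       {
##               memset(block, 0, 16);
##               memcpy(block, aad, aad_len);  /* aad_len is 8, 12 or 16 here */
##       }
##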
109##
110## aadLen:
111##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##	 The code additionally supports an aadLen of 16 bytes.
113##
114## TLen:
115##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentations are used. One tab is
## for the GHASH part, two tabs are for the AES part.
120##
121
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125# constants in mergeable sections, linker can reorder and merge
126.section	.rodata.cst16.POLY, "aM", @progbits, 16
127.align 16
128POLY:            .octa     0xC2000000000000000000000000000001
129
130.section	.rodata.cst16.POLY2, "aM", @progbits, 16
131.align 16
132POLY2:           .octa     0xC20000000000000000000001C2000000
133
134.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
135.align 16
136TWOONE:          .octa     0x00000001000000000000000000000001
137
138.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139.align 16
140SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
141
142.section	.rodata.cst16.ONE, "aM", @progbits, 16
143.align 16
144ONE:             .octa     0x00000000000000000000000000000001
145
146.section	.rodata.cst16.ONEf, "aM", @progbits, 16
147.align 16
148ONEf:            .octa     0x01000000000000000000000000000000
149
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
# (the partial-block handling below addresses the mask as ALL_F-SHIFT_MASK(%r12))
152.section	.rodata, "a", @progbits
153.align 16
154SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
155ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
156                 .octa     0x00000000000000000000000000000000
157
158.section .rodata
159.align 16
160.type aad_shift_arr, @object
161.size aad_shift_arr, 272
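# entry n (for n = 0..16, 16 bytes each) is the vpshufb mask that
# _get_AAD_rest0 below uses to left-align an n-byte AAD remainder after it has
# been gathered with 8-byte/4-byte loads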
162aad_shift_arr:
163        .octa     0xffffffffffffffffffffffffffffffff
164        .octa     0xffffffffffffffffffffffffffffff0C
165        .octa     0xffffffffffffffffffffffffffff0D0C
166        .octa     0xffffffffffffffffffffffffff0E0D0C
167        .octa     0xffffffffffffffffffffffff0F0E0D0C
168        .octa     0xffffffffffffffffffffff0C0B0A0908
169        .octa     0xffffffffffffffffffff0D0C0B0A0908
170        .octa     0xffffffffffffffffff0E0D0C0B0A0908
171        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
172        .octa     0xffffffffffffff0C0B0A090807060504
173        .octa     0xffffffffffff0D0C0B0A090807060504
174        .octa     0xffffffffff0E0D0C0B0A090807060504
175        .octa     0xffffffff0F0E0D0C0B0A090807060504
176        .octa     0xffffff0C0B0A09080706050403020100
177        .octa     0xffff0D0C0B0A09080706050403020100
178        .octa     0xff0E0D0C0B0A09080706050403020100
179        .octa     0x0F0E0D0C0B0A09080706050403020100
180
181
182.text
183
184
## define the fields of the gcm aes context
186#{
187#        u8 expanded_keys[16*11] store expanded keys
188#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
189#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
190#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
191#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
192#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
193#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
194#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
195#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
196#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
197#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
198#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
199#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
200#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
201#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
202#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
203#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx;
205
206HashKey        = 16*11   # store HashKey <<1 mod poly here
207HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
208HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
209HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
210HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
211HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
212HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
213HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
214HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
215HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
216HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
217HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
218HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
219HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
220HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
221HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
222
223#define arg1 %rdi
224#define arg2 %rsi
225#define arg3 %rdx
226#define arg4 %rcx
227#define arg5 %r8
228#define arg6 %r9
229#define arg7 STACK_OFFSET+8*1(%r14)
230#define arg8 STACK_OFFSET+8*2(%r14)
231#define arg9 STACK_OFFSET+8*3(%r14)
232
233i = 0
234j = 0
235
236out_order = 0
237in_order = 1
238DEC = 0
239ENC = 1
240
241.macro define_reg r n
242reg_\r = %xmm\n
243.endm
244
245.macro setreg
246.altmacro
247define_reg i %i
248define_reg j %j
249.noaltmacro
250.endm
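# example: after "i = 3" and "setreg", reg_i expands to %xmm3, so a later
# "vpxor reg_i, reg_i, reg_i" zeroes %xmm3.  This is how the .rep blocks below
# index a varying number of xmm registers.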
251
# 4 registers are pushed onto the stack before %rsp is copied to %r14,
# so the stack arguments (arg7..arg9) are found at this offset from %r14
253STACK_OFFSET = 8*4
254
255TMP1 =   16*0    # Temporary storage for AAD
256TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
257TMP3 =   16*2    # Temporary storage for AES State 3
258TMP4 =   16*3    # Temporary storage for AES State 4
259TMP5 =   16*4    # Temporary storage for AES State 5
260TMP6 =   16*5    # Temporary storage for AES State 6
261TMP7 =   16*6    # Temporary storage for AES State 7
262TMP8 =   16*7    # Temporary storage for AES State 8
263
264VARIABLE_OFFSET = 16*8
265
266################################
267# Utility Macros
268################################
269
270# Encryption of a single block
271.macro ENCRYPT_SINGLE_BLOCK XMM0
272                vpxor    (arg1), \XMM0, \XMM0
273		i = 1
274		setreg
275.rep 9
276                vaesenc  16*i(arg1), \XMM0, \XMM0
277		i = (i+1)
278		setreg
279.endr
280                vaesenclast 16*10(arg1), \XMM0, \XMM0
281.endm
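# For reference, a rough C intrinsics equivalent of ENCRYPT_SINGLE_BLOCK above
# (an illustrative sketch only; it assumes AES-128, i.e. the 11 round keys laid
# out contiguously at the start of the context, as the macro does):
#
#       #include <immintrin.h>
#
#       static __m128i aes128_encrypt_block(const __m128i *rk, __m128i blk)
#       {
#               int i;
#
#               blk = _mm_xor_si128(blk, _mm_loadu_si128(&rk[0]));
#               for (i = 1; i < 10; i++)
#                       blk = _mm_aesenc_si128(blk, _mm_loadu_si128(&rk[i]));
#               return _mm_aesenclast_si128(blk, _mm_loadu_si128(&rk[10]));
#       }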
282
283#ifdef CONFIG_AS_AVX
284###############################################################################
285# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
286# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly (i.e. >>1)
288# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
289# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
290###############################################################################
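# Note: the macro below uses the Karatsuba identity.  With A = a1*x^64 + a0 and
# B = b1*x^64 + b0 (all arithmetic carry-less, i.e. XOR for addition):
#
#       A*B           = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
#       a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
#
# so three VPCLMULQDQ multiplications suffice; the 256-bit product is then
# folded back to 128 bits by the two reduction "phases" marked in the code.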
291.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
292
293        vpshufd         $0b01001110, \GH, \T2
294        vpshufd         $0b01001110, \HK, \T3
295        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
296        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
297
298        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
299        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
300        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
301        vpxor           \GH, \T2,\T2
302        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
303
304        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
305        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
306        vpxor           \T3, \GH, \GH
307        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
308
309        #first phase of the reduction
310        vpslld  $31, \GH, \T2                   # packed right shifting << 31
        vpslld  $30, \GH, \T3                   # packed right shifting << 30
        vpslld  $25, \GH, \T4                   # packed right shifting << 25
313
314        vpxor   \T3, \T2, \T2                   # xor the shifted versions
315        vpxor   \T4, \T2, \T2
316
317        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
318
319        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
320        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
321
322        #second phase of the reduction
323
324        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
325        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
326        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
327        vpxor   \T3, \T2, \T2                   # xor the shifted versions
328        vpxor   \T4, \T2, \T2
329
330        vpxor   \T5, \T2, \T2
331        vpxor   \T2, \GH, \GH
332        vpxor   \T1, \GH, \GH                   # the result is in GH
333
334
335.endm
336
337.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
338
        # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
340        vmovdqa  \HK, \T5
341
342        vpshufd  $0b01001110, \T5, \T1
343        vpxor    \T5, \T1, \T1
344        vmovdqa  \T1, HashKey_k(arg1)
345
346        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
347        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
348        vpshufd  $0b01001110, \T5, \T1
349        vpxor    \T5, \T1, \T1
350        vmovdqa  \T1, HashKey_2_k(arg1)
351
352        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
353        vmovdqa  \T5, HashKey_3(arg1)
354        vpshufd  $0b01001110, \T5, \T1
355        vpxor    \T5, \T1, \T1
356        vmovdqa  \T1, HashKey_3_k(arg1)
357
358        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
359        vmovdqa  \T5, HashKey_4(arg1)
360        vpshufd  $0b01001110, \T5, \T1
361        vpxor    \T5, \T1, \T1
362        vmovdqa  \T1, HashKey_4_k(arg1)
363
364        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
365        vmovdqa  \T5, HashKey_5(arg1)
366        vpshufd  $0b01001110, \T5, \T1
367        vpxor    \T5, \T1, \T1
368        vmovdqa  \T1, HashKey_5_k(arg1)
369
370        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
371        vmovdqa  \T5, HashKey_6(arg1)
372        vpshufd  $0b01001110, \T5, \T1
373        vpxor    \T5, \T1, \T1
374        vmovdqa  \T1, HashKey_6_k(arg1)
375
376        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
377        vmovdqa  \T5, HashKey_7(arg1)
378        vpshufd  $0b01001110, \T5, \T1
379        vpxor    \T5, \T1, \T1
380        vmovdqa  \T1, HashKey_7_k(arg1)
381
382        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
383        vmovdqa  \T5, HashKey_8(arg1)
384        vpshufd  $0b01001110, \T5, \T1
385        vpxor    \T5, \T1, \T1
386        vmovdqa  \T1, HashKey_8_k(arg1)
387
388.endm
389
390## if a = number of total plaintext bytes
391## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
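##
## In rough C-like pseudocode (an illustrative sketch only; GHASH_MUL, AES_ENC
## and byteswap are shorthand, not real helpers):
##
##       ghash = 0; fold the zero-padded AAD into ghash, one 16-byte block at a time;
##       ctr   = byteswap(*Y0);
##       for (k = 0; k < num_initial_blocks; k++) {
##               ctr   += 1;
##               out[k] = in[k] ^ AES_ENC(key, byteswap(ctr));
##               c      = (ENC_DEC == ENC) ? out[k] : in[k];        /* ciphertext */
##               ghash  = GHASH_MUL(ghash ^ byteswap(c), HashKey);
##       }
##       /* if at least 128 bytes remain, 8 further counter blocks are encrypted
##          here as well, so that the 8-wide main loop starts fully loaded */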
396
397.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
398	i = (8-\num_initial_blocks)
399	j = 0
400	setreg
401
402	mov     arg6, %r10                      # r10 = AAD
403	mov     arg7, %r12                      # r12 = aadLen
404
405
406	mov     %r12, %r11
407
408	vpxor   reg_j, reg_j, reg_j
409	vpxor   reg_i, reg_i, reg_i
410	cmp     $16, %r11
411	jl      _get_AAD_rest8\@
412_get_AAD_blocks\@:
413	vmovdqu (%r10), reg_i
414	vpshufb SHUF_MASK(%rip), reg_i, reg_i
415	vpxor   reg_i, reg_j, reg_j
416	GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
417	add     $16, %r10
418	sub     $16, %r12
419	sub     $16, %r11
420	cmp     $16, %r11
421	jge     _get_AAD_blocks\@
422	vmovdqu reg_j, reg_i
423	cmp     $0, %r11
424	je      _get_AAD_done\@
425
426	vpxor   reg_i, reg_i, reg_i
427
428	/* read the last <16B of AAD. since we have at least 4B of
429	data right after the AAD (the ICV, and maybe some CT), we can
430	read 4B/8B blocks safely, and then get rid of the extra stuff */
431_get_AAD_rest8\@:
432	cmp     $4, %r11
433	jle     _get_AAD_rest4\@
434	movq    (%r10), \T1
435	add     $8, %r10
436	sub     $8, %r11
437	vpslldq $8, \T1, \T1
438	vpsrldq $8, reg_i, reg_i
439	vpxor   \T1, reg_i, reg_i
440	jmp     _get_AAD_rest8\@
441_get_AAD_rest4\@:
442	cmp     $0, %r11
443	jle      _get_AAD_rest0\@
444	mov     (%r10), %eax
445	movq    %rax, \T1
446	add     $4, %r10
447	sub     $4, %r11
448	vpslldq $12, \T1, \T1
449	vpsrldq $4, reg_i, reg_i
450	vpxor   \T1, reg_i, reg_i
451_get_AAD_rest0\@:
452	/* finalize: shift out the extra bytes we read, and align
453	left. since pslldq can only shift by an immediate, we use
454	vpshufb and an array of shuffle masks */
455	movq    %r12, %r11
456	salq    $4, %r11
457	movdqu  aad_shift_arr(%r11), \T1
458	vpshufb \T1, reg_i, reg_i
459_get_AAD_rest_final\@:
460	vpshufb SHUF_MASK(%rip), reg_i, reg_i
461	vpxor   reg_j, reg_i, reg_i
462	GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
463
464_get_AAD_done\@:
465	# initialize the data pointer offset as zero
466	xor     %r11d, %r11d
467
468	# start AES for num_initial_blocks blocks
469	mov     arg5, %rax                     # rax = *Y0
470	vmovdqu (%rax), \CTR                   # CTR = Y0
471	vpshufb SHUF_MASK(%rip), \CTR, \CTR
472
473
474	i = (9-\num_initial_blocks)
475	setreg
476.rep \num_initial_blocks
477                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
478                vmovdqa \CTR, reg_i
479                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
480	i = (i+1)
481	setreg
482.endr
483
484	vmovdqa  (arg1), \T_key
485	i = (9-\num_initial_blocks)
486	setreg
487.rep \num_initial_blocks
488                vpxor   \T_key, reg_i, reg_i
489	i = (i+1)
490	setreg
491.endr
492
493	j = 1
494	setreg
495.rep 9
496	vmovdqa  16*j(arg1), \T_key
497	i = (9-\num_initial_blocks)
498	setreg
499.rep \num_initial_blocks
500        vaesenc \T_key, reg_i, reg_i
501	i = (i+1)
502	setreg
503.endr
504
505	j = (j+1)
506	setreg
507.endr
508
509
510	vmovdqa  16*10(arg1), \T_key
511	i = (9-\num_initial_blocks)
512	setreg
513.rep \num_initial_blocks
514        vaesenclast      \T_key, reg_i, reg_i
515	i = (i+1)
516	setreg
517.endr
518
519	i = (9-\num_initial_blocks)
520	setreg
521.rep \num_initial_blocks
522                vmovdqu (arg3, %r11), \T1
523                vpxor   \T1, reg_i, reg_i
524                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
525                add     $16, %r11
526.if  \ENC_DEC == DEC
527                vmovdqa \T1, reg_i
528.endif
529                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
530	i = (i+1)
531	setreg
532.endr
533
534
535	i = (8-\num_initial_blocks)
536	j = (9-\num_initial_blocks)
537	setreg
538
539.rep \num_initial_blocks
540        vpxor    reg_i, reg_j, reg_j
541        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
542	i = (i+1)
543	j = (j+1)
544	setreg
545.endr
546        # XMM8 has the combined result here
547
548        vmovdqa  \XMM8, TMP1(%rsp)
549        vmovdqa  \XMM8, \T3
550
551        cmp     $128, %r13
552        jl      _initial_blocks_done\@                  # no need for precomputed constants
553
554###############################################################################
# encrypt the next 8 blocks up front; the main loop will GHASH these ciphertext
# blocks while encrypting the following 8
556                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
557                vmovdqa  \CTR, \XMM1
558                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
559
560                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
561                vmovdqa  \CTR, \XMM2
562                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
563
564                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
565                vmovdqa  \CTR, \XMM3
566                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
567
568                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
569                vmovdqa  \CTR, \XMM4
570                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
571
572                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
573                vmovdqa  \CTR, \XMM5
574                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
575
576                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
577                vmovdqa  \CTR, \XMM6
578                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
579
580                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
581                vmovdqa  \CTR, \XMM7
582                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
583
584                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
585                vmovdqa  \CTR, \XMM8
586                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
587
588                vmovdqa  (arg1), \T_key
589                vpxor    \T_key, \XMM1, \XMM1
590                vpxor    \T_key, \XMM2, \XMM2
591                vpxor    \T_key, \XMM3, \XMM3
592                vpxor    \T_key, \XMM4, \XMM4
593                vpxor    \T_key, \XMM5, \XMM5
594                vpxor    \T_key, \XMM6, \XMM6
595                vpxor    \T_key, \XMM7, \XMM7
596                vpxor    \T_key, \XMM8, \XMM8
597
598		i = 1
599		setreg
600.rep    9       # do 9 rounds
601                vmovdqa  16*i(arg1), \T_key
602                vaesenc  \T_key, \XMM1, \XMM1
603                vaesenc  \T_key, \XMM2, \XMM2
604                vaesenc  \T_key, \XMM3, \XMM3
605                vaesenc  \T_key, \XMM4, \XMM4
606                vaesenc  \T_key, \XMM5, \XMM5
607                vaesenc  \T_key, \XMM6, \XMM6
608                vaesenc  \T_key, \XMM7, \XMM7
609                vaesenc  \T_key, \XMM8, \XMM8
610		i = (i+1)
611		setreg
612.endr
613
614
615                vmovdqa  16*i(arg1), \T_key
616                vaesenclast  \T_key, \XMM1, \XMM1
617                vaesenclast  \T_key, \XMM2, \XMM2
618                vaesenclast  \T_key, \XMM3, \XMM3
619                vaesenclast  \T_key, \XMM4, \XMM4
620                vaesenclast  \T_key, \XMM5, \XMM5
621                vaesenclast  \T_key, \XMM6, \XMM6
622                vaesenclast  \T_key, \XMM7, \XMM7
623                vaesenclast  \T_key, \XMM8, \XMM8
624
625                vmovdqu  (arg3, %r11), \T1
626                vpxor    \T1, \XMM1, \XMM1
627                vmovdqu  \XMM1, (arg2 , %r11)
628                .if   \ENC_DEC == DEC
629                vmovdqa  \T1, \XMM1
630                .endif
631
632                vmovdqu  16*1(arg3, %r11), \T1
633                vpxor    \T1, \XMM2, \XMM2
634                vmovdqu  \XMM2, 16*1(arg2 , %r11)
635                .if   \ENC_DEC == DEC
636                vmovdqa  \T1, \XMM2
637                .endif
638
639                vmovdqu  16*2(arg3, %r11), \T1
640                vpxor    \T1, \XMM3, \XMM3
641                vmovdqu  \XMM3, 16*2(arg2 , %r11)
642                .if   \ENC_DEC == DEC
643                vmovdqa  \T1, \XMM3
644                .endif
645
646                vmovdqu  16*3(arg3, %r11), \T1
647                vpxor    \T1, \XMM4, \XMM4
648                vmovdqu  \XMM4, 16*3(arg2 , %r11)
649                .if   \ENC_DEC == DEC
650                vmovdqa  \T1, \XMM4
651                .endif
652
653                vmovdqu  16*4(arg3, %r11), \T1
654                vpxor    \T1, \XMM5, \XMM5
655                vmovdqu  \XMM5, 16*4(arg2 , %r11)
656                .if   \ENC_DEC == DEC
657                vmovdqa  \T1, \XMM5
658                .endif
659
660                vmovdqu  16*5(arg3, %r11), \T1
661                vpxor    \T1, \XMM6, \XMM6
662                vmovdqu  \XMM6, 16*5(arg2 , %r11)
663                .if   \ENC_DEC == DEC
664                vmovdqa  \T1, \XMM6
665                .endif
666
667                vmovdqu  16*6(arg3, %r11), \T1
668                vpxor    \T1, \XMM7, \XMM7
669                vmovdqu  \XMM7, 16*6(arg2 , %r11)
670                .if   \ENC_DEC == DEC
671                vmovdqa  \T1, \XMM7
672                .endif
673
674                vmovdqu  16*7(arg3, %r11), \T1
675                vpxor    \T1, \XMM8, \XMM8
676                vmovdqu  \XMM8, 16*7(arg2 , %r11)
677                .if   \ENC_DEC == DEC
678                vmovdqa  \T1, \XMM8
679                .endif
680
681                add     $128, %r11
682
683                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
684                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
685                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
686                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
687                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
688                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
689                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
690                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
691                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
692
693###############################################################################
694
695_initial_blocks_done\@:
696
697.endm
698
699# encrypt 8 blocks at a time
700# ghash the 8 previously encrypted ciphertext blocks
701# arg1, arg2, arg3 are used as pointers only, not modified
702# r11 is the data offset value
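#
# In rough pseudocode (an illustrative sketch of the interleaving only):
#
#       /* on entry XMM1..XMM8 hold the 8 ciphertext blocks of the previous
#          iteration, with the running GHASH value already folded into XMM1 */
#       hash = XMM1*HashKey^8 + XMM2*HashKey^7 + ... + XMM8*HashKey^1
#       hash = reduce_mod_poly(hash)
#       /* meanwhile 8 new counter blocks are AES-encrypted and XORed with the
#          next 128 bytes of input; the AES rounds are interleaved with the
#          carry-less multiplications above to hide their latencies */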
703.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
704
705        vmovdqa \XMM1, \T2
706        vmovdqa \XMM2, TMP2(%rsp)
707        vmovdqa \XMM3, TMP3(%rsp)
708        vmovdqa \XMM4, TMP4(%rsp)
709        vmovdqa \XMM5, TMP5(%rsp)
710        vmovdqa \XMM6, TMP6(%rsp)
711        vmovdqa \XMM7, TMP7(%rsp)
712        vmovdqa \XMM8, TMP8(%rsp)
713
714.if \loop_idx == in_order
715                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
716                vpaddd  ONE(%rip), \XMM1, \XMM2
717                vpaddd  ONE(%rip), \XMM2, \XMM3
718                vpaddd  ONE(%rip), \XMM3, \XMM4
719                vpaddd  ONE(%rip), \XMM4, \XMM5
720                vpaddd  ONE(%rip), \XMM5, \XMM6
721                vpaddd  ONE(%rip), \XMM6, \XMM7
722                vpaddd  ONE(%rip), \XMM7, \XMM8
723                vmovdqa \XMM8, \CTR
724
725                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
726                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
727                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
728                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
729                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
730                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
731                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
732                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
733.else
734                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
735                vpaddd  ONEf(%rip), \XMM1, \XMM2
736                vpaddd  ONEf(%rip), \XMM2, \XMM3
737                vpaddd  ONEf(%rip), \XMM3, \XMM4
738                vpaddd  ONEf(%rip), \XMM4, \XMM5
739                vpaddd  ONEf(%rip), \XMM5, \XMM6
740                vpaddd  ONEf(%rip), \XMM6, \XMM7
741                vpaddd  ONEf(%rip), \XMM7, \XMM8
742                vmovdqa \XMM8, \CTR
743.endif
744
745
746        #######################################################################
747
748                vmovdqu (arg1), \T1
749                vpxor   \T1, \XMM1, \XMM1
750                vpxor   \T1, \XMM2, \XMM2
751                vpxor   \T1, \XMM3, \XMM3
752                vpxor   \T1, \XMM4, \XMM4
753                vpxor   \T1, \XMM5, \XMM5
754                vpxor   \T1, \XMM6, \XMM6
755                vpxor   \T1, \XMM7, \XMM7
756                vpxor   \T1, \XMM8, \XMM8
757
758        #######################################################################
759
760
761
762
763
764                vmovdqu 16*1(arg1), \T1
765                vaesenc \T1, \XMM1, \XMM1
766                vaesenc \T1, \XMM2, \XMM2
767                vaesenc \T1, \XMM3, \XMM3
768                vaesenc \T1, \XMM4, \XMM4
769                vaesenc \T1, \XMM5, \XMM5
770                vaesenc \T1, \XMM6, \XMM6
771                vaesenc \T1, \XMM7, \XMM7
772                vaesenc \T1, \XMM8, \XMM8
773
774                vmovdqu 16*2(arg1), \T1
775                vaesenc \T1, \XMM1, \XMM1
776                vaesenc \T1, \XMM2, \XMM2
777                vaesenc \T1, \XMM3, \XMM3
778                vaesenc \T1, \XMM4, \XMM4
779                vaesenc \T1, \XMM5, \XMM5
780                vaesenc \T1, \XMM6, \XMM6
781                vaesenc \T1, \XMM7, \XMM7
782                vaesenc \T1, \XMM8, \XMM8
783
784
785        #######################################################################
786
787        vmovdqa         HashKey_8(arg1), \T5
788        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
789        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
790
791        vpshufd         $0b01001110, \T2, \T6
792        vpxor           \T2, \T6, \T6
793
794        vmovdqa         HashKey_8_k(arg1), \T5
795        vpclmulqdq      $0x00, \T5, \T6, \T6
796
797                vmovdqu 16*3(arg1), \T1
798                vaesenc \T1, \XMM1, \XMM1
799                vaesenc \T1, \XMM2, \XMM2
800                vaesenc \T1, \XMM3, \XMM3
801                vaesenc \T1, \XMM4, \XMM4
802                vaesenc \T1, \XMM5, \XMM5
803                vaesenc \T1, \XMM6, \XMM6
804                vaesenc \T1, \XMM7, \XMM7
805                vaesenc \T1, \XMM8, \XMM8
806
807        vmovdqa         TMP2(%rsp), \T1
808        vmovdqa         HashKey_7(arg1), \T5
809        vpclmulqdq      $0x11, \T5, \T1, \T3
810        vpxor           \T3, \T4, \T4
811        vpclmulqdq      $0x00, \T5, \T1, \T3
812        vpxor           \T3, \T7, \T7
813
814        vpshufd         $0b01001110, \T1, \T3
815        vpxor           \T1, \T3, \T3
816        vmovdqa         HashKey_7_k(arg1), \T5
817        vpclmulqdq      $0x10, \T5, \T3, \T3
818        vpxor           \T3, \T6, \T6
819
820                vmovdqu 16*4(arg1), \T1
821                vaesenc \T1, \XMM1, \XMM1
822                vaesenc \T1, \XMM2, \XMM2
823                vaesenc \T1, \XMM3, \XMM3
824                vaesenc \T1, \XMM4, \XMM4
825                vaesenc \T1, \XMM5, \XMM5
826                vaesenc \T1, \XMM6, \XMM6
827                vaesenc \T1, \XMM7, \XMM7
828                vaesenc \T1, \XMM8, \XMM8
829
830        #######################################################################
831
832        vmovdqa         TMP3(%rsp), \T1
833        vmovdqa         HashKey_6(arg1), \T5
834        vpclmulqdq      $0x11, \T5, \T1, \T3
835        vpxor           \T3, \T4, \T4
836        vpclmulqdq      $0x00, \T5, \T1, \T3
837        vpxor           \T3, \T7, \T7
838
839        vpshufd         $0b01001110, \T1, \T3
840        vpxor           \T1, \T3, \T3
841        vmovdqa         HashKey_6_k(arg1), \T5
842        vpclmulqdq      $0x10, \T5, \T3, \T3
843        vpxor           \T3, \T6, \T6
844
845                vmovdqu 16*5(arg1), \T1
846                vaesenc \T1, \XMM1, \XMM1
847                vaesenc \T1, \XMM2, \XMM2
848                vaesenc \T1, \XMM3, \XMM3
849                vaesenc \T1, \XMM4, \XMM4
850                vaesenc \T1, \XMM5, \XMM5
851                vaesenc \T1, \XMM6, \XMM6
852                vaesenc \T1, \XMM7, \XMM7
853                vaesenc \T1, \XMM8, \XMM8
854
855        vmovdqa         TMP4(%rsp), \T1
856        vmovdqa         HashKey_5(arg1), \T5
857        vpclmulqdq      $0x11, \T5, \T1, \T3
858        vpxor           \T3, \T4, \T4
859        vpclmulqdq      $0x00, \T5, \T1, \T3
860        vpxor           \T3, \T7, \T7
861
862        vpshufd         $0b01001110, \T1, \T3
863        vpxor           \T1, \T3, \T3
864        vmovdqa         HashKey_5_k(arg1), \T5
865        vpclmulqdq      $0x10, \T5, \T3, \T3
866        vpxor           \T3, \T6, \T6
867
868                vmovdqu 16*6(arg1), \T1
869                vaesenc \T1, \XMM1, \XMM1
870                vaesenc \T1, \XMM2, \XMM2
871                vaesenc \T1, \XMM3, \XMM3
872                vaesenc \T1, \XMM4, \XMM4
873                vaesenc \T1, \XMM5, \XMM5
874                vaesenc \T1, \XMM6, \XMM6
875                vaesenc \T1, \XMM7, \XMM7
876                vaesenc \T1, \XMM8, \XMM8
877
878
879        vmovdqa         TMP5(%rsp), \T1
880        vmovdqa         HashKey_4(arg1), \T5
881        vpclmulqdq      $0x11, \T5, \T1, \T3
882        vpxor           \T3, \T4, \T4
883        vpclmulqdq      $0x00, \T5, \T1, \T3
884        vpxor           \T3, \T7, \T7
885
886        vpshufd         $0b01001110, \T1, \T3
887        vpxor           \T1, \T3, \T3
888        vmovdqa         HashKey_4_k(arg1), \T5
889        vpclmulqdq      $0x10, \T5, \T3, \T3
890        vpxor           \T3, \T6, \T6
891
892                vmovdqu 16*7(arg1), \T1
893                vaesenc \T1, \XMM1, \XMM1
894                vaesenc \T1, \XMM2, \XMM2
895                vaesenc \T1, \XMM3, \XMM3
896                vaesenc \T1, \XMM4, \XMM4
897                vaesenc \T1, \XMM5, \XMM5
898                vaesenc \T1, \XMM6, \XMM6
899                vaesenc \T1, \XMM7, \XMM7
900                vaesenc \T1, \XMM8, \XMM8
901
902        vmovdqa         TMP6(%rsp), \T1
903        vmovdqa         HashKey_3(arg1), \T5
904        vpclmulqdq      $0x11, \T5, \T1, \T3
905        vpxor           \T3, \T4, \T4
906        vpclmulqdq      $0x00, \T5, \T1, \T3
907        vpxor           \T3, \T7, \T7
908
909        vpshufd         $0b01001110, \T1, \T3
910        vpxor           \T1, \T3, \T3
911        vmovdqa         HashKey_3_k(arg1), \T5
912        vpclmulqdq      $0x10, \T5, \T3, \T3
913        vpxor           \T3, \T6, \T6
914
915
916                vmovdqu 16*8(arg1), \T1
917                vaesenc \T1, \XMM1, \XMM1
918                vaesenc \T1, \XMM2, \XMM2
919                vaesenc \T1, \XMM3, \XMM3
920                vaesenc \T1, \XMM4, \XMM4
921                vaesenc \T1, \XMM5, \XMM5
922                vaesenc \T1, \XMM6, \XMM6
923                vaesenc \T1, \XMM7, \XMM7
924                vaesenc \T1, \XMM8, \XMM8
925
926        vmovdqa         TMP7(%rsp), \T1
927        vmovdqa         HashKey_2(arg1), \T5
928        vpclmulqdq      $0x11, \T5, \T1, \T3
929        vpxor           \T3, \T4, \T4
930        vpclmulqdq      $0x00, \T5, \T1, \T3
931        vpxor           \T3, \T7, \T7
932
933        vpshufd         $0b01001110, \T1, \T3
934        vpxor           \T1, \T3, \T3
935        vmovdqa         HashKey_2_k(arg1), \T5
936        vpclmulqdq      $0x10, \T5, \T3, \T3
937        vpxor           \T3, \T6, \T6
938
939        #######################################################################
940
941                vmovdqu 16*9(arg1), \T5
942                vaesenc \T5, \XMM1, \XMM1
943                vaesenc \T5, \XMM2, \XMM2
944                vaesenc \T5, \XMM3, \XMM3
945                vaesenc \T5, \XMM4, \XMM4
946                vaesenc \T5, \XMM5, \XMM5
947                vaesenc \T5, \XMM6, \XMM6
948                vaesenc \T5, \XMM7, \XMM7
949                vaesenc \T5, \XMM8, \XMM8
950
951        vmovdqa         TMP8(%rsp), \T1
952        vmovdqa         HashKey(arg1), \T5
953        vpclmulqdq      $0x11, \T5, \T1, \T3
954        vpxor           \T3, \T4, \T4
955        vpclmulqdq      $0x00, \T5, \T1, \T3
956        vpxor           \T3, \T7, \T7
957
958        vpshufd         $0b01001110, \T1, \T3
959        vpxor           \T1, \T3, \T3
960        vmovdqa         HashKey_k(arg1), \T5
961        vpclmulqdq      $0x10, \T5, \T3, \T3
962        vpxor           \T3, \T6, \T6
963
964        vpxor           \T4, \T6, \T6
965        vpxor           \T7, \T6, \T6
966
967                vmovdqu 16*10(arg1), \T5
968
969	i = 0
970	j = 1
971	setreg
972.rep 8
973		vpxor	16*i(arg3, %r11), \T5, \T2
974                .if \ENC_DEC == ENC
975                vaesenclast     \T2, reg_j, reg_j
976                .else
977                vaesenclast     \T2, reg_j, \T3
978                vmovdqu 16*i(arg3, %r11), reg_j
979                vmovdqu \T3, 16*i(arg2, %r11)
980                .endif
981	i = (i+1)
982	j = (j+1)
983	setreg
984.endr
985	#######################################################################
986
987
988	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
990	vpxor	\T3, \T7, \T7
991	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
992
993
994
995	#######################################################################
996	#first phase of the reduction
997	#######################################################################
998        vpslld  $31, \T7, \T2                           # packed right shifting << 31
        vpslld  $30, \T7, \T3                           # packed right shifting << 30
        vpslld  $25, \T7, \T4                           # packed right shifting << 25
1001
1002        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1003        vpxor   \T4, \T2, \T2
1004
1005        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
1006
1007        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
1008        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
1009	#######################################################################
1010                .if \ENC_DEC == ENC
1011		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
1012		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
1013		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
1014		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
1015		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
1016		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
1017		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
1018		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
1019                .endif
1020
1021	#######################################################################
1022	#second phase of the reduction
1023        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
1024        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
1025        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
1026        vpxor   \T3, \T2, \T2                           # xor the shifted versions
1027        vpxor   \T4, \T2, \T2
1028
1029        vpxor   \T1, \T2, \T2
1030        vpxor   \T2, \T7, \T7
1031        vpxor   \T7, \T6, \T6                           # the result is in T6
1032	#######################################################################
1033
1034		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
1035		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
1036		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
1037		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
1038		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
1039		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
1040		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
1041		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
1042
1043
1044	vpxor	\T6, \XMM1, \XMM1
1045
1046
1047
1048.endm
1049
1050
# GHASH the last 8 ciphertext blocks.
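# In effect (illustrative formula only) it computes
#       hash = reduce_mod_poly(XMM1*HashKey^8 + XMM2*HashKey^7 + ... + XMM8*HashKey^1)
# with the same Karatsuba split as GHASH_MUL_AVX; the result is left in T6.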
1052.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1053
1054        ## Karatsuba Method
1055
1056
1057        vpshufd         $0b01001110, \XMM1, \T2
1058        vpxor           \XMM1, \T2, \T2
1059        vmovdqa         HashKey_8(arg1), \T5
1060        vpclmulqdq      $0x11, \T5, \XMM1, \T6
1061        vpclmulqdq      $0x00, \T5, \XMM1, \T7
1062
1063        vmovdqa         HashKey_8_k(arg1), \T3
1064        vpclmulqdq      $0x00, \T3, \T2, \XMM1
1065
1066        ######################
1067
1068        vpshufd         $0b01001110, \XMM2, \T2
1069        vpxor           \XMM2, \T2, \T2
1070        vmovdqa         HashKey_7(arg1), \T5
1071        vpclmulqdq      $0x11, \T5, \XMM2, \T4
1072        vpxor           \T4, \T6, \T6
1073
1074        vpclmulqdq      $0x00, \T5, \XMM2, \T4
1075        vpxor           \T4, \T7, \T7
1076
1077        vmovdqa         HashKey_7_k(arg1), \T3
1078        vpclmulqdq      $0x00, \T3, \T2, \T2
1079        vpxor           \T2, \XMM1, \XMM1
1080
1081        ######################
1082
1083        vpshufd         $0b01001110, \XMM3, \T2
1084        vpxor           \XMM3, \T2, \T2
1085        vmovdqa         HashKey_6(arg1), \T5
1086        vpclmulqdq      $0x11, \T5, \XMM3, \T4
1087        vpxor           \T4, \T6, \T6
1088
1089        vpclmulqdq      $0x00, \T5, \XMM3, \T4
1090        vpxor           \T4, \T7, \T7
1091
1092        vmovdqa         HashKey_6_k(arg1), \T3
1093        vpclmulqdq      $0x00, \T3, \T2, \T2
1094        vpxor           \T2, \XMM1, \XMM1
1095
1096        ######################
1097
1098        vpshufd         $0b01001110, \XMM4, \T2
1099        vpxor           \XMM4, \T2, \T2
1100        vmovdqa         HashKey_5(arg1), \T5
1101        vpclmulqdq      $0x11, \T5, \XMM4, \T4
1102        vpxor           \T4, \T6, \T6
1103
1104        vpclmulqdq      $0x00, \T5, \XMM4, \T4
1105        vpxor           \T4, \T7, \T7
1106
1107        vmovdqa         HashKey_5_k(arg1), \T3
1108        vpclmulqdq      $0x00, \T3, \T2, \T2
1109        vpxor           \T2, \XMM1, \XMM1
1110
1111        ######################
1112
1113        vpshufd         $0b01001110, \XMM5, \T2
1114        vpxor           \XMM5, \T2, \T2
1115        vmovdqa         HashKey_4(arg1), \T5
1116        vpclmulqdq      $0x11, \T5, \XMM5, \T4
1117        vpxor           \T4, \T6, \T6
1118
1119        vpclmulqdq      $0x00, \T5, \XMM5, \T4
1120        vpxor           \T4, \T7, \T7
1121
1122        vmovdqa         HashKey_4_k(arg1), \T3
1123        vpclmulqdq      $0x00, \T3, \T2, \T2
1124        vpxor           \T2, \XMM1, \XMM1
1125
1126        ######################
1127
1128        vpshufd         $0b01001110, \XMM6, \T2
1129        vpxor           \XMM6, \T2, \T2
1130        vmovdqa         HashKey_3(arg1), \T5
1131        vpclmulqdq      $0x11, \T5, \XMM6, \T4
1132        vpxor           \T4, \T6, \T6
1133
1134        vpclmulqdq      $0x00, \T5, \XMM6, \T4
1135        vpxor           \T4, \T7, \T7
1136
1137        vmovdqa         HashKey_3_k(arg1), \T3
1138        vpclmulqdq      $0x00, \T3, \T2, \T2
1139        vpxor           \T2, \XMM1, \XMM1
1140
1141        ######################
1142
1143        vpshufd         $0b01001110, \XMM7, \T2
1144        vpxor           \XMM7, \T2, \T2
1145        vmovdqa         HashKey_2(arg1), \T5
1146        vpclmulqdq      $0x11, \T5, \XMM7, \T4
1147        vpxor           \T4, \T6, \T6
1148
1149        vpclmulqdq      $0x00, \T5, \XMM7, \T4
1150        vpxor           \T4, \T7, \T7
1151
1152        vmovdqa         HashKey_2_k(arg1), \T3
1153        vpclmulqdq      $0x00, \T3, \T2, \T2
1154        vpxor           \T2, \XMM1, \XMM1
1155
1156        ######################
1157
1158        vpshufd         $0b01001110, \XMM8, \T2
1159        vpxor           \XMM8, \T2, \T2
1160        vmovdqa         HashKey(arg1), \T5
1161        vpclmulqdq      $0x11, \T5, \XMM8, \T4
1162        vpxor           \T4, \T6, \T6
1163
1164        vpclmulqdq      $0x00, \T5, \XMM8, \T4
1165        vpxor           \T4, \T7, \T7
1166
1167        vmovdqa         HashKey_k(arg1), \T3
1168        vpclmulqdq      $0x00, \T3, \T2, \T2
1169
1170        vpxor           \T2, \XMM1, \XMM1
1171        vpxor           \T6, \XMM1, \XMM1
1172        vpxor           \T7, \XMM1, \T2
1173
1174
1175
1176
1177        vpslldq $8, \T2, \T4
1178        vpsrldq $8, \T2, \T2
1179
1180        vpxor   \T4, \T7, \T7
1181        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
1182				# the accumulated carry-less multiplications
1183
1184        #######################################################################
1185        #first phase of the reduction
1186        vpslld  $31, \T7, \T2   # packed right shifting << 31
        vpslld  $30, \T7, \T3   # packed right shifting << 30
        vpslld  $25, \T7, \T4   # packed right shifting << 25
1189
1190        vpxor   \T3, \T2, \T2   # xor the shifted versions
1191        vpxor   \T4, \T2, \T2
1192
1193        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
1194
1195        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
1196        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
1197        #######################################################################
1198
1199
1200        #second phase of the reduction
1201        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
1202        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
1203        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
1204        vpxor   \T3, \T2, \T2   # xor the shifted versions
1205        vpxor   \T4, \T2, \T2
1206
1207        vpxor   \T1, \T2, \T2
1208        vpxor   \T2, \T7, \T7
1209        vpxor   \T7, \T6, \T6   # the result is in T6
1210
1211.endm
1212
1213
1214# combined for GCM encrypt and decrypt functions
1215# clobbering all xmm registers
1216# clobbering r10, r11, r12, r13, r14, r15
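#
# Rough flow (an illustrative outline only):
#
#       GHASH the AAD and encrypt/GHASH 0..7 initial blocks  (INITIAL_BLOCKS_AVX)
#       while at least 128 bytes remain:
#               encrypt 8 blocks, GHASH the previous 8       (GHASH_8_ENCRYPT_8_PARALLEL_AVX)
#       GHASH the last 8 ciphertext blocks                   (GHASH_LAST_8_AVX)
#       handle a trailing partial (<16 byte) block
#       fold len(A)||len(C) into the hash, compute T = E(K, Y0) XOR hash,
#       and write auth_tag_len bytes of T to authTag         (arg8/arg9)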
1217.macro  GCM_ENC_DEC_AVX     ENC_DEC
1218
        # the registers pushed here (8 bytes each) must add up to STACK_OFFSET
1220        push    %r12
1221        push    %r13
1222        push    %r14
1223        push    %r15
1224
1225        mov     %rsp, %r14
1226
1227
1228
1229
1230        sub     $VARIABLE_OFFSET, %rsp
1231        and     $~63, %rsp                  # align rsp to 64 bytes
1232
1233
1234        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
1235
1236        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
1237        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
1238
1239        mov     %r13, %r12
1240        shr     $4, %r12
1241        and     $7, %r12
1242        jz      _initial_num_blocks_is_0\@
1243
1244        cmp     $7, %r12
1245        je      _initial_num_blocks_is_7\@
1246        cmp     $6, %r12
1247        je      _initial_num_blocks_is_6\@
1248        cmp     $5, %r12
1249        je      _initial_num_blocks_is_5\@
1250        cmp     $4, %r12
1251        je      _initial_num_blocks_is_4\@
1252        cmp     $3, %r12
1253        je      _initial_num_blocks_is_3\@
1254        cmp     $2, %r12
1255        je      _initial_num_blocks_is_2\@
1256
1257        jmp     _initial_num_blocks_is_1\@
1258
1259_initial_num_blocks_is_7\@:
1260        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1261        sub     $16*7, %r13
1262        jmp     _initial_blocks_encrypted\@
1263
1264_initial_num_blocks_is_6\@:
1265        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1266        sub     $16*6, %r13
1267        jmp     _initial_blocks_encrypted\@
1268
1269_initial_num_blocks_is_5\@:
1270        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1271        sub     $16*5, %r13
1272        jmp     _initial_blocks_encrypted\@
1273
1274_initial_num_blocks_is_4\@:
1275        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1276        sub     $16*4, %r13
1277        jmp     _initial_blocks_encrypted\@
1278
1279_initial_num_blocks_is_3\@:
1280        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1281        sub     $16*3, %r13
1282        jmp     _initial_blocks_encrypted\@
1283
1284_initial_num_blocks_is_2\@:
1285        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1286        sub     $16*2, %r13
1287        jmp     _initial_blocks_encrypted\@
1288
1289_initial_num_blocks_is_1\@:
1290        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1291        sub     $16*1, %r13
1292        jmp     _initial_blocks_encrypted\@
1293
1294_initial_num_blocks_is_0\@:
1295        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1296
1297
1298_initial_blocks_encrypted\@:
1299        cmp     $0, %r13
1300        je      _zero_cipher_left\@
1301
1302        sub     $128, %r13
1303        je      _eight_cipher_left\@
1304
1305
1306
1307
1308        vmovd   %xmm9, %r15d
1309        and     $255, %r15d
1310        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1311
1312
1313_encrypt_by_8_new\@:
1314        cmp     $(255-8), %r15d
1315        jg      _encrypt_by_8\@
1316
1317
1318
1319        add     $8, %r15b
1320        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1321        add     $128, %r11
1322        sub     $128, %r13
1323        jne     _encrypt_by_8_new\@
1324
1325        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326        jmp     _eight_cipher_left\@
1327
1328_encrypt_by_8\@:
1329        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1330        add     $8, %r15b
1331        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1333        add     $128, %r11
1334        sub     $128, %r13
1335        jne     _encrypt_by_8_new\@
1336
1337        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1338
1339
1340
1341
1342_eight_cipher_left\@:
1343        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1344
1345
1346_zero_cipher_left\@:
1347        cmp     $16, arg4
1348        jl      _only_less_than_16\@
1349
1350        mov     arg4, %r13
1351        and     $15, %r13                            # r13 = (arg4 mod 16)
1352
1353        je      _multiple_of_16_bytes\@
1354
        # handle the last <16 Byte block separately
1356
1357
1358        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
1359        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1361
1362        sub     $16, %r11
1363        add     %r13, %r11
1364        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
1365
1366        lea     SHIFT_MASK+16(%rip), %r12
1367        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1368						     # able to shift 16-r13 bytes (r13 is the
1369						     # number of bytes in plaintext mod 16)
1370        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
1371        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
1372        jmp     _final_ghash_mul\@
1373
1374_only_less_than_16\@:
1375        # check for 0 length
1376        mov     arg4, %r13
1377        and     $15, %r13                            # r13 = (arg4 mod 16)
1378
1379        je      _multiple_of_16_bytes\@
1380
        # handle the last <16 Byte block separately
1382
1383
1384        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
1385        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
1387
1388
1389        lea     SHIFT_MASK+16(%rip), %r12
1390        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
1391						     # able to shift 16-r13 bytes (r13 is the
1392						     # number of bytes in plaintext mod 16)
1393
1394_get_last_16_byte_loop\@:
1395        movb    (arg3, %r11),  %al
1396        movb    %al,  TMP1 (%rsp , %r11)
1397        add     $1, %r11
1398        cmp     %r13,  %r11
1399        jne     _get_last_16_byte_loop\@
1400
1401        vmovdqu  TMP1(%rsp), %xmm1
1402
1403        sub     $16, %r11
1404
1405_final_ghash_mul\@:
1406        .if  \ENC_DEC ==  DEC
1407        vmovdqa %xmm1, %xmm2
1408        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1409        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1410						     # mask out top 16-r13 bytes of xmm9
1411        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1412        vpand   %xmm1, %xmm2, %xmm2
1413        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414        vpxor   %xmm2, %xmm14, %xmm14
1415	#GHASH computation for the last <16 Byte block
1416        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1417        sub     %r13, %r11
1418        add     $16, %r11
1419        .else
1420        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
1421        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
1422						     # mask out top 16-r13 bytes of xmm9
1423        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
1424        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425        vpxor   %xmm9, %xmm14, %xmm14
1426	#GHASH computation for the last <16 Byte block
1427        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1428        sub     %r13, %r11
1429        add     $16, %r11
1430        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
1431        .endif
1432
1433
1434        #############################
1435        # output r13 Bytes
1436        vmovq   %xmm9, %rax
1437        cmp     $8, %r13
1438        jle     _less_than_8_bytes_left\@
1439
1440        mov     %rax, (arg2 , %r11)
1441        add     $8, %r11
1442        vpsrldq $8, %xmm9, %xmm9
1443        vmovq   %xmm9, %rax
1444        sub     $8, %r13
1445
1446_less_than_8_bytes_left\@:
1447        movb    %al, (arg2 , %r11)
1448        add     $1, %r11
1449        shr     $8, %rax
1450        sub     $1, %r13
1451        jne     _less_than_8_bytes_left\@
1452        #############################
1453
1454_multiple_of_16_bytes\@:
1455        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
1456        shl     $3, %r12                             # convert into number of bits
1457        vmovd   %r12d, %xmm15                        # len(A) in xmm15
1458
        shl     $3, arg4                             # len(C) in bits  (*8)
1460        vmovq   arg4, %xmm1
1461        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
1462        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
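        # illustrative example: for aadLen = 12 and plaintext_len = 32, this
        # yields len(A) = 96 bits in the high quadword and len(C) = 256 bits
        # in the low quadword; that 128-bit block is GHASHed just below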
1463
1464        vpxor   %xmm15, %xmm14, %xmm14
1465        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
1466        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
1467
1468        mov     arg5, %rax                           # rax = *Y0
1469        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
1470
1471        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
1472
1473        vpxor   %xmm14, %xmm9, %xmm9
1474
1475
1476
1477_return_T\@:
1478        mov     arg8, %r10              # r10 = authTag
1479        mov     arg9, %r11              # r11 = auth_tag_len
1480
1481        cmp     $16, %r11
1482        je      _T_16\@
1483
1484        cmp     $8, %r11
1485        jl      _T_4\@
1486
1487_T_8\@:
1488        vmovq   %xmm9, %rax
1489        mov     %rax, (%r10)
1490        add     $8, %r10
1491        sub     $8, %r11
1492        vpsrldq $8, %xmm9, %xmm9
1493        cmp     $0, %r11
1494        je     _return_T_done\@
1495_T_4\@:
1496        vmovd   %xmm9, %eax
1497        mov     %eax, (%r10)
1498        add     $4, %r10
1499        sub     $4, %r11
1500        vpsrldq     $4, %xmm9, %xmm9
1501        cmp     $0, %r11
1502        je     _return_T_done\@
1503_T_123\@:
1504        vmovd     %xmm9, %eax
1505        cmp     $2, %r11
1506        jl     _T_1\@
1507        mov     %ax, (%r10)
1508        cmp     $2, %r11
1509        je     _return_T_done\@
1510        add     $2, %r10
1511        sar     $16, %eax
1512_T_1\@:
1513        mov     %al, (%r10)
1514        jmp     _return_T_done\@
1515
1516_T_16\@:
1517        vmovdqu %xmm9, (%r10)
1518
1519_return_T_done\@:
1520        mov     %r14, %rsp
1521
1522        pop     %r15
1523        pop     %r14
1524        pop     %r13
1525        pop     %r12
1526.endm
1527
1528
1529#############################################################
1530#void   aesni_gcm_precomp_avx_gen2
1531#        (gcm_data     *my_ctx_data,
1532#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1533#############################################################
1534ENTRY(aesni_gcm_precomp_avx_gen2)
1535        #the number of pushes must equal STACK_OFFSET
1536        push    %r12
1537        push    %r13
1538        push    %r14
1539        push    %r15
1540
1541        mov     %rsp, %r14
1542
1543
1544
1545        sub     $VARIABLE_OFFSET, %rsp
1546        and     $~63, %rsp                  # align rsp to 64 bytes
1547
1548        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
1549
1550        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
1551        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
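        # (note) the block below doubles H in GF(2^128): shift the 128-bit
        # value left by one bit (vpsllq plus the cross-quadword carry fix-up),
        # and if a bit falls off the top, XOR the field polynomial (POLY)
        # back in to reduce the result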
1552        vmovdqa  %xmm6, %xmm2
1553        vpsllq   $1, %xmm6, %xmm6
1554        vpsrlq   $63, %xmm2, %xmm2
1555        vmovdqa  %xmm2, %xmm1
1556        vpslldq  $8, %xmm2, %xmm2
1557        vpsrldq  $8, %xmm1, %xmm1
1558        vpor     %xmm2, %xmm6, %xmm6
1559        #reduction
1560        vpshufd  $0b00100100, %xmm1, %xmm2
1561        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562        vpand    POLY(%rip), %xmm2, %xmm2
1563        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
1564        #######################################################################
1565        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
1566
1567
1568        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1569
1570        mov     %r14, %rsp
1571
1572        pop     %r15
1573        pop     %r14
1574        pop     %r13
1575        pop     %r12
1576        ret
1577ENDPROC(aesni_gcm_precomp_avx_gen2)
1578
1579###############################################################################
1580#void   aesni_gcm_enc_avx_gen2(
1581#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1582#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
1583#        const   u8 *in, /* Plaintext input */
1584#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
1585#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1586#			(from Security Association) concatenated with 8 byte
1587#			Initialisation Vector (from IPSec ESP Payload)
1588#			concatenated with 0x00000001. 16-byte aligned pointer. */
1589#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1590#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1591#        u8      *auth_tag, /* Authenticated Tag output. */
1592#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1593#				Valid values are 16 (most likely), 12 or 8. */
1594###############################################################################
1595ENTRY(aesni_gcm_enc_avx_gen2)
1596        GCM_ENC_DEC_AVX     ENC
1597	ret
1598ENDPROC(aesni_gcm_enc_avx_gen2)
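###############################################################################
# Illustrative call sequence from C (a sketch only; it is not taken from the
# kernel glue code, and the buffer names below are made up).  The gcm_data
# context is assumed to already hold the expanded AES round keys at offset 0,
# and aesni_gcm_precomp_avx_gen2() is assumed to run first so that the HashKey
# powers are stored in the context:
#
#	/* prototypes as documented above */
#	void aesni_gcm_precomp_avx_gen2(gcm_data *ctx, u8 *hash_subkey);
#	void aesni_gcm_enc_avx_gen2(gcm_data *ctx, u8 *out, const u8 *in,
#				    u64 plaintext_len, u8 *iv, const u8 *aad,
#				    u64 aad_len, u8 *auth_tag, u64 auth_tag_len);
#
#	aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
#	aesni_gcm_enc_avx_gen2(ctx, ciphertext, plaintext, plaintext_len,
#			       iv, aad, aad_len, tag, 16);
###############################################################################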
1599
1600###############################################################################
1601#void   aesni_gcm_dec_avx_gen2(
1602#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
1603#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
1604#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
1606#        u8      *iv, /* Pre-counter block j0: 4 byte salt
1607#			(from Security Association) concatenated with 8 byte
1608#			Initialisation Vector (from IPSec ESP Payload)
1609#			concatenated with 0x00000001. 16-byte aligned pointer. */
1610#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
1611#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1612#        u8      *auth_tag, /* Authenticated Tag output. */
1613#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
1614#				Valid values are 16 (most likely), 12 or 8. */
1615###############################################################################
1616ENTRY(aesni_gcm_dec_avx_gen2)
1617        GCM_ENC_DEC_AVX     DEC
1618	ret
1619ENDPROC(aesni_gcm_dec_avx_gen2)
1620#endif /* CONFIG_AS_AVX */
1621
1622#ifdef CONFIG_AS_AVX2
1623###############################################################################
1624# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1625# Input: A and B (128-bits each, bit-reflected)
1626# Output: C = A*B*x mod poly, (i.e. >>1 )
1627# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1628# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1629###############################################################################
1630.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1631
1632        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
1633        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
1634        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
1635        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
1636        vpxor           \T3, \GH, \GH
1637
1638
1639        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
1640        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
1641
1642        vpxor           \T3, \T1, \T1
1643        vpxor           \T2, \GH, \GH
1644
1645        #######################################################################
1646        #first phase of the reduction
1647        vmovdqa         POLY2(%rip), \T3
1648
1649        vpclmulqdq      $0x01, \GH, \T3, \T2
1650        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
1651
1652        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
1653        #######################################################################
1654        #second phase of the reduction
1655        vpclmulqdq      $0x00, \GH, \T3, \T2
1656        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1657
1658        vpclmulqdq      $0x10, \GH, \T3, \GH
1659        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1660
1661        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
1662        #######################################################################
1663        vpxor           \T1, \GH, \GH          # the result is in GH
1664
1665
1666.endm
1667
1668.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1669
        # precompute and store HashKey^i<<1 mod poly, for i = 2..8
1671        vmovdqa  \HK, \T5
1672        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
1673        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
1674
1675        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
1676        vmovdqa  \T5, HashKey_3(arg1)
1677
1678        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
1679        vmovdqa  \T5, HashKey_4(arg1)
1680
1681        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
1682        vmovdqa  \T5, HashKey_5(arg1)
1683
1684        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
1685        vmovdqa  \T5, HashKey_6(arg1)
1686
1687        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
1688        vmovdqa  \T5, HashKey_7(arg1)
1689
1690        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
1691        vmovdqa  \T5, HashKey_8(arg1)
1692
1693.endm
1694
1695
1696## if a = number of total plaintext bytes
1697## b = floor(a/16)
## num_initial_blocks = b mod 8
1699## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1700## r10, r11, r12, rax are clobbered
1701## arg1, arg2, arg3, r14 are used as a pointer only, not modified
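## Worked example (illustrative): with a = 200 plaintext bytes, b = 12 full
## blocks and num_initial_blocks = 12 mod 8 = 4; those 4 blocks are handled
## here, the remaining 8 full blocks go through the 8-blocks-at-a-time loop,
## and the final 8 bytes (200 mod 16) are handled in _zero_cipher_left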
1702
1703.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1704	i = (8-\num_initial_blocks)
1705	j = 0
1706	setreg
1707
1708	mov     arg6, %r10                       # r10 = AAD
1709	mov     arg7, %r12                       # r12 = aadLen
1710
1711
1712	mov     %r12, %r11
1713
1714	vpxor   reg_j, reg_j, reg_j
1715	vpxor   reg_i, reg_i, reg_i
1716
1717	cmp     $16, %r11
1718	jl      _get_AAD_rest8\@
1719_get_AAD_blocks\@:
1720	vmovdqu (%r10), reg_i
1721	vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722	vpxor   reg_i, reg_j, reg_j
1723	GHASH_MUL_AVX2      reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1724	add     $16, %r10
1725	sub     $16, %r12
1726	sub     $16, %r11
1727	cmp     $16, %r11
1728	jge     _get_AAD_blocks\@
1729	vmovdqu reg_j, reg_i
1730	cmp     $0, %r11
1731	je      _get_AAD_done\@
1732
1733	vpxor   reg_i, reg_i, reg_i
1734
1735	/* read the last <16B of AAD. since we have at least 4B of
1736	data right after the AAD (the ICV, and maybe some CT), we can
1737	read 4B/8B blocks safely, and then get rid of the extra stuff */
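	/* illustrative example: for the common RFC4106 case of aadLen = 12,
	this path does one 8-byte read and one 4-byte read, and the
	aad_shift_arr mask selected by the remaining length (12) then
	left-aligns the 12 AAD bytes in reg_i */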
1738_get_AAD_rest8\@:
1739	cmp     $4, %r11
1740	jle     _get_AAD_rest4\@
1741	movq    (%r10), \T1
1742	add     $8, %r10
1743	sub     $8, %r11
1744	vpslldq $8, \T1, \T1
1745	vpsrldq $8, reg_i, reg_i
1746	vpxor   \T1, reg_i, reg_i
1747	jmp     _get_AAD_rest8\@
1748_get_AAD_rest4\@:
1749	cmp     $0, %r11
1750	jle     _get_AAD_rest0\@
1751	mov     (%r10), %eax
1752	movq    %rax, \T1
1753	add     $4, %r10
1754	sub     $4, %r11
1755	vpslldq $12, \T1, \T1
1756	vpsrldq $4, reg_i, reg_i
1757	vpxor   \T1, reg_i, reg_i
1758_get_AAD_rest0\@:
1759	/* finalize: shift out the extra bytes we read, and align
1760	left. since pslldq can only shift by an immediate, we use
1761	vpshufb and an array of shuffle masks */
1762	movq    %r12, %r11
1763	salq    $4, %r11
1764	movdqu  aad_shift_arr(%r11), \T1
1765	vpshufb \T1, reg_i, reg_i
1766_get_AAD_rest_final\@:
1767	vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768	vpxor   reg_j, reg_i, reg_i
1769	GHASH_MUL_AVX2      reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770
1771_get_AAD_done\@:
1772	# initialize the data pointer offset as zero
1773	xor     %r11d, %r11d
1774
1775	# start AES for num_initial_blocks blocks
1776	mov     arg5, %rax                     # rax = *Y0
1777	vmovdqu (%rax), \CTR                   # CTR = Y0
1778	vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779
1780
1781	i = (9-\num_initial_blocks)
1782	setreg
1783.rep \num_initial_blocks
1784                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1785                vmovdqa \CTR, reg_i
1786                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1787	i = (i+1)
1788	setreg
1789.endr
1790
1791	vmovdqa  (arg1), \T_key
1792	i = (9-\num_initial_blocks)
1793	setreg
1794.rep \num_initial_blocks
1795                vpxor   \T_key, reg_i, reg_i
1796	i = (i+1)
1797	setreg
1798.endr
1799
1800	j = 1
1801	setreg
1802.rep 9
1803	vmovdqa  16*j(arg1), \T_key
1804	i = (9-\num_initial_blocks)
1805	setreg
1806.rep \num_initial_blocks
1807        vaesenc \T_key, reg_i, reg_i
1808	i = (i+1)
1809	setreg
1810.endr
1811
1812	j = (j+1)
1813	setreg
1814.endr
1815
1816
1817	vmovdqa  16*10(arg1), \T_key
1818	i = (9-\num_initial_blocks)
1819	setreg
1820.rep \num_initial_blocks
1821        vaesenclast      \T_key, reg_i, reg_i
1822	i = (i+1)
1823	setreg
1824.endr
1825
1826	i = (9-\num_initial_blocks)
1827	setreg
1828.rep \num_initial_blocks
1829                vmovdqu (arg3, %r11), \T1
1830                vpxor   \T1, reg_i, reg_i
1831                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
1832						       # num_initial_blocks blocks
1833                add     $16, %r11
1834.if  \ENC_DEC == DEC
1835                vmovdqa \T1, reg_i
1836.endif
1837                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
1838	i = (i+1)
1839	setreg
1840.endr
1841
1842
1843	i = (8-\num_initial_blocks)
1844	j = (9-\num_initial_blocks)
1845	setreg
1846
1847.rep \num_initial_blocks
1848        vpxor    reg_i, reg_j, reg_j
1849        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
1850	i = (i+1)
1851	j = (j+1)
1852	setreg
1853.endr
1854        # XMM8 has the combined result here
1855
1856        vmovdqa  \XMM8, TMP1(%rsp)
1857        vmovdqa  \XMM8, \T3
1858
1859        cmp     $128, %r13
1860        jl      _initial_blocks_done\@                  # no need for precomputed constants
1861
1862###############################################################################
# prepare and encrypt 8 more counter blocks, so the main loop starts with
# 8 ciphertext blocks to GHASH
1864                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1865                vmovdqa  \CTR, \XMM1
1866                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
1867
1868                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1869                vmovdqa  \CTR, \XMM2
1870                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
1871
1872                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1873                vmovdqa  \CTR, \XMM3
1874                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
1875
1876                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1877                vmovdqa  \CTR, \XMM4
1878                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
1879
1880                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1881                vmovdqa  \CTR, \XMM5
1882                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
1883
1884                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1885                vmovdqa  \CTR, \XMM6
1886                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
1887
1888                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1889                vmovdqa  \CTR, \XMM7
1890                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
1891
1892                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
1893                vmovdqa  \CTR, \XMM8
1894                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
1895
1896                vmovdqa  (arg1), \T_key
1897                vpxor    \T_key, \XMM1, \XMM1
1898                vpxor    \T_key, \XMM2, \XMM2
1899                vpxor    \T_key, \XMM3, \XMM3
1900                vpxor    \T_key, \XMM4, \XMM4
1901                vpxor    \T_key, \XMM5, \XMM5
1902                vpxor    \T_key, \XMM6, \XMM6
1903                vpxor    \T_key, \XMM7, \XMM7
1904                vpxor    \T_key, \XMM8, \XMM8
1905
1906		i = 1
1907		setreg
1908.rep    9       # do 9 rounds
1909                vmovdqa  16*i(arg1), \T_key
1910                vaesenc  \T_key, \XMM1, \XMM1
1911                vaesenc  \T_key, \XMM2, \XMM2
1912                vaesenc  \T_key, \XMM3, \XMM3
1913                vaesenc  \T_key, \XMM4, \XMM4
1914                vaesenc  \T_key, \XMM5, \XMM5
1915                vaesenc  \T_key, \XMM6, \XMM6
1916                vaesenc  \T_key, \XMM7, \XMM7
1917                vaesenc  \T_key, \XMM8, \XMM8
1918		i = (i+1)
1919		setreg
1920.endr
1921
1922
1923                vmovdqa  16*i(arg1), \T_key
1924                vaesenclast  \T_key, \XMM1, \XMM1
1925                vaesenclast  \T_key, \XMM2, \XMM2
1926                vaesenclast  \T_key, \XMM3, \XMM3
1927                vaesenclast  \T_key, \XMM4, \XMM4
1928                vaesenclast  \T_key, \XMM5, \XMM5
1929                vaesenclast  \T_key, \XMM6, \XMM6
1930                vaesenclast  \T_key, \XMM7, \XMM7
1931                vaesenclast  \T_key, \XMM8, \XMM8
1932
1933                vmovdqu  (arg3, %r11), \T1
1934                vpxor    \T1, \XMM1, \XMM1
1935                vmovdqu  \XMM1, (arg2 , %r11)
1936                .if   \ENC_DEC == DEC
1937                vmovdqa  \T1, \XMM1
1938                .endif
1939
1940                vmovdqu  16*1(arg3, %r11), \T1
1941                vpxor    \T1, \XMM2, \XMM2
1942                vmovdqu  \XMM2, 16*1(arg2 , %r11)
1943                .if   \ENC_DEC == DEC
1944                vmovdqa  \T1, \XMM2
1945                .endif
1946
1947                vmovdqu  16*2(arg3, %r11), \T1
1948                vpxor    \T1, \XMM3, \XMM3
1949                vmovdqu  \XMM3, 16*2(arg2 , %r11)
1950                .if   \ENC_DEC == DEC
1951                vmovdqa  \T1, \XMM3
1952                .endif
1953
1954                vmovdqu  16*3(arg3, %r11), \T1
1955                vpxor    \T1, \XMM4, \XMM4
1956                vmovdqu  \XMM4, 16*3(arg2 , %r11)
1957                .if   \ENC_DEC == DEC
1958                vmovdqa  \T1, \XMM4
1959                .endif
1960
1961                vmovdqu  16*4(arg3, %r11), \T1
1962                vpxor    \T1, \XMM5, \XMM5
1963                vmovdqu  \XMM5, 16*4(arg2 , %r11)
1964                .if   \ENC_DEC == DEC
1965                vmovdqa  \T1, \XMM5
1966                .endif
1967
1968                vmovdqu  16*5(arg3, %r11), \T1
1969                vpxor    \T1, \XMM6, \XMM6
1970                vmovdqu  \XMM6, 16*5(arg2 , %r11)
1971                .if   \ENC_DEC == DEC
1972                vmovdqa  \T1, \XMM6
1973                .endif
1974
1975                vmovdqu  16*6(arg3, %r11), \T1
1976                vpxor    \T1, \XMM7, \XMM7
1977                vmovdqu  \XMM7, 16*6(arg2 , %r11)
1978                .if   \ENC_DEC == DEC
1979                vmovdqa  \T1, \XMM7
1980                .endif
1981
1982                vmovdqu  16*7(arg3, %r11), \T1
1983                vpxor    \T1, \XMM8, \XMM8
1984                vmovdqu  \XMM8, 16*7(arg2 , %r11)
1985                .if   \ENC_DEC == DEC
1986                vmovdqa  \T1, \XMM8
1987                .endif
1988
1989                add     $128, %r11
1990
1991                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
1992                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
1993							   # the corresponding ciphertext
1994                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
1995                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
1996                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
1997                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
1998                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
1999                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2000                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2001
2002###############################################################################
2003
2004_initial_blocks_done\@:
2005
2006
2007.endm
2008
2009
2010
2011# encrypt 8 blocks at a time
2012# ghash the 8 previously encrypted ciphertext blocks
2013# arg1, arg2, arg3 are used as pointers only, not modified
2014# r11 is the data offset value
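# Conceptually (an illustrative summary, no extra functionality): each pass
# overlaps the AES and PCLMULQDQ latencies --
#	start the AES-CTR rounds for counter blocks i .. i+7
#	between the rounds, GHASH the 8 ciphertext blocks of the previous pass
#	(saved in T2/TMP2..TMP8) against HashKey^8 .. HashKey^1
#	finish the rounds, XOR with the input and write the output blocks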
2015.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2016
2017        vmovdqa \XMM1, \T2
2018        vmovdqa \XMM2, TMP2(%rsp)
2019        vmovdqa \XMM3, TMP3(%rsp)
2020        vmovdqa \XMM4, TMP4(%rsp)
2021        vmovdqa \XMM5, TMP5(%rsp)
2022        vmovdqa \XMM6, TMP6(%rsp)
2023        vmovdqa \XMM7, TMP7(%rsp)
2024        vmovdqa \XMM8, TMP8(%rsp)
2025
2026.if \loop_idx == in_order
2027                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
2028                vpaddd  ONE(%rip), \XMM1, \XMM2
2029                vpaddd  ONE(%rip), \XMM2, \XMM3
2030                vpaddd  ONE(%rip), \XMM3, \XMM4
2031                vpaddd  ONE(%rip), \XMM4, \XMM5
2032                vpaddd  ONE(%rip), \XMM5, \XMM6
2033                vpaddd  ONE(%rip), \XMM6, \XMM7
2034                vpaddd  ONE(%rip), \XMM7, \XMM8
2035                vmovdqa \XMM8, \CTR
2036
2037                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
2038                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
2039                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
2040                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
2041                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
2042                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
2043                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
2044                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
2045.else
2046                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
2047                vpaddd  ONEf(%rip), \XMM1, \XMM2
2048                vpaddd  ONEf(%rip), \XMM2, \XMM3
2049                vpaddd  ONEf(%rip), \XMM3, \XMM4
2050                vpaddd  ONEf(%rip), \XMM4, \XMM5
2051                vpaddd  ONEf(%rip), \XMM5, \XMM6
2052                vpaddd  ONEf(%rip), \XMM6, \XMM7
2053                vpaddd  ONEf(%rip), \XMM7, \XMM8
2054                vmovdqa \XMM8, \CTR
2055.endif
2056
2057
2058        #######################################################################
2059
2060                vmovdqu (arg1), \T1
2061                vpxor   \T1, \XMM1, \XMM1
2062                vpxor   \T1, \XMM2, \XMM2
2063                vpxor   \T1, \XMM3, \XMM3
2064                vpxor   \T1, \XMM4, \XMM4
2065                vpxor   \T1, \XMM5, \XMM5
2066                vpxor   \T1, \XMM6, \XMM6
2067                vpxor   \T1, \XMM7, \XMM7
2068                vpxor   \T1, \XMM8, \XMM8
2069
2070        #######################################################################
2071
2072
2073
2074
2075
2076                vmovdqu 16*1(arg1), \T1
2077                vaesenc \T1, \XMM1, \XMM1
2078                vaesenc \T1, \XMM2, \XMM2
2079                vaesenc \T1, \XMM3, \XMM3
2080                vaesenc \T1, \XMM4, \XMM4
2081                vaesenc \T1, \XMM5, \XMM5
2082                vaesenc \T1, \XMM6, \XMM6
2083                vaesenc \T1, \XMM7, \XMM7
2084                vaesenc \T1, \XMM8, \XMM8
2085
2086                vmovdqu 16*2(arg1), \T1
2087                vaesenc \T1, \XMM1, \XMM1
2088                vaesenc \T1, \XMM2, \XMM2
2089                vaesenc \T1, \XMM3, \XMM3
2090                vaesenc \T1, \XMM4, \XMM4
2091                vaesenc \T1, \XMM5, \XMM5
2092                vaesenc \T1, \XMM6, \XMM6
2093                vaesenc \T1, \XMM7, \XMM7
2094                vaesenc \T1, \XMM8, \XMM8
2095
2096
2097        #######################################################################
2098
2099        vmovdqa         HashKey_8(arg1), \T5
2100        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
2101        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
2102        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
2103        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
2104        vpxor           \T5, \T6, \T6
2105
2106                vmovdqu 16*3(arg1), \T1
2107                vaesenc \T1, \XMM1, \XMM1
2108                vaesenc \T1, \XMM2, \XMM2
2109                vaesenc \T1, \XMM3, \XMM3
2110                vaesenc \T1, \XMM4, \XMM4
2111                vaesenc \T1, \XMM5, \XMM5
2112                vaesenc \T1, \XMM6, \XMM6
2113                vaesenc \T1, \XMM7, \XMM7
2114                vaesenc \T1, \XMM8, \XMM8
2115
2116        vmovdqa         TMP2(%rsp), \T1
2117        vmovdqa         HashKey_7(arg1), \T5
2118        vpclmulqdq      $0x11, \T5, \T1, \T3
2119        vpxor           \T3, \T4, \T4
2120
2121        vpclmulqdq      $0x00, \T5, \T1, \T3
2122        vpxor           \T3, \T7, \T7
2123
2124        vpclmulqdq      $0x01, \T5, \T1, \T3
2125        vpxor           \T3, \T6, \T6
2126
2127        vpclmulqdq      $0x10, \T5, \T1, \T3
2128        vpxor           \T3, \T6, \T6
2129
2130                vmovdqu 16*4(arg1), \T1
2131                vaesenc \T1, \XMM1, \XMM1
2132                vaesenc \T1, \XMM2, \XMM2
2133                vaesenc \T1, \XMM3, \XMM3
2134                vaesenc \T1, \XMM4, \XMM4
2135                vaesenc \T1, \XMM5, \XMM5
2136                vaesenc \T1, \XMM6, \XMM6
2137                vaesenc \T1, \XMM7, \XMM7
2138                vaesenc \T1, \XMM8, \XMM8
2139
2140        #######################################################################
2141
2142        vmovdqa         TMP3(%rsp), \T1
2143        vmovdqa         HashKey_6(arg1), \T5
2144        vpclmulqdq      $0x11, \T5, \T1, \T3
2145        vpxor           \T3, \T4, \T4
2146
2147        vpclmulqdq      $0x00, \T5, \T1, \T3
2148        vpxor           \T3, \T7, \T7
2149
2150        vpclmulqdq      $0x01, \T5, \T1, \T3
2151        vpxor           \T3, \T6, \T6
2152
2153        vpclmulqdq      $0x10, \T5, \T1, \T3
2154        vpxor           \T3, \T6, \T6
2155
2156                vmovdqu 16*5(arg1), \T1
2157                vaesenc \T1, \XMM1, \XMM1
2158                vaesenc \T1, \XMM2, \XMM2
2159                vaesenc \T1, \XMM3, \XMM3
2160                vaesenc \T1, \XMM4, \XMM4
2161                vaesenc \T1, \XMM5, \XMM5
2162                vaesenc \T1, \XMM6, \XMM6
2163                vaesenc \T1, \XMM7, \XMM7
2164                vaesenc \T1, \XMM8, \XMM8
2165
2166        vmovdqa         TMP4(%rsp), \T1
2167        vmovdqa         HashKey_5(arg1), \T5
2168        vpclmulqdq      $0x11, \T5, \T1, \T3
2169        vpxor           \T3, \T4, \T4
2170
2171        vpclmulqdq      $0x00, \T5, \T1, \T3
2172        vpxor           \T3, \T7, \T7
2173
2174        vpclmulqdq      $0x01, \T5, \T1, \T3
2175        vpxor           \T3, \T6, \T6
2176
2177        vpclmulqdq      $0x10, \T5, \T1, \T3
2178        vpxor           \T3, \T6, \T6
2179
2180                vmovdqu 16*6(arg1), \T1
2181                vaesenc \T1, \XMM1, \XMM1
2182                vaesenc \T1, \XMM2, \XMM2
2183                vaesenc \T1, \XMM3, \XMM3
2184                vaesenc \T1, \XMM4, \XMM4
2185                vaesenc \T1, \XMM5, \XMM5
2186                vaesenc \T1, \XMM6, \XMM6
2187                vaesenc \T1, \XMM7, \XMM7
2188                vaesenc \T1, \XMM8, \XMM8
2189
2190
2191        vmovdqa         TMP5(%rsp), \T1
2192        vmovdqa         HashKey_4(arg1), \T5
2193        vpclmulqdq      $0x11, \T5, \T1, \T3
2194        vpxor           \T3, \T4, \T4
2195
2196        vpclmulqdq      $0x00, \T5, \T1, \T3
2197        vpxor           \T3, \T7, \T7
2198
2199        vpclmulqdq      $0x01, \T5, \T1, \T3
2200        vpxor           \T3, \T6, \T6
2201
2202        vpclmulqdq      $0x10, \T5, \T1, \T3
2203        vpxor           \T3, \T6, \T6
2204
2205                vmovdqu 16*7(arg1), \T1
2206                vaesenc \T1, \XMM1, \XMM1
2207                vaesenc \T1, \XMM2, \XMM2
2208                vaesenc \T1, \XMM3, \XMM3
2209                vaesenc \T1, \XMM4, \XMM4
2210                vaesenc \T1, \XMM5, \XMM5
2211                vaesenc \T1, \XMM6, \XMM6
2212                vaesenc \T1, \XMM7, \XMM7
2213                vaesenc \T1, \XMM8, \XMM8
2214
2215        vmovdqa         TMP6(%rsp), \T1
2216        vmovdqa         HashKey_3(arg1), \T5
2217        vpclmulqdq      $0x11, \T5, \T1, \T3
2218        vpxor           \T3, \T4, \T4
2219
2220        vpclmulqdq      $0x00, \T5, \T1, \T3
2221        vpxor           \T3, \T7, \T7
2222
2223        vpclmulqdq      $0x01, \T5, \T1, \T3
2224        vpxor           \T3, \T6, \T6
2225
2226        vpclmulqdq      $0x10, \T5, \T1, \T3
2227        vpxor           \T3, \T6, \T6
2228
2229                vmovdqu 16*8(arg1), \T1
2230                vaesenc \T1, \XMM1, \XMM1
2231                vaesenc \T1, \XMM2, \XMM2
2232                vaesenc \T1, \XMM3, \XMM3
2233                vaesenc \T1, \XMM4, \XMM4
2234                vaesenc \T1, \XMM5, \XMM5
2235                vaesenc \T1, \XMM6, \XMM6
2236                vaesenc \T1, \XMM7, \XMM7
2237                vaesenc \T1, \XMM8, \XMM8
2238
2239        vmovdqa         TMP7(%rsp), \T1
2240        vmovdqa         HashKey_2(arg1), \T5
2241        vpclmulqdq      $0x11, \T5, \T1, \T3
2242        vpxor           \T3, \T4, \T4
2243
2244        vpclmulqdq      $0x00, \T5, \T1, \T3
2245        vpxor           \T3, \T7, \T7
2246
2247        vpclmulqdq      $0x01, \T5, \T1, \T3
2248        vpxor           \T3, \T6, \T6
2249
2250        vpclmulqdq      $0x10, \T5, \T1, \T3
2251        vpxor           \T3, \T6, \T6
2252
2253
2254        #######################################################################
2255
2256                vmovdqu 16*9(arg1), \T5
2257                vaesenc \T5, \XMM1, \XMM1
2258                vaesenc \T5, \XMM2, \XMM2
2259                vaesenc \T5, \XMM3, \XMM3
2260                vaesenc \T5, \XMM4, \XMM4
2261                vaesenc \T5, \XMM5, \XMM5
2262                vaesenc \T5, \XMM6, \XMM6
2263                vaesenc \T5, \XMM7, \XMM7
2264                vaesenc \T5, \XMM8, \XMM8
2265
2266        vmovdqa         TMP8(%rsp), \T1
2267        vmovdqa         HashKey(arg1), \T5
2268
2269        vpclmulqdq      $0x00, \T5, \T1, \T3
2270        vpxor           \T3, \T7, \T7
2271
2272        vpclmulqdq      $0x01, \T5, \T1, \T3
2273        vpxor           \T3, \T6, \T6
2274
2275        vpclmulqdq      $0x10, \T5, \T1, \T3
2276        vpxor           \T3, \T6, \T6
2277
2278        vpclmulqdq      $0x11, \T5, \T1, \T3
2279        vpxor           \T3, \T4, \T1
2280
2281
2282                vmovdqu 16*10(arg1), \T5
2283
2284	i = 0
2285	j = 1
2286	setreg
2287.rep 8
2288		vpxor	16*i(arg3, %r11), \T5, \T2
2289                .if \ENC_DEC == ENC
2290                vaesenclast     \T2, reg_j, reg_j
2291                .else
2292                vaesenclast     \T2, reg_j, \T3
2293                vmovdqu 16*i(arg3, %r11), reg_j
2294                vmovdqu \T3, 16*i(arg2, %r11)
2295                .endif
2296	i = (i+1)
2297	j = (j+1)
2298	setreg
2299.endr
2300	#######################################################################
2301
2302
2303	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
2305	vpxor	\T3, \T7, \T7
2306	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
2307
2308
2309
2310	#######################################################################
2311	#first phase of the reduction
2312	vmovdqa         POLY2(%rip), \T3
2313
2314	vpclmulqdq	$0x01, \T7, \T3, \T2
2315	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
2316
2317	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
2318	#######################################################################
2319                .if \ENC_DEC == ENC
2320		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
2321		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
2322		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
2323		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
2324		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
2325		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
2326		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
2327		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
2328                .endif
2329
2330	#######################################################################
2331	#second phase of the reduction
2332	vpclmulqdq	$0x00, \T7, \T3, \T2
2333	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2334
2335	vpclmulqdq	$0x10, \T7, \T3, \T4
2336	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2337
2338	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
2339	#######################################################################
2340	vpxor		\T4, \T1, \T1			# the result is in T1
2341
2342		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
2343		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
2344		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
2345		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
2346		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
2347		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
2348		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
2349		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
2350
2351
2352	vpxor	\T1, \XMM1, \XMM1
2353
2354
2355
2356.endm
2357
2358
# GHASH the last 8 ciphertext blocks.
2360.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2361
2362        ## Karatsuba Method
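        ## (note) per block, split A = [A1:A0] and the HashKey power
        ## B = [B1:B0] into 64-bit halves; then
        ##     A*B = A1*B1*x^128 + A0*B0
        ##           + ((A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0)*x^64
        ## so only three VPCLMULQDQ are needed per block.  Below, T6 collects
        ## the high products, T7 the low products and XMM1 the middle terms.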
2363
2364        vmovdqa         HashKey_8(arg1), \T5
2365
2366        vpshufd         $0b01001110, \XMM1, \T2
2367        vpshufd         $0b01001110, \T5, \T3
2368        vpxor           \XMM1, \T2, \T2
2369        vpxor           \T5, \T3, \T3
2370
2371        vpclmulqdq      $0x11, \T5, \XMM1, \T6
2372        vpclmulqdq      $0x00, \T5, \XMM1, \T7
2373
2374        vpclmulqdq      $0x00, \T3, \T2, \XMM1
2375
2376        ######################
2377
2378        vmovdqa         HashKey_7(arg1), \T5
2379        vpshufd         $0b01001110, \XMM2, \T2
2380        vpshufd         $0b01001110, \T5, \T3
2381        vpxor           \XMM2, \T2, \T2
2382        vpxor           \T5, \T3, \T3
2383
2384        vpclmulqdq      $0x11, \T5, \XMM2, \T4
2385        vpxor           \T4, \T6, \T6
2386
2387        vpclmulqdq      $0x00, \T5, \XMM2, \T4
2388        vpxor           \T4, \T7, \T7
2389
2390        vpclmulqdq      $0x00, \T3, \T2, \T2
2391
2392        vpxor           \T2, \XMM1, \XMM1
2393
2394        ######################
2395
2396        vmovdqa         HashKey_6(arg1), \T5
2397        vpshufd         $0b01001110, \XMM3, \T2
2398        vpshufd         $0b01001110, \T5, \T3
2399        vpxor           \XMM3, \T2, \T2
2400        vpxor           \T5, \T3, \T3
2401
2402        vpclmulqdq      $0x11, \T5, \XMM3, \T4
2403        vpxor           \T4, \T6, \T6
2404
2405        vpclmulqdq      $0x00, \T5, \XMM3, \T4
2406        vpxor           \T4, \T7, \T7
2407
2408        vpclmulqdq      $0x00, \T3, \T2, \T2
2409
2410        vpxor           \T2, \XMM1, \XMM1
2411
2412        ######################
2413
2414        vmovdqa         HashKey_5(arg1), \T5
2415        vpshufd         $0b01001110, \XMM4, \T2
2416        vpshufd         $0b01001110, \T5, \T3
2417        vpxor           \XMM4, \T2, \T2
2418        vpxor           \T5, \T3, \T3
2419
2420        vpclmulqdq      $0x11, \T5, \XMM4, \T4
2421        vpxor           \T4, \T6, \T6
2422
2423        vpclmulqdq      $0x00, \T5, \XMM4, \T4
2424        vpxor           \T4, \T7, \T7
2425
2426        vpclmulqdq      $0x00, \T3, \T2, \T2
2427
2428        vpxor           \T2, \XMM1, \XMM1
2429
2430        ######################
2431
2432        vmovdqa         HashKey_4(arg1), \T5
2433        vpshufd         $0b01001110, \XMM5, \T2
2434        vpshufd         $0b01001110, \T5, \T3
2435        vpxor           \XMM5, \T2, \T2
2436        vpxor           \T5, \T3, \T3
2437
2438        vpclmulqdq      $0x11, \T5, \XMM5, \T4
2439        vpxor           \T4, \T6, \T6
2440
2441        vpclmulqdq      $0x00, \T5, \XMM5, \T4
2442        vpxor           \T4, \T7, \T7
2443
2444        vpclmulqdq      $0x00, \T3, \T2, \T2
2445
2446        vpxor           \T2, \XMM1, \XMM1
2447
2448        ######################
2449
2450        vmovdqa         HashKey_3(arg1), \T5
2451        vpshufd         $0b01001110, \XMM6, \T2
2452        vpshufd         $0b01001110, \T5, \T3
2453        vpxor           \XMM6, \T2, \T2
2454        vpxor           \T5, \T3, \T3
2455
2456        vpclmulqdq      $0x11, \T5, \XMM6, \T4
2457        vpxor           \T4, \T6, \T6
2458
2459        vpclmulqdq      $0x00, \T5, \XMM6, \T4
2460        vpxor           \T4, \T7, \T7
2461
2462        vpclmulqdq      $0x00, \T3, \T2, \T2
2463
2464        vpxor           \T2, \XMM1, \XMM1
2465
2466        ######################
2467
2468        vmovdqa         HashKey_2(arg1), \T5
2469        vpshufd         $0b01001110, \XMM7, \T2
2470        vpshufd         $0b01001110, \T5, \T3
2471        vpxor           \XMM7, \T2, \T2
2472        vpxor           \T5, \T3, \T3
2473
2474        vpclmulqdq      $0x11, \T5, \XMM7, \T4
2475        vpxor           \T4, \T6, \T6
2476
2477        vpclmulqdq      $0x00, \T5, \XMM7, \T4
2478        vpxor           \T4, \T7, \T7
2479
2480        vpclmulqdq      $0x00, \T3, \T2, \T2
2481
2482        vpxor           \T2, \XMM1, \XMM1
2483
2484        ######################
2485
2486        vmovdqa         HashKey(arg1), \T5
2487        vpshufd         $0b01001110, \XMM8, \T2
2488        vpshufd         $0b01001110, \T5, \T3
2489        vpxor           \XMM8, \T2, \T2
2490        vpxor           \T5, \T3, \T3
2491
2492        vpclmulqdq      $0x11, \T5, \XMM8, \T4
2493        vpxor           \T4, \T6, \T6
2494
2495        vpclmulqdq      $0x00, \T5, \XMM8, \T4
2496        vpxor           \T4, \T7, \T7
2497
2498        vpclmulqdq      $0x00, \T3, \T2, \T2
2499
2500        vpxor           \T2, \XMM1, \XMM1
2501        vpxor           \T6, \XMM1, \XMM1
2502        vpxor           \T7, \XMM1, \T2
2503
2504
2505
2506
2507        vpslldq $8, \T2, \T4
2508        vpsrldq $8, \T2, \T2
2509
2510        vpxor   \T4, \T7, \T7
2511        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
2512						   # accumulated carry-less multiplications
2513
2514        #######################################################################
2515        #first phase of the reduction
2516        vmovdqa         POLY2(%rip), \T3
2517
2518        vpclmulqdq      $0x01, \T7, \T3, \T2
2519        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
2520
2521        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
2522        #######################################################################
2523
2524
2525        #second phase of the reduction
2526        vpclmulqdq      $0x00, \T7, \T3, \T2
2527        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2528
2529        vpclmulqdq      $0x10, \T7, \T3, \T4
2530        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2531
2532        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
2533        #######################################################################
2534        vpxor           \T4, \T6, \T6              # the result is in T6
2535.endm
2536
2537
2538
2539# combined for GCM encrypt and decrypt functions
2540# clobbering all xmm registers
2541# clobbering r10, r11, r12, r13, r14, r15
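# Overall flow (summary added for readability):
#   1. handle the first (len/16) mod 8 blocks with INITIAL_BLOCKS_AVX2
#   2. process the remaining full blocks 8 at a time with
#      GHASH_8_ENCRYPT_8_PARALLEL_AVX2 and fold them with GHASH_LAST_8_AVX2
#   3. encrypt and GHASH the final partial block, if any
#   4. GHASH the len(A)||len(C) block and form the tag as GHASH ^ E(K, Y0)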
2542.macro  GCM_ENC_DEC_AVX2     ENC_DEC
2543
2544        #the number of pushes must equal STACK_OFFSET
2545        push    %r12
2546        push    %r13
2547        push    %r14
2548        push    %r15
2549
2550        mov     %rsp, %r14
2551
2552
2553
2554
2555        sub     $VARIABLE_OFFSET, %rsp
2556        and     $~63, %rsp                         # align rsp to 64 bytes
2557
2558
2559        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
2560
2561        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
2562        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
2563
2564        mov     %r13, %r12
2565        shr     $4, %r12
2566        and     $7, %r12
2567        jz      _initial_num_blocks_is_0\@
2568
2569        cmp     $7, %r12
2570        je      _initial_num_blocks_is_7\@
2571        cmp     $6, %r12
2572        je      _initial_num_blocks_is_6\@
2573        cmp     $5, %r12
2574        je      _initial_num_blocks_is_5\@
2575        cmp     $4, %r12
2576        je      _initial_num_blocks_is_4\@
2577        cmp     $3, %r12
2578        je      _initial_num_blocks_is_3\@
2579        cmp     $2, %r12
2580        je      _initial_num_blocks_is_2\@
2581
2582        jmp     _initial_num_blocks_is_1\@
2583
2584_initial_num_blocks_is_7\@:
2585        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2586        sub     $16*7, %r13
2587        jmp     _initial_blocks_encrypted\@
2588
2589_initial_num_blocks_is_6\@:
2590        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2591        sub     $16*6, %r13
2592        jmp     _initial_blocks_encrypted\@
2593
2594_initial_num_blocks_is_5\@:
2595        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2596        sub     $16*5, %r13
2597        jmp     _initial_blocks_encrypted\@
2598
2599_initial_num_blocks_is_4\@:
2600        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2601        sub     $16*4, %r13
2602        jmp     _initial_blocks_encrypted\@
2603
2604_initial_num_blocks_is_3\@:
2605        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2606        sub     $16*3, %r13
2607        jmp     _initial_blocks_encrypted\@
2608
2609_initial_num_blocks_is_2\@:
2610        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2611        sub     $16*2, %r13
2612        jmp     _initial_blocks_encrypted\@
2613
2614_initial_num_blocks_is_1\@:
2615        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2616        sub     $16*1, %r13
2617        jmp     _initial_blocks_encrypted\@
2618
2619_initial_num_blocks_is_0\@:
2620        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2621
2622
2623_initial_blocks_encrypted\@:
2624        cmp     $0, %r13
2625        je      _zero_cipher_left\@
2626
2627        sub     $128, %r13
2628        je      _eight_cipher_left\@
2629
2630
2631
2632
2633        vmovd   %xmm9, %r15d
2634        and     $255, %r15d
2635        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2636
2637
2638_encrypt_by_8_new\@:
2639        cmp     $(255-8), %r15d
2640        jg      _encrypt_by_8\@
2641
2642
2643
2644        add     $8, %r15b
2645        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2646        add     $128, %r11
2647        sub     $128, %r13
2648        jne     _encrypt_by_8_new\@
2649
2650        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651        jmp     _eight_cipher_left\@
2652
2653_encrypt_by_8\@:
2654        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2655        add     $8, %r15b
2656        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2658        add     $128, %r11
2659        sub     $128, %r13
2660        jne     _encrypt_by_8_new\@
2661
2662        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2663
2664
2665
2666
2667_eight_cipher_left\@:
2668        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2669
2670
2671_zero_cipher_left\@:
2672        cmp     $16, arg4
2673        jl      _only_less_than_16\@
2674
2675        mov     arg4, %r13
2676        and     $15, %r13                            # r13 = (arg4 mod 16)
2677
2678        je      _multiple_of_16_bytes\@
2679
        # handle the last <16 Byte block separately
2681
2682
2683        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
2684        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2686
2687        sub     $16, %r11
2688        add     %r13, %r11
2689        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
2690
2691        lea     SHIFT_MASK+16(%rip), %r12
2692        sub     %r13, %r12                           # adjust the shuffle mask pointer
2693						     # to be able to shift 16-r13 bytes
2694						     # (r13 is the number of bytes in plaintext mod 16)
2695        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
2696        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
2697        jmp     _final_ghash_mul\@
2698
2699_only_less_than_16\@:
2700        # check for 0 length
2701        mov     arg4, %r13
2702        and     $15, %r13                            # r13 = (arg4 mod 16)
2703
2704        je      _multiple_of_16_bytes\@
2705
        # handle the last <16 Byte block separately
2707
2708
2709        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
2710        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
2712
2713
2714        lea     SHIFT_MASK+16(%rip), %r12
2715        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
2716						     # able to shift 16-r13 bytes (r13 is the
2717						     # number of bytes in plaintext mod 16)
2718
2719_get_last_16_byte_loop\@:
2720        movb    (arg3, %r11),  %al
2721        movb    %al,  TMP1 (%rsp , %r11)
2722        add     $1, %r11
2723        cmp     %r13,  %r11
2724        jne     _get_last_16_byte_loop\@
2725
2726        vmovdqu  TMP1(%rsp), %xmm1
2727
2728        sub     $16, %r11
2729
2730_final_ghash_mul\@:
2731        .if  \ENC_DEC ==  DEC
2732        vmovdqa %xmm1, %xmm2
2733        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2734        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2735        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2736        vpand   %xmm1, %xmm2, %xmm2
2737        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738        vpxor   %xmm2, %xmm14, %xmm14
2739	#GHASH computation for the last <16 Byte block
2740        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2741        sub     %r13, %r11
2742        add     $16, %r11
2743        .else
2744        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
2745        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2746        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
2747        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748        vpxor   %xmm9, %xmm14, %xmm14
2749	#GHASH computation for the last <16 Byte block
2750        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2751        sub     %r13, %r11
2752        add     $16, %r11
2753        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
2754        .endif
2755
2756
2757        #############################
2758        # output r13 Bytes
2759        vmovq   %xmm9, %rax
2760        cmp     $8, %r13
2761        jle     _less_than_8_bytes_left\@
2762
2763        mov     %rax, (arg2 , %r11)
2764        add     $8, %r11
2765        vpsrldq $8, %xmm9, %xmm9
2766        vmovq   %xmm9, %rax
2767        sub     $8, %r13
2768
2769_less_than_8_bytes_left\@:
2770        movb    %al, (arg2 , %r11)
2771        add     $1, %r11
2772        shr     $8, %rax
2773        sub     $1, %r13
2774        jne     _less_than_8_bytes_left\@
2775        #############################
2776
2777_multiple_of_16_bytes\@:
2778        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
2779        shl     $3, %r12                             # convert into number of bits
2780        vmovd   %r12d, %xmm15                        # len(A) in xmm15
2781
        shl     $3, arg4                             # len(C) in bits  (*8)
2783        vmovq   arg4, %xmm1
2784        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
2785        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
2786
2787        vpxor   %xmm15, %xmm14, %xmm14
2788        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
2789        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
2790
2791        mov     arg5, %rax                           # rax = *Y0
2792        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
2793
2794        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
2795
2796        vpxor   %xmm14, %xmm9, %xmm9
2797
2798
2799
2800_return_T\@:
2801        mov     arg8, %r10              # r10 = authTag
2802        mov     arg9, %r11              # r11 = auth_tag_len
2803
2804        cmp     $16, %r11
2805        je      _T_16\@
2806
2807        cmp     $8, %r11
2808        jl      _T_4\@
2809
2810_T_8\@:
2811        vmovq   %xmm9, %rax
2812        mov     %rax, (%r10)
2813        add     $8, %r10
2814        sub     $8, %r11
2815        vpsrldq $8, %xmm9, %xmm9
2816        cmp     $0, %r11
2817        je     _return_T_done\@
2818_T_4\@:
2819        vmovd   %xmm9, %eax
2820        mov     %eax, (%r10)
2821        add     $4, %r10
2822        sub     $4, %r11
2823        vpsrldq     $4, %xmm9, %xmm9
2824        cmp     $0, %r11
2825        je     _return_T_done\@
2826_T_123\@:
2827        vmovd     %xmm9, %eax
2828        cmp     $2, %r11
2829        jl     _T_1\@
2830        mov     %ax, (%r10)
2831        cmp     $2, %r11
2832        je     _return_T_done\@
2833        add     $2, %r10
2834        sar     $16, %eax
2835_T_1\@:
2836        mov     %al, (%r10)
2837        jmp     _return_T_done\@
2838
2839_T_16\@:
2840        vmovdqu %xmm9, (%r10)
2841
2842_return_T_done\@:
2843        mov     %r14, %rsp
2844
2845        pop     %r15
2846        pop     %r14
2847        pop     %r13
2848        pop     %r12
2849.endm
2850
2851
2852#############################################################
2853#void   aesni_gcm_precomp_avx_gen4
2854#        (gcm_data     *my_ctx_data,
2855#        u8     *hash_subkey)# /* H, the Hash sub key input.
2856#				Data starts on a 16-byte boundary. */
2857#############################################################
2858ENTRY(aesni_gcm_precomp_avx_gen4)
2859        #the number of pushes must equal STACK_OFFSET
2860        push    %r12
2861        push    %r13
2862        push    %r14
2863        push    %r15
2864
2865        mov     %rsp, %r14
2866
2867
2868
2869        sub     $VARIABLE_OFFSET, %rsp
2870        and     $~63, %rsp                    # align rsp to 64 bytes
2871
2872        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
2873
2874        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
2875        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2876        vmovdqa  %xmm6, %xmm2
2877        vpsllq   $1, %xmm6, %xmm6
2878        vpsrlq   $63, %xmm2, %xmm2
2879        vmovdqa  %xmm2, %xmm1
2880        vpslldq  $8, %xmm2, %xmm2
2881        vpsrldq  $8, %xmm1, %xmm1
2882        vpor     %xmm2, %xmm6, %xmm6
2883        #reduction
2884        vpshufd  $0b00100100, %xmm1, %xmm2
2885        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886        vpand    POLY(%rip), %xmm2, %xmm2
2887        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
2888        #######################################################################
2889        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
2890
2891
2892        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2893
2894        mov     %r14, %rsp
2895
2896        pop     %r15
2897        pop     %r14
2898        pop     %r13
2899        pop     %r12
2900        ret
2901ENDPROC(aesni_gcm_precomp_avx_gen4)
2902
2903
2904###############################################################################
2905#void   aesni_gcm_enc_avx_gen4(
2906#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2907#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
2908#        const   u8 *in, /* Plaintext input */
2909#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
2910#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2911#			(from Security Association) concatenated with 8 byte
2912#			 Initialisation Vector (from IPSec ESP Payload)
2913#			 concatenated with 0x00000001. 16-byte aligned pointer. */
2914#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2915#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2916#        u8      *auth_tag, /* Authenticated Tag output. */
2917#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2918#				Valid values are 16 (most likely), 12 or 8. */
2919###############################################################################
2920ENTRY(aesni_gcm_enc_avx_gen4)
2921        GCM_ENC_DEC_AVX2     ENC
2922	ret
2923ENDPROC(aesni_gcm_enc_avx_gen4)
2924
2925###############################################################################
2926#void   aesni_gcm_dec_avx_gen4(
2927#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
2928#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
2929#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
2931#        u8      *iv, /* Pre-counter block j0: 4 byte salt
2932#			(from Security Association) concatenated with 8 byte
2933#			Initialisation Vector (from IPSec ESP Payload)
2934#			concatenated with 0x00000001. 16-byte aligned pointer. */
2935#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
2936#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2937#        u8      *auth_tag, /* Authenticated Tag output. */
2938#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
2939#				Valid values are 16 (most likely), 12 or 8. */
2940###############################################################################
2941ENTRY(aesni_gcm_dec_avx_gen4)
2942        GCM_ENC_DEC_AVX2     DEC
2943	ret
2944ENDPROC(aesni_gcm_dec_avx_gen4)
2945
2946#endif /* CONFIG_AS_AVX2 */
2947