1######################################################################## 2# Copyright (c) 2013, Intel Corporation 3# 4# This software is available to you under a choice of one of two 5# licenses. You may choose to be licensed under the terms of the GNU 6# General Public License (GPL) Version 2, available from the file 7# COPYING in the main directory of this source tree, or the 8# OpenIB.org BSD license below: 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright 15# notice, this list of conditions and the following disclaimer. 16# 17# * Redistributions in binary form must reproduce the above copyright 18# notice, this list of conditions and the following disclaimer in the 19# documentation and/or other materials provided with the 20# distribution. 21# 22# * Neither the name of the Intel Corporation nor the names of its 23# contributors may be used to endorse or promote products derived from 24# this software without specific prior written permission. 25# 26# 27# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR 31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR 34# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38######################################################################## 39## 40## Authors: 41## Erdinc Ozturk <erdinc.ozturk@intel.com> 42## Vinodh Gopal <vinodh.gopal@intel.com> 43## James Guilford <james.guilford@intel.com> 44## Tim Chen <tim.c.chen@linux.intel.com> 45## 46## References: 47## This code was derived and highly optimized from the code described in paper: 48## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation 49## on Intel Architecture Processors. August, 2010 50## The details of the implementation is explained in: 51## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode 52## on Intel Architecture Processors. October, 2012. 53## 54## Assumptions: 55## 56## 57## 58## iv: 59## 0 1 2 3 60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 62## | Salt (From the SA) | 63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 64## | Initialization Vector | 65## | (This is the sequence number from IPSec header) | 66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 67## | 0x1 | 68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 69## 70## 71## 72## AAD: 73## AAD padded to 128 bits with 0 74## for example, assume AAD is a u32 vector 75## 76## if AAD is 8 bytes: 77## AAD[3] = {A0, A1}# 78## padded AAD in xmm register = {A1 A0 0 0} 79## 80## 0 1 2 3 81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 83## | SPI (A1) | 84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 85## | 32-bit Sequence Number (A0) | 86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 87## | 0x0 | 88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 89## 90## AAD Format with 32-bit Sequence Number 91## 92## if AAD is 12 bytes: 93## AAD[3] = {A0, A1, A2}# 94## padded AAD in xmm register = {A2 A1 A0 0} 95## 96## 0 1 2 3 97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 99## | SPI (A2) | 100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 101## | 64-bit Extended Sequence Number {A1,A0} | 102## | | 103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 104## | 0x0 | 105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 106## 107## AAD Format with 64-bit Extended Sequence Number 108## 109## 110## aadLen: 111## from the definition of the spec, aadLen can only be 8 or 12 bytes. 112## The code additionally supports aadLen of length 16 bytes. 113## 114## TLen: 115## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. 116## 117## poly = x^128 + x^127 + x^126 + x^121 + 1 118## throughout the code, one tab and two tab indentations are used. one tab is 119## for GHASH part, two tabs is for AES part. 120## 121 122#include <linux/linkage.h> 123#include <asm/inst.h> 124 125# constants in mergeable sections, linker can reorder and merge 126.section .rodata.cst16.POLY, "aM", @progbits, 16 127.align 16 128POLY: .octa 0xC2000000000000000000000000000001 129 130.section .rodata.cst16.POLY2, "aM", @progbits, 16 131.align 16 132POLY2: .octa 0xC20000000000000000000001C2000000 133 134.section .rodata.cst16.TWOONE, "aM", @progbits, 16 135.align 16 136TWOONE: .octa 0x00000001000000000000000000000001 137 138.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 139.align 16 140SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F 141 142.section .rodata.cst16.ONE, "aM", @progbits, 16 143.align 16 144ONE: .octa 0x00000000000000000000000000000001 145 146.section .rodata.cst16.ONEf, "aM", @progbits, 16 147.align 16 148ONEf: .octa 0x01000000000000000000000000000000 149 150# order of these constants should not change. 151# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F 152.section .rodata, "a", @progbits 153.align 16 154SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 155ALL_F: .octa 0xffffffffffffffffffffffffffffffff 156 .octa 0x00000000000000000000000000000000 157 158.section .rodata 159.align 16 160.type aad_shift_arr, @object 161.size aad_shift_arr, 272 162aad_shift_arr: 163 .octa 0xffffffffffffffffffffffffffffffff 164 .octa 0xffffffffffffffffffffffffffffff0C 165 .octa 0xffffffffffffffffffffffffffff0D0C 166 .octa 0xffffffffffffffffffffffffff0E0D0C 167 .octa 0xffffffffffffffffffffffff0F0E0D0C 168 .octa 0xffffffffffffffffffffff0C0B0A0908 169 .octa 0xffffffffffffffffffff0D0C0B0A0908 170 .octa 0xffffffffffffffffff0E0D0C0B0A0908 171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 172 .octa 0xffffffffffffff0C0B0A090807060504 173 .octa 0xffffffffffff0D0C0B0A090807060504 174 .octa 0xffffffffff0E0D0C0B0A090807060504 175 .octa 0xffffffff0F0E0D0C0B0A090807060504 176 .octa 0xffffff0C0B0A09080706050403020100 177 .octa 0xffff0D0C0B0A09080706050403020100 178 .octa 0xff0E0D0C0B0A09080706050403020100 179 .octa 0x0F0E0D0C0B0A09080706050403020100 180 181 182.text 183 184 185##define the fields of the gcm aes context 186#{ 187# u8 expanded_keys[16*11] store expanded keys 188# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here 189# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here 190# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here 191# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here 192# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here 193# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here 194# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here 195# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here 196# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) 197# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) 198# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) 199# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) 200# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) 201# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) 202# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) 203# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) 204#} gcm_ctx# 205 206HashKey = 16*11 # store HashKey <<1 mod poly here 207HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here 208HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here 209HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here 210HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here 211HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here 212HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here 213HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here 214HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) 215HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) 216HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) 217HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) 218HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) 219HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) 220HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) 221HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) 222 223#define arg1 %rdi 224#define arg2 %rsi 225#define arg3 %rdx 226#define arg4 %rcx 227#define arg5 %r8 228#define arg6 %r9 229#define arg7 STACK_OFFSET+8*1(%r14) 230#define arg8 STACK_OFFSET+8*2(%r14) 231#define arg9 STACK_OFFSET+8*3(%r14) 232 233i = 0 234j = 0 235 236out_order = 0 237in_order = 1 238DEC = 0 239ENC = 1 240 241.macro define_reg r n 242reg_\r = %xmm\n 243.endm 244 245.macro setreg 246.altmacro 247define_reg i %i 248define_reg j %j 249.noaltmacro 250.endm 251 252# need to push 4 registers into stack to maintain 253STACK_OFFSET = 8*4 254 255TMP1 = 16*0 # Temporary storage for AAD 256TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) 257TMP3 = 16*2 # Temporary storage for AES State 3 258TMP4 = 16*3 # Temporary storage for AES State 4 259TMP5 = 16*4 # Temporary storage for AES State 5 260TMP6 = 16*5 # Temporary storage for AES State 6 261TMP7 = 16*6 # Temporary storage for AES State 7 262TMP8 = 16*7 # Temporary storage for AES State 8 263 264VARIABLE_OFFSET = 16*8 265 266################################ 267# Utility Macros 268################################ 269 270# Encryption of a single block 271.macro ENCRYPT_SINGLE_BLOCK XMM0 272 vpxor (arg1), \XMM0, \XMM0 273 i = 1 274 setreg 275.rep 9 276 vaesenc 16*i(arg1), \XMM0, \XMM0 277 i = (i+1) 278 setreg 279.endr 280 vaesenclast 16*10(arg1), \XMM0, \XMM0 281.endm 282 283#ifdef CONFIG_AS_AVX 284############################################################################### 285# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 286# Input: A and B (128-bits each, bit-reflected) 287# Output: C = A*B*x mod poly, (i.e. >>1 ) 288# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 289# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 290############################################################################### 291.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 292 293 vpshufd $0b01001110, \GH, \T2 294 vpshufd $0b01001110, \HK, \T3 295 vpxor \GH , \T2, \T2 # T2 = (a1+a0) 296 vpxor \HK , \T3, \T3 # T3 = (b1+b0) 297 298 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 299 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 300 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) 301 vpxor \GH, \T2,\T2 302 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 303 304 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs 305 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs 306 vpxor \T3, \GH, \GH 307 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK 308 309 #first phase of the reduction 310 vpslld $31, \GH, \T2 # packed right shifting << 31 311 vpslld $30, \GH, \T3 # packed right shifting shift << 30 312 vpslld $25, \GH, \T4 # packed right shifting shift << 25 313 314 vpxor \T3, \T2, \T2 # xor the shifted versions 315 vpxor \T4, \T2, \T2 316 317 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW 318 319 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 320 vpxor \T2, \GH, \GH # first phase of the reduction complete 321 322 #second phase of the reduction 323 324 vpsrld $1,\GH, \T2 # packed left shifting >> 1 325 vpsrld $2,\GH, \T3 # packed left shifting >> 2 326 vpsrld $7,\GH, \T4 # packed left shifting >> 7 327 vpxor \T3, \T2, \T2 # xor the shifted versions 328 vpxor \T4, \T2, \T2 329 330 vpxor \T5, \T2, \T2 331 vpxor \T2, \GH, \GH 332 vpxor \T1, \GH, \GH # the result is in GH 333 334 335.endm 336 337.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 338 339 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 340 vmovdqa \HK, \T5 341 342 vpshufd $0b01001110, \T5, \T1 343 vpxor \T5, \T1, \T1 344 vmovdqa \T1, HashKey_k(arg1) 345 346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 347 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly 348 vpshufd $0b01001110, \T5, \T1 349 vpxor \T5, \T1, \T1 350 vmovdqa \T1, HashKey_2_k(arg1) 351 352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 353 vmovdqa \T5, HashKey_3(arg1) 354 vpshufd $0b01001110, \T5, \T1 355 vpxor \T5, \T1, \T1 356 vmovdqa \T1, HashKey_3_k(arg1) 357 358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 359 vmovdqa \T5, HashKey_4(arg1) 360 vpshufd $0b01001110, \T5, \T1 361 vpxor \T5, \T1, \T1 362 vmovdqa \T1, HashKey_4_k(arg1) 363 364 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 365 vmovdqa \T5, HashKey_5(arg1) 366 vpshufd $0b01001110, \T5, \T1 367 vpxor \T5, \T1, \T1 368 vmovdqa \T1, HashKey_5_k(arg1) 369 370 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 371 vmovdqa \T5, HashKey_6(arg1) 372 vpshufd $0b01001110, \T5, \T1 373 vpxor \T5, \T1, \T1 374 vmovdqa \T1, HashKey_6_k(arg1) 375 376 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 377 vmovdqa \T5, HashKey_7(arg1) 378 vpshufd $0b01001110, \T5, \T1 379 vpxor \T5, \T1, \T1 380 vmovdqa \T1, HashKey_7_k(arg1) 381 382 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 383 vmovdqa \T5, HashKey_8(arg1) 384 vpshufd $0b01001110, \T5, \T1 385 vpxor \T5, \T1, \T1 386 vmovdqa \T1, HashKey_8_k(arg1) 387 388.endm 389 390## if a = number of total plaintext bytes 391## b = floor(a/16) 392## num_initial_blocks = b mod 4# 393## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 394## r10, r11, r12, rax are clobbered 395## arg1, arg2, arg3, r14 are used as a pointer only, not modified 396 397.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC 398 i = (8-\num_initial_blocks) 399 j = 0 400 setreg 401 402 mov arg6, %r10 # r10 = AAD 403 mov arg7, %r12 # r12 = aadLen 404 405 406 mov %r12, %r11 407 408 vpxor reg_j, reg_j, reg_j 409 vpxor reg_i, reg_i, reg_i 410 cmp $16, %r11 411 jl _get_AAD_rest8\@ 412_get_AAD_blocks\@: 413 vmovdqu (%r10), reg_i 414 vpshufb SHUF_MASK(%rip), reg_i, reg_i 415 vpxor reg_i, reg_j, reg_j 416 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 417 add $16, %r10 418 sub $16, %r12 419 sub $16, %r11 420 cmp $16, %r11 421 jge _get_AAD_blocks\@ 422 vmovdqu reg_j, reg_i 423 cmp $0, %r11 424 je _get_AAD_done\@ 425 426 vpxor reg_i, reg_i, reg_i 427 428 /* read the last <16B of AAD. since we have at least 4B of 429 data right after the AAD (the ICV, and maybe some CT), we can 430 read 4B/8B blocks safely, and then get rid of the extra stuff */ 431_get_AAD_rest8\@: 432 cmp $4, %r11 433 jle _get_AAD_rest4\@ 434 movq (%r10), \T1 435 add $8, %r10 436 sub $8, %r11 437 vpslldq $8, \T1, \T1 438 vpsrldq $8, reg_i, reg_i 439 vpxor \T1, reg_i, reg_i 440 jmp _get_AAD_rest8\@ 441_get_AAD_rest4\@: 442 cmp $0, %r11 443 jle _get_AAD_rest0\@ 444 mov (%r10), %eax 445 movq %rax, \T1 446 add $4, %r10 447 sub $4, %r11 448 vpslldq $12, \T1, \T1 449 vpsrldq $4, reg_i, reg_i 450 vpxor \T1, reg_i, reg_i 451_get_AAD_rest0\@: 452 /* finalize: shift out the extra bytes we read, and align 453 left. since pslldq can only shift by an immediate, we use 454 vpshufb and an array of shuffle masks */ 455 movq %r12, %r11 456 salq $4, %r11 457 movdqu aad_shift_arr(%r11), \T1 458 vpshufb \T1, reg_i, reg_i 459_get_AAD_rest_final\@: 460 vpshufb SHUF_MASK(%rip), reg_i, reg_i 461 vpxor reg_j, reg_i, reg_i 462 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 463 464_get_AAD_done\@: 465 # initialize the data pointer offset as zero 466 xor %r11d, %r11d 467 468 # start AES for num_initial_blocks blocks 469 mov arg5, %rax # rax = *Y0 470 vmovdqu (%rax), \CTR # CTR = Y0 471 vpshufb SHUF_MASK(%rip), \CTR, \CTR 472 473 474 i = (9-\num_initial_blocks) 475 setreg 476.rep \num_initial_blocks 477 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 478 vmovdqa \CTR, reg_i 479 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 480 i = (i+1) 481 setreg 482.endr 483 484 vmovdqa (arg1), \T_key 485 i = (9-\num_initial_blocks) 486 setreg 487.rep \num_initial_blocks 488 vpxor \T_key, reg_i, reg_i 489 i = (i+1) 490 setreg 491.endr 492 493 j = 1 494 setreg 495.rep 9 496 vmovdqa 16*j(arg1), \T_key 497 i = (9-\num_initial_blocks) 498 setreg 499.rep \num_initial_blocks 500 vaesenc \T_key, reg_i, reg_i 501 i = (i+1) 502 setreg 503.endr 504 505 j = (j+1) 506 setreg 507.endr 508 509 510 vmovdqa 16*10(arg1), \T_key 511 i = (9-\num_initial_blocks) 512 setreg 513.rep \num_initial_blocks 514 vaesenclast \T_key, reg_i, reg_i 515 i = (i+1) 516 setreg 517.endr 518 519 i = (9-\num_initial_blocks) 520 setreg 521.rep \num_initial_blocks 522 vmovdqu (arg3, %r11), \T1 523 vpxor \T1, reg_i, reg_i 524 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks 525 add $16, %r11 526.if \ENC_DEC == DEC 527 vmovdqa \T1, reg_i 528.endif 529 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 530 i = (i+1) 531 setreg 532.endr 533 534 535 i = (8-\num_initial_blocks) 536 j = (9-\num_initial_blocks) 537 setreg 538 539.rep \num_initial_blocks 540 vpxor reg_i, reg_j, reg_j 541 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 542 i = (i+1) 543 j = (j+1) 544 setreg 545.endr 546 # XMM8 has the combined result here 547 548 vmovdqa \XMM8, TMP1(%rsp) 549 vmovdqa \XMM8, \T3 550 551 cmp $128, %r13 552 jl _initial_blocks_done\@ # no need for precomputed constants 553 554############################################################################### 555# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 556 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 557 vmovdqa \CTR, \XMM1 558 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 559 560 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 561 vmovdqa \CTR, \XMM2 562 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 563 564 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 565 vmovdqa \CTR, \XMM3 566 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 567 568 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 569 vmovdqa \CTR, \XMM4 570 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 571 572 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 573 vmovdqa \CTR, \XMM5 574 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 575 576 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 577 vmovdqa \CTR, \XMM6 578 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 579 580 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 581 vmovdqa \CTR, \XMM7 582 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 583 584 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 585 vmovdqa \CTR, \XMM8 586 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 587 588 vmovdqa (arg1), \T_key 589 vpxor \T_key, \XMM1, \XMM1 590 vpxor \T_key, \XMM2, \XMM2 591 vpxor \T_key, \XMM3, \XMM3 592 vpxor \T_key, \XMM4, \XMM4 593 vpxor \T_key, \XMM5, \XMM5 594 vpxor \T_key, \XMM6, \XMM6 595 vpxor \T_key, \XMM7, \XMM7 596 vpxor \T_key, \XMM8, \XMM8 597 598 i = 1 599 setreg 600.rep 9 # do 9 rounds 601 vmovdqa 16*i(arg1), \T_key 602 vaesenc \T_key, \XMM1, \XMM1 603 vaesenc \T_key, \XMM2, \XMM2 604 vaesenc \T_key, \XMM3, \XMM3 605 vaesenc \T_key, \XMM4, \XMM4 606 vaesenc \T_key, \XMM5, \XMM5 607 vaesenc \T_key, \XMM6, \XMM6 608 vaesenc \T_key, \XMM7, \XMM7 609 vaesenc \T_key, \XMM8, \XMM8 610 i = (i+1) 611 setreg 612.endr 613 614 615 vmovdqa 16*i(arg1), \T_key 616 vaesenclast \T_key, \XMM1, \XMM1 617 vaesenclast \T_key, \XMM2, \XMM2 618 vaesenclast \T_key, \XMM3, \XMM3 619 vaesenclast \T_key, \XMM4, \XMM4 620 vaesenclast \T_key, \XMM5, \XMM5 621 vaesenclast \T_key, \XMM6, \XMM6 622 vaesenclast \T_key, \XMM7, \XMM7 623 vaesenclast \T_key, \XMM8, \XMM8 624 625 vmovdqu (arg3, %r11), \T1 626 vpxor \T1, \XMM1, \XMM1 627 vmovdqu \XMM1, (arg2 , %r11) 628 .if \ENC_DEC == DEC 629 vmovdqa \T1, \XMM1 630 .endif 631 632 vmovdqu 16*1(arg3, %r11), \T1 633 vpxor \T1, \XMM2, \XMM2 634 vmovdqu \XMM2, 16*1(arg2 , %r11) 635 .if \ENC_DEC == DEC 636 vmovdqa \T1, \XMM2 637 .endif 638 639 vmovdqu 16*2(arg3, %r11), \T1 640 vpxor \T1, \XMM3, \XMM3 641 vmovdqu \XMM3, 16*2(arg2 , %r11) 642 .if \ENC_DEC == DEC 643 vmovdqa \T1, \XMM3 644 .endif 645 646 vmovdqu 16*3(arg3, %r11), \T1 647 vpxor \T1, \XMM4, \XMM4 648 vmovdqu \XMM4, 16*3(arg2 , %r11) 649 .if \ENC_DEC == DEC 650 vmovdqa \T1, \XMM4 651 .endif 652 653 vmovdqu 16*4(arg3, %r11), \T1 654 vpxor \T1, \XMM5, \XMM5 655 vmovdqu \XMM5, 16*4(arg2 , %r11) 656 .if \ENC_DEC == DEC 657 vmovdqa \T1, \XMM5 658 .endif 659 660 vmovdqu 16*5(arg3, %r11), \T1 661 vpxor \T1, \XMM6, \XMM6 662 vmovdqu \XMM6, 16*5(arg2 , %r11) 663 .if \ENC_DEC == DEC 664 vmovdqa \T1, \XMM6 665 .endif 666 667 vmovdqu 16*6(arg3, %r11), \T1 668 vpxor \T1, \XMM7, \XMM7 669 vmovdqu \XMM7, 16*6(arg2 , %r11) 670 .if \ENC_DEC == DEC 671 vmovdqa \T1, \XMM7 672 .endif 673 674 vmovdqu 16*7(arg3, %r11), \T1 675 vpxor \T1, \XMM8, \XMM8 676 vmovdqu \XMM8, 16*7(arg2 , %r11) 677 .if \ENC_DEC == DEC 678 vmovdqa \T1, \XMM8 679 .endif 680 681 add $128, %r11 682 683 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 684 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext 685 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 686 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 687 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 688 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 689 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 690 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 691 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 692 693############################################################################### 694 695_initial_blocks_done\@: 696 697.endm 698 699# encrypt 8 blocks at a time 700# ghash the 8 previously encrypted ciphertext blocks 701# arg1, arg2, arg3 are used as pointers only, not modified 702# r11 is the data offset value 703.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 704 705 vmovdqa \XMM1, \T2 706 vmovdqa \XMM2, TMP2(%rsp) 707 vmovdqa \XMM3, TMP3(%rsp) 708 vmovdqa \XMM4, TMP4(%rsp) 709 vmovdqa \XMM5, TMP5(%rsp) 710 vmovdqa \XMM6, TMP6(%rsp) 711 vmovdqa \XMM7, TMP7(%rsp) 712 vmovdqa \XMM8, TMP8(%rsp) 713 714.if \loop_idx == in_order 715 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 716 vpaddd ONE(%rip), \XMM1, \XMM2 717 vpaddd ONE(%rip), \XMM2, \XMM3 718 vpaddd ONE(%rip), \XMM3, \XMM4 719 vpaddd ONE(%rip), \XMM4, \XMM5 720 vpaddd ONE(%rip), \XMM5, \XMM6 721 vpaddd ONE(%rip), \XMM6, \XMM7 722 vpaddd ONE(%rip), \XMM7, \XMM8 723 vmovdqa \XMM8, \CTR 724 725 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 726 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 727 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 728 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 729 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 730 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 731 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 732 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 733.else 734 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 735 vpaddd ONEf(%rip), \XMM1, \XMM2 736 vpaddd ONEf(%rip), \XMM2, \XMM3 737 vpaddd ONEf(%rip), \XMM3, \XMM4 738 vpaddd ONEf(%rip), \XMM4, \XMM5 739 vpaddd ONEf(%rip), \XMM5, \XMM6 740 vpaddd ONEf(%rip), \XMM6, \XMM7 741 vpaddd ONEf(%rip), \XMM7, \XMM8 742 vmovdqa \XMM8, \CTR 743.endif 744 745 746 ####################################################################### 747 748 vmovdqu (arg1), \T1 749 vpxor \T1, \XMM1, \XMM1 750 vpxor \T1, \XMM2, \XMM2 751 vpxor \T1, \XMM3, \XMM3 752 vpxor \T1, \XMM4, \XMM4 753 vpxor \T1, \XMM5, \XMM5 754 vpxor \T1, \XMM6, \XMM6 755 vpxor \T1, \XMM7, \XMM7 756 vpxor \T1, \XMM8, \XMM8 757 758 ####################################################################### 759 760 761 762 763 764 vmovdqu 16*1(arg1), \T1 765 vaesenc \T1, \XMM1, \XMM1 766 vaesenc \T1, \XMM2, \XMM2 767 vaesenc \T1, \XMM3, \XMM3 768 vaesenc \T1, \XMM4, \XMM4 769 vaesenc \T1, \XMM5, \XMM5 770 vaesenc \T1, \XMM6, \XMM6 771 vaesenc \T1, \XMM7, \XMM7 772 vaesenc \T1, \XMM8, \XMM8 773 774 vmovdqu 16*2(arg1), \T1 775 vaesenc \T1, \XMM1, \XMM1 776 vaesenc \T1, \XMM2, \XMM2 777 vaesenc \T1, \XMM3, \XMM3 778 vaesenc \T1, \XMM4, \XMM4 779 vaesenc \T1, \XMM5, \XMM5 780 vaesenc \T1, \XMM6, \XMM6 781 vaesenc \T1, \XMM7, \XMM7 782 vaesenc \T1, \XMM8, \XMM8 783 784 785 ####################################################################### 786 787 vmovdqa HashKey_8(arg1), \T5 788 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 789 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 790 791 vpshufd $0b01001110, \T2, \T6 792 vpxor \T2, \T6, \T6 793 794 vmovdqa HashKey_8_k(arg1), \T5 795 vpclmulqdq $0x00, \T5, \T6, \T6 796 797 vmovdqu 16*3(arg1), \T1 798 vaesenc \T1, \XMM1, \XMM1 799 vaesenc \T1, \XMM2, \XMM2 800 vaesenc \T1, \XMM3, \XMM3 801 vaesenc \T1, \XMM4, \XMM4 802 vaesenc \T1, \XMM5, \XMM5 803 vaesenc \T1, \XMM6, \XMM6 804 vaesenc \T1, \XMM7, \XMM7 805 vaesenc \T1, \XMM8, \XMM8 806 807 vmovdqa TMP2(%rsp), \T1 808 vmovdqa HashKey_7(arg1), \T5 809 vpclmulqdq $0x11, \T5, \T1, \T3 810 vpxor \T3, \T4, \T4 811 vpclmulqdq $0x00, \T5, \T1, \T3 812 vpxor \T3, \T7, \T7 813 814 vpshufd $0b01001110, \T1, \T3 815 vpxor \T1, \T3, \T3 816 vmovdqa HashKey_7_k(arg1), \T5 817 vpclmulqdq $0x10, \T5, \T3, \T3 818 vpxor \T3, \T6, \T6 819 820 vmovdqu 16*4(arg1), \T1 821 vaesenc \T1, \XMM1, \XMM1 822 vaesenc \T1, \XMM2, \XMM2 823 vaesenc \T1, \XMM3, \XMM3 824 vaesenc \T1, \XMM4, \XMM4 825 vaesenc \T1, \XMM5, \XMM5 826 vaesenc \T1, \XMM6, \XMM6 827 vaesenc \T1, \XMM7, \XMM7 828 vaesenc \T1, \XMM8, \XMM8 829 830 ####################################################################### 831 832 vmovdqa TMP3(%rsp), \T1 833 vmovdqa HashKey_6(arg1), \T5 834 vpclmulqdq $0x11, \T5, \T1, \T3 835 vpxor \T3, \T4, \T4 836 vpclmulqdq $0x00, \T5, \T1, \T3 837 vpxor \T3, \T7, \T7 838 839 vpshufd $0b01001110, \T1, \T3 840 vpxor \T1, \T3, \T3 841 vmovdqa HashKey_6_k(arg1), \T5 842 vpclmulqdq $0x10, \T5, \T3, \T3 843 vpxor \T3, \T6, \T6 844 845 vmovdqu 16*5(arg1), \T1 846 vaesenc \T1, \XMM1, \XMM1 847 vaesenc \T1, \XMM2, \XMM2 848 vaesenc \T1, \XMM3, \XMM3 849 vaesenc \T1, \XMM4, \XMM4 850 vaesenc \T1, \XMM5, \XMM5 851 vaesenc \T1, \XMM6, \XMM6 852 vaesenc \T1, \XMM7, \XMM7 853 vaesenc \T1, \XMM8, \XMM8 854 855 vmovdqa TMP4(%rsp), \T1 856 vmovdqa HashKey_5(arg1), \T5 857 vpclmulqdq $0x11, \T5, \T1, \T3 858 vpxor \T3, \T4, \T4 859 vpclmulqdq $0x00, \T5, \T1, \T3 860 vpxor \T3, \T7, \T7 861 862 vpshufd $0b01001110, \T1, \T3 863 vpxor \T1, \T3, \T3 864 vmovdqa HashKey_5_k(arg1), \T5 865 vpclmulqdq $0x10, \T5, \T3, \T3 866 vpxor \T3, \T6, \T6 867 868 vmovdqu 16*6(arg1), \T1 869 vaesenc \T1, \XMM1, \XMM1 870 vaesenc \T1, \XMM2, \XMM2 871 vaesenc \T1, \XMM3, \XMM3 872 vaesenc \T1, \XMM4, \XMM4 873 vaesenc \T1, \XMM5, \XMM5 874 vaesenc \T1, \XMM6, \XMM6 875 vaesenc \T1, \XMM7, \XMM7 876 vaesenc \T1, \XMM8, \XMM8 877 878 879 vmovdqa TMP5(%rsp), \T1 880 vmovdqa HashKey_4(arg1), \T5 881 vpclmulqdq $0x11, \T5, \T1, \T3 882 vpxor \T3, \T4, \T4 883 vpclmulqdq $0x00, \T5, \T1, \T3 884 vpxor \T3, \T7, \T7 885 886 vpshufd $0b01001110, \T1, \T3 887 vpxor \T1, \T3, \T3 888 vmovdqa HashKey_4_k(arg1), \T5 889 vpclmulqdq $0x10, \T5, \T3, \T3 890 vpxor \T3, \T6, \T6 891 892 vmovdqu 16*7(arg1), \T1 893 vaesenc \T1, \XMM1, \XMM1 894 vaesenc \T1, \XMM2, \XMM2 895 vaesenc \T1, \XMM3, \XMM3 896 vaesenc \T1, \XMM4, \XMM4 897 vaesenc \T1, \XMM5, \XMM5 898 vaesenc \T1, \XMM6, \XMM6 899 vaesenc \T1, \XMM7, \XMM7 900 vaesenc \T1, \XMM8, \XMM8 901 902 vmovdqa TMP6(%rsp), \T1 903 vmovdqa HashKey_3(arg1), \T5 904 vpclmulqdq $0x11, \T5, \T1, \T3 905 vpxor \T3, \T4, \T4 906 vpclmulqdq $0x00, \T5, \T1, \T3 907 vpxor \T3, \T7, \T7 908 909 vpshufd $0b01001110, \T1, \T3 910 vpxor \T1, \T3, \T3 911 vmovdqa HashKey_3_k(arg1), \T5 912 vpclmulqdq $0x10, \T5, \T3, \T3 913 vpxor \T3, \T6, \T6 914 915 916 vmovdqu 16*8(arg1), \T1 917 vaesenc \T1, \XMM1, \XMM1 918 vaesenc \T1, \XMM2, \XMM2 919 vaesenc \T1, \XMM3, \XMM3 920 vaesenc \T1, \XMM4, \XMM4 921 vaesenc \T1, \XMM5, \XMM5 922 vaesenc \T1, \XMM6, \XMM6 923 vaesenc \T1, \XMM7, \XMM7 924 vaesenc \T1, \XMM8, \XMM8 925 926 vmovdqa TMP7(%rsp), \T1 927 vmovdqa HashKey_2(arg1), \T5 928 vpclmulqdq $0x11, \T5, \T1, \T3 929 vpxor \T3, \T4, \T4 930 vpclmulqdq $0x00, \T5, \T1, \T3 931 vpxor \T3, \T7, \T7 932 933 vpshufd $0b01001110, \T1, \T3 934 vpxor \T1, \T3, \T3 935 vmovdqa HashKey_2_k(arg1), \T5 936 vpclmulqdq $0x10, \T5, \T3, \T3 937 vpxor \T3, \T6, \T6 938 939 ####################################################################### 940 941 vmovdqu 16*9(arg1), \T5 942 vaesenc \T5, \XMM1, \XMM1 943 vaesenc \T5, \XMM2, \XMM2 944 vaesenc \T5, \XMM3, \XMM3 945 vaesenc \T5, \XMM4, \XMM4 946 vaesenc \T5, \XMM5, \XMM5 947 vaesenc \T5, \XMM6, \XMM6 948 vaesenc \T5, \XMM7, \XMM7 949 vaesenc \T5, \XMM8, \XMM8 950 951 vmovdqa TMP8(%rsp), \T1 952 vmovdqa HashKey(arg1), \T5 953 vpclmulqdq $0x11, \T5, \T1, \T3 954 vpxor \T3, \T4, \T4 955 vpclmulqdq $0x00, \T5, \T1, \T3 956 vpxor \T3, \T7, \T7 957 958 vpshufd $0b01001110, \T1, \T3 959 vpxor \T1, \T3, \T3 960 vmovdqa HashKey_k(arg1), \T5 961 vpclmulqdq $0x10, \T5, \T3, \T3 962 vpxor \T3, \T6, \T6 963 964 vpxor \T4, \T6, \T6 965 vpxor \T7, \T6, \T6 966 967 vmovdqu 16*10(arg1), \T5 968 969 i = 0 970 j = 1 971 setreg 972.rep 8 973 vpxor 16*i(arg3, %r11), \T5, \T2 974 .if \ENC_DEC == ENC 975 vaesenclast \T2, reg_j, reg_j 976 .else 977 vaesenclast \T2, reg_j, \T3 978 vmovdqu 16*i(arg3, %r11), reg_j 979 vmovdqu \T3, 16*i(arg2, %r11) 980 .endif 981 i = (i+1) 982 j = (j+1) 983 setreg 984.endr 985 ####################################################################### 986 987 988 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 989 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 990 vpxor \T3, \T7, \T7 991 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 992 993 994 995 ####################################################################### 996 #first phase of the reduction 997 ####################################################################### 998 vpslld $31, \T7, \T2 # packed right shifting << 31 999 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1000 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1001 1002 vpxor \T3, \T2, \T2 # xor the shifted versions 1003 vpxor \T4, \T2, \T2 1004 1005 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1006 1007 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1008 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1009 ####################################################################### 1010 .if \ENC_DEC == ENC 1011 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer 1012 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer 1013 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer 1014 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer 1015 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer 1016 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer 1017 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer 1018 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer 1019 .endif 1020 1021 ####################################################################### 1022 #second phase of the reduction 1023 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1024 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1025 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1026 vpxor \T3, \T2, \T2 # xor the shifted versions 1027 vpxor \T4, \T2, \T2 1028 1029 vpxor \T1, \T2, \T2 1030 vpxor \T2, \T7, \T7 1031 vpxor \T7, \T6, \T6 # the result is in T6 1032 ####################################################################### 1033 1034 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1035 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1036 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1037 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1038 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1039 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1040 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1041 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1042 1043 1044 vpxor \T6, \XMM1, \XMM1 1045 1046 1047 1048.endm 1049 1050 1051# GHASH the last 4 ciphertext blocks. 1052.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 1053 1054 ## Karatsuba Method 1055 1056 1057 vpshufd $0b01001110, \XMM1, \T2 1058 vpxor \XMM1, \T2, \T2 1059 vmovdqa HashKey_8(arg1), \T5 1060 vpclmulqdq $0x11, \T5, \XMM1, \T6 1061 vpclmulqdq $0x00, \T5, \XMM1, \T7 1062 1063 vmovdqa HashKey_8_k(arg1), \T3 1064 vpclmulqdq $0x00, \T3, \T2, \XMM1 1065 1066 ###################### 1067 1068 vpshufd $0b01001110, \XMM2, \T2 1069 vpxor \XMM2, \T2, \T2 1070 vmovdqa HashKey_7(arg1), \T5 1071 vpclmulqdq $0x11, \T5, \XMM2, \T4 1072 vpxor \T4, \T6, \T6 1073 1074 vpclmulqdq $0x00, \T5, \XMM2, \T4 1075 vpxor \T4, \T7, \T7 1076 1077 vmovdqa HashKey_7_k(arg1), \T3 1078 vpclmulqdq $0x00, \T3, \T2, \T2 1079 vpxor \T2, \XMM1, \XMM1 1080 1081 ###################### 1082 1083 vpshufd $0b01001110, \XMM3, \T2 1084 vpxor \XMM3, \T2, \T2 1085 vmovdqa HashKey_6(arg1), \T5 1086 vpclmulqdq $0x11, \T5, \XMM3, \T4 1087 vpxor \T4, \T6, \T6 1088 1089 vpclmulqdq $0x00, \T5, \XMM3, \T4 1090 vpxor \T4, \T7, \T7 1091 1092 vmovdqa HashKey_6_k(arg1), \T3 1093 vpclmulqdq $0x00, \T3, \T2, \T2 1094 vpxor \T2, \XMM1, \XMM1 1095 1096 ###################### 1097 1098 vpshufd $0b01001110, \XMM4, \T2 1099 vpxor \XMM4, \T2, \T2 1100 vmovdqa HashKey_5(arg1), \T5 1101 vpclmulqdq $0x11, \T5, \XMM4, \T4 1102 vpxor \T4, \T6, \T6 1103 1104 vpclmulqdq $0x00, \T5, \XMM4, \T4 1105 vpxor \T4, \T7, \T7 1106 1107 vmovdqa HashKey_5_k(arg1), \T3 1108 vpclmulqdq $0x00, \T3, \T2, \T2 1109 vpxor \T2, \XMM1, \XMM1 1110 1111 ###################### 1112 1113 vpshufd $0b01001110, \XMM5, \T2 1114 vpxor \XMM5, \T2, \T2 1115 vmovdqa HashKey_4(arg1), \T5 1116 vpclmulqdq $0x11, \T5, \XMM5, \T4 1117 vpxor \T4, \T6, \T6 1118 1119 vpclmulqdq $0x00, \T5, \XMM5, \T4 1120 vpxor \T4, \T7, \T7 1121 1122 vmovdqa HashKey_4_k(arg1), \T3 1123 vpclmulqdq $0x00, \T3, \T2, \T2 1124 vpxor \T2, \XMM1, \XMM1 1125 1126 ###################### 1127 1128 vpshufd $0b01001110, \XMM6, \T2 1129 vpxor \XMM6, \T2, \T2 1130 vmovdqa HashKey_3(arg1), \T5 1131 vpclmulqdq $0x11, \T5, \XMM6, \T4 1132 vpxor \T4, \T6, \T6 1133 1134 vpclmulqdq $0x00, \T5, \XMM6, \T4 1135 vpxor \T4, \T7, \T7 1136 1137 vmovdqa HashKey_3_k(arg1), \T3 1138 vpclmulqdq $0x00, \T3, \T2, \T2 1139 vpxor \T2, \XMM1, \XMM1 1140 1141 ###################### 1142 1143 vpshufd $0b01001110, \XMM7, \T2 1144 vpxor \XMM7, \T2, \T2 1145 vmovdqa HashKey_2(arg1), \T5 1146 vpclmulqdq $0x11, \T5, \XMM7, \T4 1147 vpxor \T4, \T6, \T6 1148 1149 vpclmulqdq $0x00, \T5, \XMM7, \T4 1150 vpxor \T4, \T7, \T7 1151 1152 vmovdqa HashKey_2_k(arg1), \T3 1153 vpclmulqdq $0x00, \T3, \T2, \T2 1154 vpxor \T2, \XMM1, \XMM1 1155 1156 ###################### 1157 1158 vpshufd $0b01001110, \XMM8, \T2 1159 vpxor \XMM8, \T2, \T2 1160 vmovdqa HashKey(arg1), \T5 1161 vpclmulqdq $0x11, \T5, \XMM8, \T4 1162 vpxor \T4, \T6, \T6 1163 1164 vpclmulqdq $0x00, \T5, \XMM8, \T4 1165 vpxor \T4, \T7, \T7 1166 1167 vmovdqa HashKey_k(arg1), \T3 1168 vpclmulqdq $0x00, \T3, \T2, \T2 1169 1170 vpxor \T2, \XMM1, \XMM1 1171 vpxor \T6, \XMM1, \XMM1 1172 vpxor \T7, \XMM1, \T2 1173 1174 1175 1176 1177 vpslldq $8, \T2, \T4 1178 vpsrldq $8, \T2, \T2 1179 1180 vpxor \T4, \T7, \T7 1181 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of 1182 # the accumulated carry-less multiplications 1183 1184 ####################################################################### 1185 #first phase of the reduction 1186 vpslld $31, \T7, \T2 # packed right shifting << 31 1187 vpslld $30, \T7, \T3 # packed right shifting shift << 30 1188 vpslld $25, \T7, \T4 # packed right shifting shift << 25 1189 1190 vpxor \T3, \T2, \T2 # xor the shifted versions 1191 vpxor \T4, \T2, \T2 1192 1193 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW 1194 1195 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs 1196 vpxor \T2, \T7, \T7 # first phase of the reduction complete 1197 ####################################################################### 1198 1199 1200 #second phase of the reduction 1201 vpsrld $1, \T7, \T2 # packed left shifting >> 1 1202 vpsrld $2, \T7, \T3 # packed left shifting >> 2 1203 vpsrld $7, \T7, \T4 # packed left shifting >> 7 1204 vpxor \T3, \T2, \T2 # xor the shifted versions 1205 vpxor \T4, \T2, \T2 1206 1207 vpxor \T1, \T2, \T2 1208 vpxor \T2, \T7, \T7 1209 vpxor \T7, \T6, \T6 # the result is in T6 1210 1211.endm 1212 1213 1214# combined for GCM encrypt and decrypt functions 1215# clobbering all xmm registers 1216# clobbering r10, r11, r12, r13, r14, r15 1217.macro GCM_ENC_DEC_AVX ENC_DEC 1218 1219 #the number of pushes must equal STACK_OFFSET 1220 push %r12 1221 push %r13 1222 push %r14 1223 push %r15 1224 1225 mov %rsp, %r14 1226 1227 1228 1229 1230 sub $VARIABLE_OFFSET, %rsp 1231 and $~63, %rsp # align rsp to 64 bytes 1232 1233 1234 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey 1235 1236 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext 1237 and $-16, %r13 # r13 = r13 - (r13 mod 16) 1238 1239 mov %r13, %r12 1240 shr $4, %r12 1241 and $7, %r12 1242 jz _initial_num_blocks_is_0\@ 1243 1244 cmp $7, %r12 1245 je _initial_num_blocks_is_7\@ 1246 cmp $6, %r12 1247 je _initial_num_blocks_is_6\@ 1248 cmp $5, %r12 1249 je _initial_num_blocks_is_5\@ 1250 cmp $4, %r12 1251 je _initial_num_blocks_is_4\@ 1252 cmp $3, %r12 1253 je _initial_num_blocks_is_3\@ 1254 cmp $2, %r12 1255 je _initial_num_blocks_is_2\@ 1256 1257 jmp _initial_num_blocks_is_1\@ 1258 1259_initial_num_blocks_is_7\@: 1260 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1261 sub $16*7, %r13 1262 jmp _initial_blocks_encrypted\@ 1263 1264_initial_num_blocks_is_6\@: 1265 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1266 sub $16*6, %r13 1267 jmp _initial_blocks_encrypted\@ 1268 1269_initial_num_blocks_is_5\@: 1270 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1271 sub $16*5, %r13 1272 jmp _initial_blocks_encrypted\@ 1273 1274_initial_num_blocks_is_4\@: 1275 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1276 sub $16*4, %r13 1277 jmp _initial_blocks_encrypted\@ 1278 1279_initial_num_blocks_is_3\@: 1280 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1281 sub $16*3, %r13 1282 jmp _initial_blocks_encrypted\@ 1283 1284_initial_num_blocks_is_2\@: 1285 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1286 sub $16*2, %r13 1287 jmp _initial_blocks_encrypted\@ 1288 1289_initial_num_blocks_is_1\@: 1290 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1291 sub $16*1, %r13 1292 jmp _initial_blocks_encrypted\@ 1293 1294_initial_num_blocks_is_0\@: 1295 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 1296 1297 1298_initial_blocks_encrypted\@: 1299 cmp $0, %r13 1300 je _zero_cipher_left\@ 1301 1302 sub $128, %r13 1303 je _eight_cipher_left\@ 1304 1305 1306 1307 1308 vmovd %xmm9, %r15d 1309 and $255, %r15d 1310 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1311 1312 1313_encrypt_by_8_new\@: 1314 cmp $(255-8), %r15d 1315 jg _encrypt_by_8\@ 1316 1317 1318 1319 add $8, %r15b 1320 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 1321 add $128, %r11 1322 sub $128, %r13 1323 jne _encrypt_by_8_new\@ 1324 1325 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1326 jmp _eight_cipher_left\@ 1327 1328_encrypt_by_8\@: 1329 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1330 add $8, %r15b 1331 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 1332 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1333 add $128, %r11 1334 sub $128, %r13 1335 jne _encrypt_by_8_new\@ 1336 1337 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1338 1339 1340 1341 1342_eight_cipher_left\@: 1343 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 1344 1345 1346_zero_cipher_left\@: 1347 cmp $16, arg4 1348 jl _only_less_than_16\@ 1349 1350 mov arg4, %r13 1351 and $15, %r13 # r13 = (arg4 mod 16) 1352 1353 je _multiple_of_16_bytes\@ 1354 1355 # handle the last <16 Byte block seperately 1356 1357 1358 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 1359 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1360 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 1361 1362 sub $16, %r11 1363 add %r13, %r11 1364 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block 1365 1366 lea SHIFT_MASK+16(%rip), %r12 1367 sub %r13, %r12 # adjust the shuffle mask pointer to be 1368 # able to shift 16-r13 bytes (r13 is the 1369 # number of bytes in plaintext mod 16) 1370 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 1371 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes 1372 jmp _final_ghash_mul\@ 1373 1374_only_less_than_16\@: 1375 # check for 0 length 1376 mov arg4, %r13 1377 and $15, %r13 # r13 = (arg4 mod 16) 1378 1379 je _multiple_of_16_bytes\@ 1380 1381 # handle the last <16 Byte block seperately 1382 1383 1384 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 1385 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1386 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 1387 1388 1389 lea SHIFT_MASK+16(%rip), %r12 1390 sub %r13, %r12 # adjust the shuffle mask pointer to be 1391 # able to shift 16-r13 bytes (r13 is the 1392 # number of bytes in plaintext mod 16) 1393 1394_get_last_16_byte_loop\@: 1395 movb (arg3, %r11), %al 1396 movb %al, TMP1 (%rsp , %r11) 1397 add $1, %r11 1398 cmp %r13, %r11 1399 jne _get_last_16_byte_loop\@ 1400 1401 vmovdqu TMP1(%rsp), %xmm1 1402 1403 sub $16, %r11 1404 1405_final_ghash_mul\@: 1406 .if \ENC_DEC == DEC 1407 vmovdqa %xmm1, %xmm2 1408 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 1409 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 1410 # mask out top 16-r13 bytes of xmm9 1411 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 1412 vpand %xmm1, %xmm2, %xmm2 1413 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 1414 vpxor %xmm2, %xmm14, %xmm14 1415 #GHASH computation for the last <16 Byte block 1416 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 1417 sub %r13, %r11 1418 add $16, %r11 1419 .else 1420 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 1421 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to 1422 # mask out top 16-r13 bytes of xmm9 1423 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 1424 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 1425 vpxor %xmm9, %xmm14, %xmm14 1426 #GHASH computation for the last <16 Byte block 1427 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 1428 sub %r13, %r11 1429 add $16, %r11 1430 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 1431 .endif 1432 1433 1434 ############################# 1435 # output r13 Bytes 1436 vmovq %xmm9, %rax 1437 cmp $8, %r13 1438 jle _less_than_8_bytes_left\@ 1439 1440 mov %rax, (arg2 , %r11) 1441 add $8, %r11 1442 vpsrldq $8, %xmm9, %xmm9 1443 vmovq %xmm9, %rax 1444 sub $8, %r13 1445 1446_less_than_8_bytes_left\@: 1447 movb %al, (arg2 , %r11) 1448 add $1, %r11 1449 shr $8, %rax 1450 sub $1, %r13 1451 jne _less_than_8_bytes_left\@ 1452 ############################# 1453 1454_multiple_of_16_bytes\@: 1455 mov arg7, %r12 # r12 = aadLen (number of bytes) 1456 shl $3, %r12 # convert into number of bits 1457 vmovd %r12d, %xmm15 # len(A) in xmm15 1458 1459 shl $3, arg4 # len(C) in bits (*128) 1460 vmovq arg4, %xmm1 1461 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 1462 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 1463 1464 vpxor %xmm15, %xmm14, %xmm14 1465 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 1466 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 1467 1468 mov arg5, %rax # rax = *Y0 1469 vmovdqu (%rax), %xmm9 # xmm9 = Y0 1470 1471 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) 1472 1473 vpxor %xmm14, %xmm9, %xmm9 1474 1475 1476 1477_return_T\@: 1478 mov arg8, %r10 # r10 = authTag 1479 mov arg9, %r11 # r11 = auth_tag_len 1480 1481 cmp $16, %r11 1482 je _T_16\@ 1483 1484 cmp $8, %r11 1485 jl _T_4\@ 1486 1487_T_8\@: 1488 vmovq %xmm9, %rax 1489 mov %rax, (%r10) 1490 add $8, %r10 1491 sub $8, %r11 1492 vpsrldq $8, %xmm9, %xmm9 1493 cmp $0, %r11 1494 je _return_T_done\@ 1495_T_4\@: 1496 vmovd %xmm9, %eax 1497 mov %eax, (%r10) 1498 add $4, %r10 1499 sub $4, %r11 1500 vpsrldq $4, %xmm9, %xmm9 1501 cmp $0, %r11 1502 je _return_T_done\@ 1503_T_123\@: 1504 vmovd %xmm9, %eax 1505 cmp $2, %r11 1506 jl _T_1\@ 1507 mov %ax, (%r10) 1508 cmp $2, %r11 1509 je _return_T_done\@ 1510 add $2, %r10 1511 sar $16, %eax 1512_T_1\@: 1513 mov %al, (%r10) 1514 jmp _return_T_done\@ 1515 1516_T_16\@: 1517 vmovdqu %xmm9, (%r10) 1518 1519_return_T_done\@: 1520 mov %r14, %rsp 1521 1522 pop %r15 1523 pop %r14 1524 pop %r13 1525 pop %r12 1526.endm 1527 1528 1529############################################################# 1530#void aesni_gcm_precomp_avx_gen2 1531# (gcm_data *my_ctx_data, 1532# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ 1533############################################################# 1534ENTRY(aesni_gcm_precomp_avx_gen2) 1535 #the number of pushes must equal STACK_OFFSET 1536 push %r12 1537 push %r13 1538 push %r14 1539 push %r15 1540 1541 mov %rsp, %r14 1542 1543 1544 1545 sub $VARIABLE_OFFSET, %rsp 1546 and $~63, %rsp # align rsp to 64 bytes 1547 1548 vmovdqu (arg2), %xmm6 # xmm6 = HashKey 1549 1550 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 1551 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 1552 vmovdqa %xmm6, %xmm2 1553 vpsllq $1, %xmm6, %xmm6 1554 vpsrlq $63, %xmm2, %xmm2 1555 vmovdqa %xmm2, %xmm1 1556 vpslldq $8, %xmm2, %xmm2 1557 vpsrldq $8, %xmm1, %xmm1 1558 vpor %xmm2, %xmm6, %xmm6 1559 #reduction 1560 vpshufd $0b00100100, %xmm1, %xmm2 1561 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 1562 vpand POLY(%rip), %xmm2, %xmm2 1563 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 1564 ####################################################################### 1565 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly 1566 1567 1568 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 1569 1570 mov %r14, %rsp 1571 1572 pop %r15 1573 pop %r14 1574 pop %r13 1575 pop %r12 1576 ret 1577ENDPROC(aesni_gcm_precomp_avx_gen2) 1578 1579############################################################################### 1580#void aesni_gcm_enc_avx_gen2( 1581# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1582# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 1583# const u8 *in, /* Plaintext input */ 1584# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 1585# u8 *iv, /* Pre-counter block j0: 4 byte salt 1586# (from Security Association) concatenated with 8 byte 1587# Initialisation Vector (from IPSec ESP Payload) 1588# concatenated with 0x00000001. 16-byte aligned pointer. */ 1589# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1590# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1591# u8 *auth_tag, /* Authenticated Tag output. */ 1592# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1593# Valid values are 16 (most likely), 12 or 8. */ 1594############################################################################### 1595ENTRY(aesni_gcm_enc_avx_gen2) 1596 GCM_ENC_DEC_AVX ENC 1597 ret 1598ENDPROC(aesni_gcm_enc_avx_gen2) 1599 1600############################################################################### 1601#void aesni_gcm_dec_avx_gen2( 1602# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 1603# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 1604# const u8 *in, /* Ciphertext input */ 1605# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 1606# u8 *iv, /* Pre-counter block j0: 4 byte salt 1607# (from Security Association) concatenated with 8 byte 1608# Initialisation Vector (from IPSec ESP Payload) 1609# concatenated with 0x00000001. 16-byte aligned pointer. */ 1610# const u8 *aad, /* Additional Authentication Data (AAD)*/ 1611# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 1612# u8 *auth_tag, /* Authenticated Tag output. */ 1613# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 1614# Valid values are 16 (most likely), 12 or 8. */ 1615############################################################################### 1616ENTRY(aesni_gcm_dec_avx_gen2) 1617 GCM_ENC_DEC_AVX DEC 1618 ret 1619ENDPROC(aesni_gcm_dec_avx_gen2) 1620#endif /* CONFIG_AS_AVX */ 1621 1622#ifdef CONFIG_AS_AVX2 1623############################################################################### 1624# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) 1625# Input: A and B (128-bits each, bit-reflected) 1626# Output: C = A*B*x mod poly, (i.e. >>1 ) 1627# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1628# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 1629############################################################################### 1630.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 1631 1632 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 1633 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 1634 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 1635 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 1636 vpxor \T3, \GH, \GH 1637 1638 1639 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs 1640 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs 1641 1642 vpxor \T3, \T1, \T1 1643 vpxor \T2, \GH, \GH 1644 1645 ####################################################################### 1646 #first phase of the reduction 1647 vmovdqa POLY2(%rip), \T3 1648 1649 vpclmulqdq $0x01, \GH, \T3, \T2 1650 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs 1651 1652 vpxor \T2, \GH, \GH # first phase of the reduction complete 1653 ####################################################################### 1654 #second phase of the reduction 1655 vpclmulqdq $0x00, \GH, \T3, \T2 1656 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 1657 1658 vpclmulqdq $0x10, \GH, \T3, \GH 1659 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) 1660 1661 vpxor \T2, \GH, \GH # second phase of the reduction complete 1662 ####################################################################### 1663 vpxor \T1, \GH, \GH # the result is in GH 1664 1665 1666.endm 1667 1668.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 1669 1670 # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1671 vmovdqa \HK, \T5 1672 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly 1673 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly 1674 1675 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly 1676 vmovdqa \T5, HashKey_3(arg1) 1677 1678 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly 1679 vmovdqa \T5, HashKey_4(arg1) 1680 1681 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly 1682 vmovdqa \T5, HashKey_5(arg1) 1683 1684 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly 1685 vmovdqa \T5, HashKey_6(arg1) 1686 1687 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly 1688 vmovdqa \T5, HashKey_7(arg1) 1689 1690 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly 1691 vmovdqa \T5, HashKey_8(arg1) 1692 1693.endm 1694 1695 1696## if a = number of total plaintext bytes 1697## b = floor(a/16) 1698## num_initial_blocks = b mod 4# 1699## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext 1700## r10, r11, r12, rax are clobbered 1701## arg1, arg2, arg3, r14 are used as a pointer only, not modified 1702 1703.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER 1704 i = (8-\num_initial_blocks) 1705 j = 0 1706 setreg 1707 1708 mov arg6, %r10 # r10 = AAD 1709 mov arg7, %r12 # r12 = aadLen 1710 1711 1712 mov %r12, %r11 1713 1714 vpxor reg_j, reg_j, reg_j 1715 vpxor reg_i, reg_i, reg_i 1716 1717 cmp $16, %r11 1718 jl _get_AAD_rest8\@ 1719_get_AAD_blocks\@: 1720 vmovdqu (%r10), reg_i 1721 vpshufb SHUF_MASK(%rip), reg_i, reg_i 1722 vpxor reg_i, reg_j, reg_j 1723 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 1724 add $16, %r10 1725 sub $16, %r12 1726 sub $16, %r11 1727 cmp $16, %r11 1728 jge _get_AAD_blocks\@ 1729 vmovdqu reg_j, reg_i 1730 cmp $0, %r11 1731 je _get_AAD_done\@ 1732 1733 vpxor reg_i, reg_i, reg_i 1734 1735 /* read the last <16B of AAD. since we have at least 4B of 1736 data right after the AAD (the ICV, and maybe some CT), we can 1737 read 4B/8B blocks safely, and then get rid of the extra stuff */ 1738_get_AAD_rest8\@: 1739 cmp $4, %r11 1740 jle _get_AAD_rest4\@ 1741 movq (%r10), \T1 1742 add $8, %r10 1743 sub $8, %r11 1744 vpslldq $8, \T1, \T1 1745 vpsrldq $8, reg_i, reg_i 1746 vpxor \T1, reg_i, reg_i 1747 jmp _get_AAD_rest8\@ 1748_get_AAD_rest4\@: 1749 cmp $0, %r11 1750 jle _get_AAD_rest0\@ 1751 mov (%r10), %eax 1752 movq %rax, \T1 1753 add $4, %r10 1754 sub $4, %r11 1755 vpslldq $12, \T1, \T1 1756 vpsrldq $4, reg_i, reg_i 1757 vpxor \T1, reg_i, reg_i 1758_get_AAD_rest0\@: 1759 /* finalize: shift out the extra bytes we read, and align 1760 left. since pslldq can only shift by an immediate, we use 1761 vpshufb and an array of shuffle masks */ 1762 movq %r12, %r11 1763 salq $4, %r11 1764 movdqu aad_shift_arr(%r11), \T1 1765 vpshufb \T1, reg_i, reg_i 1766_get_AAD_rest_final\@: 1767 vpshufb SHUF_MASK(%rip), reg_i, reg_i 1768 vpxor reg_j, reg_i, reg_i 1769 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 1770 1771_get_AAD_done\@: 1772 # initialize the data pointer offset as zero 1773 xor %r11d, %r11d 1774 1775 # start AES for num_initial_blocks blocks 1776 mov arg5, %rax # rax = *Y0 1777 vmovdqu (%rax), \CTR # CTR = Y0 1778 vpshufb SHUF_MASK(%rip), \CTR, \CTR 1779 1780 1781 i = (9-\num_initial_blocks) 1782 setreg 1783.rep \num_initial_blocks 1784 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1785 vmovdqa \CTR, reg_i 1786 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap 1787 i = (i+1) 1788 setreg 1789.endr 1790 1791 vmovdqa (arg1), \T_key 1792 i = (9-\num_initial_blocks) 1793 setreg 1794.rep \num_initial_blocks 1795 vpxor \T_key, reg_i, reg_i 1796 i = (i+1) 1797 setreg 1798.endr 1799 1800 j = 1 1801 setreg 1802.rep 9 1803 vmovdqa 16*j(arg1), \T_key 1804 i = (9-\num_initial_blocks) 1805 setreg 1806.rep \num_initial_blocks 1807 vaesenc \T_key, reg_i, reg_i 1808 i = (i+1) 1809 setreg 1810.endr 1811 1812 j = (j+1) 1813 setreg 1814.endr 1815 1816 1817 vmovdqa 16*10(arg1), \T_key 1818 i = (9-\num_initial_blocks) 1819 setreg 1820.rep \num_initial_blocks 1821 vaesenclast \T_key, reg_i, reg_i 1822 i = (i+1) 1823 setreg 1824.endr 1825 1826 i = (9-\num_initial_blocks) 1827 setreg 1828.rep \num_initial_blocks 1829 vmovdqu (arg3, %r11), \T1 1830 vpxor \T1, reg_i, reg_i 1831 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for 1832 # num_initial_blocks blocks 1833 add $16, %r11 1834.if \ENC_DEC == DEC 1835 vmovdqa \T1, reg_i 1836.endif 1837 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 1838 i = (i+1) 1839 setreg 1840.endr 1841 1842 1843 i = (8-\num_initial_blocks) 1844 j = (9-\num_initial_blocks) 1845 setreg 1846 1847.rep \num_initial_blocks 1848 vpxor reg_i, reg_j, reg_j 1849 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks 1850 i = (i+1) 1851 j = (j+1) 1852 setreg 1853.endr 1854 # XMM8 has the combined result here 1855 1856 vmovdqa \XMM8, TMP1(%rsp) 1857 vmovdqa \XMM8, \T3 1858 1859 cmp $128, %r13 1860 jl _initial_blocks_done\@ # no need for precomputed constants 1861 1862############################################################################### 1863# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 1864 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1865 vmovdqa \CTR, \XMM1 1866 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1867 1868 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1869 vmovdqa \CTR, \XMM2 1870 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1871 1872 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1873 vmovdqa \CTR, \XMM3 1874 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1875 1876 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1877 vmovdqa \CTR, \XMM4 1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1879 1880 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1881 vmovdqa \CTR, \XMM5 1882 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1883 1884 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1885 vmovdqa \CTR, \XMM6 1886 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1887 1888 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1889 vmovdqa \CTR, \XMM7 1890 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 1891 1892 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 1893 vmovdqa \CTR, \XMM8 1894 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 1895 1896 vmovdqa (arg1), \T_key 1897 vpxor \T_key, \XMM1, \XMM1 1898 vpxor \T_key, \XMM2, \XMM2 1899 vpxor \T_key, \XMM3, \XMM3 1900 vpxor \T_key, \XMM4, \XMM4 1901 vpxor \T_key, \XMM5, \XMM5 1902 vpxor \T_key, \XMM6, \XMM6 1903 vpxor \T_key, \XMM7, \XMM7 1904 vpxor \T_key, \XMM8, \XMM8 1905 1906 i = 1 1907 setreg 1908.rep 9 # do 9 rounds 1909 vmovdqa 16*i(arg1), \T_key 1910 vaesenc \T_key, \XMM1, \XMM1 1911 vaesenc \T_key, \XMM2, \XMM2 1912 vaesenc \T_key, \XMM3, \XMM3 1913 vaesenc \T_key, \XMM4, \XMM4 1914 vaesenc \T_key, \XMM5, \XMM5 1915 vaesenc \T_key, \XMM6, \XMM6 1916 vaesenc \T_key, \XMM7, \XMM7 1917 vaesenc \T_key, \XMM8, \XMM8 1918 i = (i+1) 1919 setreg 1920.endr 1921 1922 1923 vmovdqa 16*i(arg1), \T_key 1924 vaesenclast \T_key, \XMM1, \XMM1 1925 vaesenclast \T_key, \XMM2, \XMM2 1926 vaesenclast \T_key, \XMM3, \XMM3 1927 vaesenclast \T_key, \XMM4, \XMM4 1928 vaesenclast \T_key, \XMM5, \XMM5 1929 vaesenclast \T_key, \XMM6, \XMM6 1930 vaesenclast \T_key, \XMM7, \XMM7 1931 vaesenclast \T_key, \XMM8, \XMM8 1932 1933 vmovdqu (arg3, %r11), \T1 1934 vpxor \T1, \XMM1, \XMM1 1935 vmovdqu \XMM1, (arg2 , %r11) 1936 .if \ENC_DEC == DEC 1937 vmovdqa \T1, \XMM1 1938 .endif 1939 1940 vmovdqu 16*1(arg3, %r11), \T1 1941 vpxor \T1, \XMM2, \XMM2 1942 vmovdqu \XMM2, 16*1(arg2 , %r11) 1943 .if \ENC_DEC == DEC 1944 vmovdqa \T1, \XMM2 1945 .endif 1946 1947 vmovdqu 16*2(arg3, %r11), \T1 1948 vpxor \T1, \XMM3, \XMM3 1949 vmovdqu \XMM3, 16*2(arg2 , %r11) 1950 .if \ENC_DEC == DEC 1951 vmovdqa \T1, \XMM3 1952 .endif 1953 1954 vmovdqu 16*3(arg3, %r11), \T1 1955 vpxor \T1, \XMM4, \XMM4 1956 vmovdqu \XMM4, 16*3(arg2 , %r11) 1957 .if \ENC_DEC == DEC 1958 vmovdqa \T1, \XMM4 1959 .endif 1960 1961 vmovdqu 16*4(arg3, %r11), \T1 1962 vpxor \T1, \XMM5, \XMM5 1963 vmovdqu \XMM5, 16*4(arg2 , %r11) 1964 .if \ENC_DEC == DEC 1965 vmovdqa \T1, \XMM5 1966 .endif 1967 1968 vmovdqu 16*5(arg3, %r11), \T1 1969 vpxor \T1, \XMM6, \XMM6 1970 vmovdqu \XMM6, 16*5(arg2 , %r11) 1971 .if \ENC_DEC == DEC 1972 vmovdqa \T1, \XMM6 1973 .endif 1974 1975 vmovdqu 16*6(arg3, %r11), \T1 1976 vpxor \T1, \XMM7, \XMM7 1977 vmovdqu \XMM7, 16*6(arg2 , %r11) 1978 .if \ENC_DEC == DEC 1979 vmovdqa \T1, \XMM7 1980 .endif 1981 1982 vmovdqu 16*7(arg3, %r11), \T1 1983 vpxor \T1, \XMM8, \XMM8 1984 vmovdqu \XMM8, 16*7(arg2 , %r11) 1985 .if \ENC_DEC == DEC 1986 vmovdqa \T1, \XMM8 1987 .endif 1988 1989 add $128, %r11 1990 1991 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 1992 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with 1993 # the corresponding ciphertext 1994 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 1995 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 1996 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 1997 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 1998 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 1999 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2000 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2001 2002############################################################################### 2003 2004_initial_blocks_done\@: 2005 2006 2007.endm 2008 2009 2010 2011# encrypt 8 blocks at a time 2012# ghash the 8 previously encrypted ciphertext blocks 2013# arg1, arg2, arg3 are used as pointers only, not modified 2014# r11 is the data offset value 2015.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC 2016 2017 vmovdqa \XMM1, \T2 2018 vmovdqa \XMM2, TMP2(%rsp) 2019 vmovdqa \XMM3, TMP3(%rsp) 2020 vmovdqa \XMM4, TMP4(%rsp) 2021 vmovdqa \XMM5, TMP5(%rsp) 2022 vmovdqa \XMM6, TMP6(%rsp) 2023 vmovdqa \XMM7, TMP7(%rsp) 2024 vmovdqa \XMM8, TMP8(%rsp) 2025 2026.if \loop_idx == in_order 2027 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT 2028 vpaddd ONE(%rip), \XMM1, \XMM2 2029 vpaddd ONE(%rip), \XMM2, \XMM3 2030 vpaddd ONE(%rip), \XMM3, \XMM4 2031 vpaddd ONE(%rip), \XMM4, \XMM5 2032 vpaddd ONE(%rip), \XMM5, \XMM6 2033 vpaddd ONE(%rip), \XMM6, \XMM7 2034 vpaddd ONE(%rip), \XMM7, \XMM8 2035 vmovdqa \XMM8, \CTR 2036 2037 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2038 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2039 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2040 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2041 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2042 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2043 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2044 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2045.else 2046 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT 2047 vpaddd ONEf(%rip), \XMM1, \XMM2 2048 vpaddd ONEf(%rip), \XMM2, \XMM3 2049 vpaddd ONEf(%rip), \XMM3, \XMM4 2050 vpaddd ONEf(%rip), \XMM4, \XMM5 2051 vpaddd ONEf(%rip), \XMM5, \XMM6 2052 vpaddd ONEf(%rip), \XMM6, \XMM7 2053 vpaddd ONEf(%rip), \XMM7, \XMM8 2054 vmovdqa \XMM8, \CTR 2055.endif 2056 2057 2058 ####################################################################### 2059 2060 vmovdqu (arg1), \T1 2061 vpxor \T1, \XMM1, \XMM1 2062 vpxor \T1, \XMM2, \XMM2 2063 vpxor \T1, \XMM3, \XMM3 2064 vpxor \T1, \XMM4, \XMM4 2065 vpxor \T1, \XMM5, \XMM5 2066 vpxor \T1, \XMM6, \XMM6 2067 vpxor \T1, \XMM7, \XMM7 2068 vpxor \T1, \XMM8, \XMM8 2069 2070 ####################################################################### 2071 2072 2073 2074 2075 2076 vmovdqu 16*1(arg1), \T1 2077 vaesenc \T1, \XMM1, \XMM1 2078 vaesenc \T1, \XMM2, \XMM2 2079 vaesenc \T1, \XMM3, \XMM3 2080 vaesenc \T1, \XMM4, \XMM4 2081 vaesenc \T1, \XMM5, \XMM5 2082 vaesenc \T1, \XMM6, \XMM6 2083 vaesenc \T1, \XMM7, \XMM7 2084 vaesenc \T1, \XMM8, \XMM8 2085 2086 vmovdqu 16*2(arg1), \T1 2087 vaesenc \T1, \XMM1, \XMM1 2088 vaesenc \T1, \XMM2, \XMM2 2089 vaesenc \T1, \XMM3, \XMM3 2090 vaesenc \T1, \XMM4, \XMM4 2091 vaesenc \T1, \XMM5, \XMM5 2092 vaesenc \T1, \XMM6, \XMM6 2093 vaesenc \T1, \XMM7, \XMM7 2094 vaesenc \T1, \XMM8, \XMM8 2095 2096 2097 ####################################################################### 2098 2099 vmovdqa HashKey_8(arg1), \T5 2100 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 2101 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 2102 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 2103 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 2104 vpxor \T5, \T6, \T6 2105 2106 vmovdqu 16*3(arg1), \T1 2107 vaesenc \T1, \XMM1, \XMM1 2108 vaesenc \T1, \XMM2, \XMM2 2109 vaesenc \T1, \XMM3, \XMM3 2110 vaesenc \T1, \XMM4, \XMM4 2111 vaesenc \T1, \XMM5, \XMM5 2112 vaesenc \T1, \XMM6, \XMM6 2113 vaesenc \T1, \XMM7, \XMM7 2114 vaesenc \T1, \XMM8, \XMM8 2115 2116 vmovdqa TMP2(%rsp), \T1 2117 vmovdqa HashKey_7(arg1), \T5 2118 vpclmulqdq $0x11, \T5, \T1, \T3 2119 vpxor \T3, \T4, \T4 2120 2121 vpclmulqdq $0x00, \T5, \T1, \T3 2122 vpxor \T3, \T7, \T7 2123 2124 vpclmulqdq $0x01, \T5, \T1, \T3 2125 vpxor \T3, \T6, \T6 2126 2127 vpclmulqdq $0x10, \T5, \T1, \T3 2128 vpxor \T3, \T6, \T6 2129 2130 vmovdqu 16*4(arg1), \T1 2131 vaesenc \T1, \XMM1, \XMM1 2132 vaesenc \T1, \XMM2, \XMM2 2133 vaesenc \T1, \XMM3, \XMM3 2134 vaesenc \T1, \XMM4, \XMM4 2135 vaesenc \T1, \XMM5, \XMM5 2136 vaesenc \T1, \XMM6, \XMM6 2137 vaesenc \T1, \XMM7, \XMM7 2138 vaesenc \T1, \XMM8, \XMM8 2139 2140 ####################################################################### 2141 2142 vmovdqa TMP3(%rsp), \T1 2143 vmovdqa HashKey_6(arg1), \T5 2144 vpclmulqdq $0x11, \T5, \T1, \T3 2145 vpxor \T3, \T4, \T4 2146 2147 vpclmulqdq $0x00, \T5, \T1, \T3 2148 vpxor \T3, \T7, \T7 2149 2150 vpclmulqdq $0x01, \T5, \T1, \T3 2151 vpxor \T3, \T6, \T6 2152 2153 vpclmulqdq $0x10, \T5, \T1, \T3 2154 vpxor \T3, \T6, \T6 2155 2156 vmovdqu 16*5(arg1), \T1 2157 vaesenc \T1, \XMM1, \XMM1 2158 vaesenc \T1, \XMM2, \XMM2 2159 vaesenc \T1, \XMM3, \XMM3 2160 vaesenc \T1, \XMM4, \XMM4 2161 vaesenc \T1, \XMM5, \XMM5 2162 vaesenc \T1, \XMM6, \XMM6 2163 vaesenc \T1, \XMM7, \XMM7 2164 vaesenc \T1, \XMM8, \XMM8 2165 2166 vmovdqa TMP4(%rsp), \T1 2167 vmovdqa HashKey_5(arg1), \T5 2168 vpclmulqdq $0x11, \T5, \T1, \T3 2169 vpxor \T3, \T4, \T4 2170 2171 vpclmulqdq $0x00, \T5, \T1, \T3 2172 vpxor \T3, \T7, \T7 2173 2174 vpclmulqdq $0x01, \T5, \T1, \T3 2175 vpxor \T3, \T6, \T6 2176 2177 vpclmulqdq $0x10, \T5, \T1, \T3 2178 vpxor \T3, \T6, \T6 2179 2180 vmovdqu 16*6(arg1), \T1 2181 vaesenc \T1, \XMM1, \XMM1 2182 vaesenc \T1, \XMM2, \XMM2 2183 vaesenc \T1, \XMM3, \XMM3 2184 vaesenc \T1, \XMM4, \XMM4 2185 vaesenc \T1, \XMM5, \XMM5 2186 vaesenc \T1, \XMM6, \XMM6 2187 vaesenc \T1, \XMM7, \XMM7 2188 vaesenc \T1, \XMM8, \XMM8 2189 2190 2191 vmovdqa TMP5(%rsp), \T1 2192 vmovdqa HashKey_4(arg1), \T5 2193 vpclmulqdq $0x11, \T5, \T1, \T3 2194 vpxor \T3, \T4, \T4 2195 2196 vpclmulqdq $0x00, \T5, \T1, \T3 2197 vpxor \T3, \T7, \T7 2198 2199 vpclmulqdq $0x01, \T5, \T1, \T3 2200 vpxor \T3, \T6, \T6 2201 2202 vpclmulqdq $0x10, \T5, \T1, \T3 2203 vpxor \T3, \T6, \T6 2204 2205 vmovdqu 16*7(arg1), \T1 2206 vaesenc \T1, \XMM1, \XMM1 2207 vaesenc \T1, \XMM2, \XMM2 2208 vaesenc \T1, \XMM3, \XMM3 2209 vaesenc \T1, \XMM4, \XMM4 2210 vaesenc \T1, \XMM5, \XMM5 2211 vaesenc \T1, \XMM6, \XMM6 2212 vaesenc \T1, \XMM7, \XMM7 2213 vaesenc \T1, \XMM8, \XMM8 2214 2215 vmovdqa TMP6(%rsp), \T1 2216 vmovdqa HashKey_3(arg1), \T5 2217 vpclmulqdq $0x11, \T5, \T1, \T3 2218 vpxor \T3, \T4, \T4 2219 2220 vpclmulqdq $0x00, \T5, \T1, \T3 2221 vpxor \T3, \T7, \T7 2222 2223 vpclmulqdq $0x01, \T5, \T1, \T3 2224 vpxor \T3, \T6, \T6 2225 2226 vpclmulqdq $0x10, \T5, \T1, \T3 2227 vpxor \T3, \T6, \T6 2228 2229 vmovdqu 16*8(arg1), \T1 2230 vaesenc \T1, \XMM1, \XMM1 2231 vaesenc \T1, \XMM2, \XMM2 2232 vaesenc \T1, \XMM3, \XMM3 2233 vaesenc \T1, \XMM4, \XMM4 2234 vaesenc \T1, \XMM5, \XMM5 2235 vaesenc \T1, \XMM6, \XMM6 2236 vaesenc \T1, \XMM7, \XMM7 2237 vaesenc \T1, \XMM8, \XMM8 2238 2239 vmovdqa TMP7(%rsp), \T1 2240 vmovdqa HashKey_2(arg1), \T5 2241 vpclmulqdq $0x11, \T5, \T1, \T3 2242 vpxor \T3, \T4, \T4 2243 2244 vpclmulqdq $0x00, \T5, \T1, \T3 2245 vpxor \T3, \T7, \T7 2246 2247 vpclmulqdq $0x01, \T5, \T1, \T3 2248 vpxor \T3, \T6, \T6 2249 2250 vpclmulqdq $0x10, \T5, \T1, \T3 2251 vpxor \T3, \T6, \T6 2252 2253 2254 ####################################################################### 2255 2256 vmovdqu 16*9(arg1), \T5 2257 vaesenc \T5, \XMM1, \XMM1 2258 vaesenc \T5, \XMM2, \XMM2 2259 vaesenc \T5, \XMM3, \XMM3 2260 vaesenc \T5, \XMM4, \XMM4 2261 vaesenc \T5, \XMM5, \XMM5 2262 vaesenc \T5, \XMM6, \XMM6 2263 vaesenc \T5, \XMM7, \XMM7 2264 vaesenc \T5, \XMM8, \XMM8 2265 2266 vmovdqa TMP8(%rsp), \T1 2267 vmovdqa HashKey(arg1), \T5 2268 2269 vpclmulqdq $0x00, \T5, \T1, \T3 2270 vpxor \T3, \T7, \T7 2271 2272 vpclmulqdq $0x01, \T5, \T1, \T3 2273 vpxor \T3, \T6, \T6 2274 2275 vpclmulqdq $0x10, \T5, \T1, \T3 2276 vpxor \T3, \T6, \T6 2277 2278 vpclmulqdq $0x11, \T5, \T1, \T3 2279 vpxor \T3, \T4, \T1 2280 2281 2282 vmovdqu 16*10(arg1), \T5 2283 2284 i = 0 2285 j = 1 2286 setreg 2287.rep 8 2288 vpxor 16*i(arg3, %r11), \T5, \T2 2289 .if \ENC_DEC == ENC 2290 vaesenclast \T2, reg_j, reg_j 2291 .else 2292 vaesenclast \T2, reg_j, \T3 2293 vmovdqu 16*i(arg3, %r11), reg_j 2294 vmovdqu \T3, 16*i(arg2, %r11) 2295 .endif 2296 i = (i+1) 2297 j = (j+1) 2298 setreg 2299.endr 2300 ####################################################################### 2301 2302 2303 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs 2304 vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs 2305 vpxor \T3, \T7, \T7 2306 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 2307 2308 2309 2310 ####################################################################### 2311 #first phase of the reduction 2312 vmovdqa POLY2(%rip), \T3 2313 2314 vpclmulqdq $0x01, \T7, \T3, \T2 2315 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2316 2317 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2318 ####################################################################### 2319 .if \ENC_DEC == ENC 2320 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer 2321 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer 2322 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer 2323 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer 2324 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer 2325 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer 2326 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer 2327 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer 2328 .endif 2329 2330 ####################################################################### 2331 #second phase of the reduction 2332 vpclmulqdq $0x00, \T7, \T3, \T2 2333 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2334 2335 vpclmulqdq $0x10, \T7, \T3, \T4 2336 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) 2337 2338 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2339 ####################################################################### 2340 vpxor \T4, \T1, \T1 # the result is in T1 2341 2342 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap 2343 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap 2344 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap 2345 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap 2346 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap 2347 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap 2348 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap 2349 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap 2350 2351 2352 vpxor \T1, \XMM1, \XMM1 2353 2354 2355 2356.endm 2357 2358 2359# GHASH the last 4 ciphertext blocks. 2360.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 2361 2362 ## Karatsuba Method 2363 2364 vmovdqa HashKey_8(arg1), \T5 2365 2366 vpshufd $0b01001110, \XMM1, \T2 2367 vpshufd $0b01001110, \T5, \T3 2368 vpxor \XMM1, \T2, \T2 2369 vpxor \T5, \T3, \T3 2370 2371 vpclmulqdq $0x11, \T5, \XMM1, \T6 2372 vpclmulqdq $0x00, \T5, \XMM1, \T7 2373 2374 vpclmulqdq $0x00, \T3, \T2, \XMM1 2375 2376 ###################### 2377 2378 vmovdqa HashKey_7(arg1), \T5 2379 vpshufd $0b01001110, \XMM2, \T2 2380 vpshufd $0b01001110, \T5, \T3 2381 vpxor \XMM2, \T2, \T2 2382 vpxor \T5, \T3, \T3 2383 2384 vpclmulqdq $0x11, \T5, \XMM2, \T4 2385 vpxor \T4, \T6, \T6 2386 2387 vpclmulqdq $0x00, \T5, \XMM2, \T4 2388 vpxor \T4, \T7, \T7 2389 2390 vpclmulqdq $0x00, \T3, \T2, \T2 2391 2392 vpxor \T2, \XMM1, \XMM1 2393 2394 ###################### 2395 2396 vmovdqa HashKey_6(arg1), \T5 2397 vpshufd $0b01001110, \XMM3, \T2 2398 vpshufd $0b01001110, \T5, \T3 2399 vpxor \XMM3, \T2, \T2 2400 vpxor \T5, \T3, \T3 2401 2402 vpclmulqdq $0x11, \T5, \XMM3, \T4 2403 vpxor \T4, \T6, \T6 2404 2405 vpclmulqdq $0x00, \T5, \XMM3, \T4 2406 vpxor \T4, \T7, \T7 2407 2408 vpclmulqdq $0x00, \T3, \T2, \T2 2409 2410 vpxor \T2, \XMM1, \XMM1 2411 2412 ###################### 2413 2414 vmovdqa HashKey_5(arg1), \T5 2415 vpshufd $0b01001110, \XMM4, \T2 2416 vpshufd $0b01001110, \T5, \T3 2417 vpxor \XMM4, \T2, \T2 2418 vpxor \T5, \T3, \T3 2419 2420 vpclmulqdq $0x11, \T5, \XMM4, \T4 2421 vpxor \T4, \T6, \T6 2422 2423 vpclmulqdq $0x00, \T5, \XMM4, \T4 2424 vpxor \T4, \T7, \T7 2425 2426 vpclmulqdq $0x00, \T3, \T2, \T2 2427 2428 vpxor \T2, \XMM1, \XMM1 2429 2430 ###################### 2431 2432 vmovdqa HashKey_4(arg1), \T5 2433 vpshufd $0b01001110, \XMM5, \T2 2434 vpshufd $0b01001110, \T5, \T3 2435 vpxor \XMM5, \T2, \T2 2436 vpxor \T5, \T3, \T3 2437 2438 vpclmulqdq $0x11, \T5, \XMM5, \T4 2439 vpxor \T4, \T6, \T6 2440 2441 vpclmulqdq $0x00, \T5, \XMM5, \T4 2442 vpxor \T4, \T7, \T7 2443 2444 vpclmulqdq $0x00, \T3, \T2, \T2 2445 2446 vpxor \T2, \XMM1, \XMM1 2447 2448 ###################### 2449 2450 vmovdqa HashKey_3(arg1), \T5 2451 vpshufd $0b01001110, \XMM6, \T2 2452 vpshufd $0b01001110, \T5, \T3 2453 vpxor \XMM6, \T2, \T2 2454 vpxor \T5, \T3, \T3 2455 2456 vpclmulqdq $0x11, \T5, \XMM6, \T4 2457 vpxor \T4, \T6, \T6 2458 2459 vpclmulqdq $0x00, \T5, \XMM6, \T4 2460 vpxor \T4, \T7, \T7 2461 2462 vpclmulqdq $0x00, \T3, \T2, \T2 2463 2464 vpxor \T2, \XMM1, \XMM1 2465 2466 ###################### 2467 2468 vmovdqa HashKey_2(arg1), \T5 2469 vpshufd $0b01001110, \XMM7, \T2 2470 vpshufd $0b01001110, \T5, \T3 2471 vpxor \XMM7, \T2, \T2 2472 vpxor \T5, \T3, \T3 2473 2474 vpclmulqdq $0x11, \T5, \XMM7, \T4 2475 vpxor \T4, \T6, \T6 2476 2477 vpclmulqdq $0x00, \T5, \XMM7, \T4 2478 vpxor \T4, \T7, \T7 2479 2480 vpclmulqdq $0x00, \T3, \T2, \T2 2481 2482 vpxor \T2, \XMM1, \XMM1 2483 2484 ###################### 2485 2486 vmovdqa HashKey(arg1), \T5 2487 vpshufd $0b01001110, \XMM8, \T2 2488 vpshufd $0b01001110, \T5, \T3 2489 vpxor \XMM8, \T2, \T2 2490 vpxor \T5, \T3, \T3 2491 2492 vpclmulqdq $0x11, \T5, \XMM8, \T4 2493 vpxor \T4, \T6, \T6 2494 2495 vpclmulqdq $0x00, \T5, \XMM8, \T4 2496 vpxor \T4, \T7, \T7 2497 2498 vpclmulqdq $0x00, \T3, \T2, \T2 2499 2500 vpxor \T2, \XMM1, \XMM1 2501 vpxor \T6, \XMM1, \XMM1 2502 vpxor \T7, \XMM1, \T2 2503 2504 2505 2506 2507 vpslldq $8, \T2, \T4 2508 vpsrldq $8, \T2, \T2 2509 2510 vpxor \T4, \T7, \T7 2511 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the 2512 # accumulated carry-less multiplications 2513 2514 ####################################################################### 2515 #first phase of the reduction 2516 vmovdqa POLY2(%rip), \T3 2517 2518 vpclmulqdq $0x01, \T7, \T3, \T2 2519 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 2520 2521 vpxor \T2, \T7, \T7 # first phase of the reduction complete 2522 ####################################################################### 2523 2524 2525 #second phase of the reduction 2526 vpclmulqdq $0x00, \T7, \T3, \T2 2527 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) 2528 2529 vpclmulqdq $0x10, \T7, \T3, \T4 2530 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) 2531 2532 vpxor \T2, \T4, \T4 # second phase of the reduction complete 2533 ####################################################################### 2534 vpxor \T4, \T6, \T6 # the result is in T6 2535.endm 2536 2537 2538 2539# combined for GCM encrypt and decrypt functions 2540# clobbering all xmm registers 2541# clobbering r10, r11, r12, r13, r14, r15 2542.macro GCM_ENC_DEC_AVX2 ENC_DEC 2543 2544 #the number of pushes must equal STACK_OFFSET 2545 push %r12 2546 push %r13 2547 push %r14 2548 push %r15 2549 2550 mov %rsp, %r14 2551 2552 2553 2554 2555 sub $VARIABLE_OFFSET, %rsp 2556 and $~63, %rsp # align rsp to 64 bytes 2557 2558 2559 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey 2560 2561 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext 2562 and $-16, %r13 # r13 = r13 - (r13 mod 16) 2563 2564 mov %r13, %r12 2565 shr $4, %r12 2566 and $7, %r12 2567 jz _initial_num_blocks_is_0\@ 2568 2569 cmp $7, %r12 2570 je _initial_num_blocks_is_7\@ 2571 cmp $6, %r12 2572 je _initial_num_blocks_is_6\@ 2573 cmp $5, %r12 2574 je _initial_num_blocks_is_5\@ 2575 cmp $4, %r12 2576 je _initial_num_blocks_is_4\@ 2577 cmp $3, %r12 2578 je _initial_num_blocks_is_3\@ 2579 cmp $2, %r12 2580 je _initial_num_blocks_is_2\@ 2581 2582 jmp _initial_num_blocks_is_1\@ 2583 2584_initial_num_blocks_is_7\@: 2585 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2586 sub $16*7, %r13 2587 jmp _initial_blocks_encrypted\@ 2588 2589_initial_num_blocks_is_6\@: 2590 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2591 sub $16*6, %r13 2592 jmp _initial_blocks_encrypted\@ 2593 2594_initial_num_blocks_is_5\@: 2595 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2596 sub $16*5, %r13 2597 jmp _initial_blocks_encrypted\@ 2598 2599_initial_num_blocks_is_4\@: 2600 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2601 sub $16*4, %r13 2602 jmp _initial_blocks_encrypted\@ 2603 2604_initial_num_blocks_is_3\@: 2605 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2606 sub $16*3, %r13 2607 jmp _initial_blocks_encrypted\@ 2608 2609_initial_num_blocks_is_2\@: 2610 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2611 sub $16*2, %r13 2612 jmp _initial_blocks_encrypted\@ 2613 2614_initial_num_blocks_is_1\@: 2615 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2616 sub $16*1, %r13 2617 jmp _initial_blocks_encrypted\@ 2618 2619_initial_num_blocks_is_0\@: 2620 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC 2621 2622 2623_initial_blocks_encrypted\@: 2624 cmp $0, %r13 2625 je _zero_cipher_left\@ 2626 2627 sub $128, %r13 2628 je _eight_cipher_left\@ 2629 2630 2631 2632 2633 vmovd %xmm9, %r15d 2634 and $255, %r15d 2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2636 2637 2638_encrypt_by_8_new\@: 2639 cmp $(255-8), %r15d 2640 jg _encrypt_by_8\@ 2641 2642 2643 2644 add $8, %r15b 2645 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC 2646 add $128, %r11 2647 sub $128, %r13 2648 jne _encrypt_by_8_new\@ 2649 2650 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2651 jmp _eight_cipher_left\@ 2652 2653_encrypt_by_8\@: 2654 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2655 add $8, %r15b 2656 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC 2657 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2658 add $128, %r11 2659 sub $128, %r13 2660 jne _encrypt_by_8_new\@ 2661 2662 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2663 2664 2665 2666 2667_eight_cipher_left\@: 2668 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 2669 2670 2671_zero_cipher_left\@: 2672 cmp $16, arg4 2673 jl _only_less_than_16\@ 2674 2675 mov arg4, %r13 2676 and $15, %r13 # r13 = (arg4 mod 16) 2677 2678 je _multiple_of_16_bytes\@ 2679 2680 # handle the last <16 Byte block seperately 2681 2682 2683 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 2684 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2685 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 2686 2687 sub $16, %r11 2688 add %r13, %r11 2689 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block 2690 2691 lea SHIFT_MASK+16(%rip), %r12 2692 sub %r13, %r12 # adjust the shuffle mask pointer 2693 # to be able to shift 16-r13 bytes 2694 # (r13 is the number of bytes in plaintext mod 16) 2695 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask 2696 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes 2697 jmp _final_ghash_mul\@ 2698 2699_only_less_than_16\@: 2700 # check for 0 length 2701 mov arg4, %r13 2702 and $15, %r13 # r13 = (arg4 mod 16) 2703 2704 je _multiple_of_16_bytes\@ 2705 2706 # handle the last <16 Byte block seperately 2707 2708 2709 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn 2710 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2711 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) 2712 2713 2714 lea SHIFT_MASK+16(%rip), %r12 2715 sub %r13, %r12 # adjust the shuffle mask pointer to be 2716 # able to shift 16-r13 bytes (r13 is the 2717 # number of bytes in plaintext mod 16) 2718 2719_get_last_16_byte_loop\@: 2720 movb (arg3, %r11), %al 2721 movb %al, TMP1 (%rsp , %r11) 2722 add $1, %r11 2723 cmp %r13, %r11 2724 jne _get_last_16_byte_loop\@ 2725 2726 vmovdqu TMP1(%rsp), %xmm1 2727 2728 sub $16, %r11 2729 2730_final_ghash_mul\@: 2731 .if \ENC_DEC == DEC 2732 vmovdqa %xmm1, %xmm2 2733 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 2734 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 2735 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 2736 vpand %xmm1, %xmm2, %xmm2 2737 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 2738 vpxor %xmm2, %xmm14, %xmm14 2739 #GHASH computation for the last <16 Byte block 2740 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 2741 sub %r13, %r11 2742 add $16, %r11 2743 .else 2744 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) 2745 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 2746 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 2747 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 2748 vpxor %xmm9, %xmm14, %xmm14 2749 #GHASH computation for the last <16 Byte block 2750 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 2751 sub %r13, %r11 2752 add $16, %r11 2753 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext 2754 .endif 2755 2756 2757 ############################# 2758 # output r13 Bytes 2759 vmovq %xmm9, %rax 2760 cmp $8, %r13 2761 jle _less_than_8_bytes_left\@ 2762 2763 mov %rax, (arg2 , %r11) 2764 add $8, %r11 2765 vpsrldq $8, %xmm9, %xmm9 2766 vmovq %xmm9, %rax 2767 sub $8, %r13 2768 2769_less_than_8_bytes_left\@: 2770 movb %al, (arg2 , %r11) 2771 add $1, %r11 2772 shr $8, %rax 2773 sub $1, %r13 2774 jne _less_than_8_bytes_left\@ 2775 ############################# 2776 2777_multiple_of_16_bytes\@: 2778 mov arg7, %r12 # r12 = aadLen (number of bytes) 2779 shl $3, %r12 # convert into number of bits 2780 vmovd %r12d, %xmm15 # len(A) in xmm15 2781 2782 shl $3, arg4 # len(C) in bits (*128) 2783 vmovq arg4, %xmm1 2784 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 2785 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) 2786 2787 vpxor %xmm15, %xmm14, %xmm14 2788 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation 2789 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap 2790 2791 mov arg5, %rax # rax = *Y0 2792 vmovdqu (%rax), %xmm9 # xmm9 = Y0 2793 2794 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) 2795 2796 vpxor %xmm14, %xmm9, %xmm9 2797 2798 2799 2800_return_T\@: 2801 mov arg8, %r10 # r10 = authTag 2802 mov arg9, %r11 # r11 = auth_tag_len 2803 2804 cmp $16, %r11 2805 je _T_16\@ 2806 2807 cmp $8, %r11 2808 jl _T_4\@ 2809 2810_T_8\@: 2811 vmovq %xmm9, %rax 2812 mov %rax, (%r10) 2813 add $8, %r10 2814 sub $8, %r11 2815 vpsrldq $8, %xmm9, %xmm9 2816 cmp $0, %r11 2817 je _return_T_done\@ 2818_T_4\@: 2819 vmovd %xmm9, %eax 2820 mov %eax, (%r10) 2821 add $4, %r10 2822 sub $4, %r11 2823 vpsrldq $4, %xmm9, %xmm9 2824 cmp $0, %r11 2825 je _return_T_done\@ 2826_T_123\@: 2827 vmovd %xmm9, %eax 2828 cmp $2, %r11 2829 jl _T_1\@ 2830 mov %ax, (%r10) 2831 cmp $2, %r11 2832 je _return_T_done\@ 2833 add $2, %r10 2834 sar $16, %eax 2835_T_1\@: 2836 mov %al, (%r10) 2837 jmp _return_T_done\@ 2838 2839_T_16\@: 2840 vmovdqu %xmm9, (%r10) 2841 2842_return_T_done\@: 2843 mov %r14, %rsp 2844 2845 pop %r15 2846 pop %r14 2847 pop %r13 2848 pop %r12 2849.endm 2850 2851 2852############################################################# 2853#void aesni_gcm_precomp_avx_gen4 2854# (gcm_data *my_ctx_data, 2855# u8 *hash_subkey)# /* H, the Hash sub key input. 2856# Data starts on a 16-byte boundary. */ 2857############################################################# 2858ENTRY(aesni_gcm_precomp_avx_gen4) 2859 #the number of pushes must equal STACK_OFFSET 2860 push %r12 2861 push %r13 2862 push %r14 2863 push %r15 2864 2865 mov %rsp, %r14 2866 2867 2868 2869 sub $VARIABLE_OFFSET, %rsp 2870 and $~63, %rsp # align rsp to 64 bytes 2871 2872 vmovdqu (arg2), %xmm6 # xmm6 = HashKey 2873 2874 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 2875 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey 2876 vmovdqa %xmm6, %xmm2 2877 vpsllq $1, %xmm6, %xmm6 2878 vpsrlq $63, %xmm2, %xmm2 2879 vmovdqa %xmm2, %xmm1 2880 vpslldq $8, %xmm2, %xmm2 2881 vpsrldq $8, %xmm1, %xmm1 2882 vpor %xmm2, %xmm6, %xmm6 2883 #reduction 2884 vpshufd $0b00100100, %xmm1, %xmm2 2885 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 2886 vpand POLY(%rip), %xmm2, %xmm2 2887 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly 2888 ####################################################################### 2889 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly 2890 2891 2892 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 2893 2894 mov %r14, %rsp 2895 2896 pop %r15 2897 pop %r14 2898 pop %r13 2899 pop %r12 2900 ret 2901ENDPROC(aesni_gcm_precomp_avx_gen4) 2902 2903 2904############################################################################### 2905#void aesni_gcm_enc_avx_gen4( 2906# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2907# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ 2908# const u8 *in, /* Plaintext input */ 2909# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 2910# u8 *iv, /* Pre-counter block j0: 4 byte salt 2911# (from Security Association) concatenated with 8 byte 2912# Initialisation Vector (from IPSec ESP Payload) 2913# concatenated with 0x00000001. 16-byte aligned pointer. */ 2914# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2915# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2916# u8 *auth_tag, /* Authenticated Tag output. */ 2917# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2918# Valid values are 16 (most likely), 12 or 8. */ 2919############################################################################### 2920ENTRY(aesni_gcm_enc_avx_gen4) 2921 GCM_ENC_DEC_AVX2 ENC 2922 ret 2923ENDPROC(aesni_gcm_enc_avx_gen4) 2924 2925############################################################################### 2926#void aesni_gcm_dec_avx_gen4( 2927# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ 2928# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ 2929# const u8 *in, /* Ciphertext input */ 2930# u64 plaintext_len, /* Length of data in Bytes for encryption. */ 2931# u8 *iv, /* Pre-counter block j0: 4 byte salt 2932# (from Security Association) concatenated with 8 byte 2933# Initialisation Vector (from IPSec ESP Payload) 2934# concatenated with 0x00000001. 16-byte aligned pointer. */ 2935# const u8 *aad, /* Additional Authentication Data (AAD)*/ 2936# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ 2937# u8 *auth_tag, /* Authenticated Tag output. */ 2938# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 2939# Valid values are 16 (most likely), 12 or 8. */ 2940############################################################################### 2941ENTRY(aesni_gcm_dec_avx_gen4) 2942 GCM_ENC_DEC_AVX2 DEC 2943 ret 2944ENDPROC(aesni_gcm_dec_avx_gen4) 2945 2946#endif /* CONFIG_AS_AVX2 */ 2947