/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute oct SHA1 using AVX2
## outer calling routine takes care of save and restore of XMM registers

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers ymm0-15

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
        # process top half (r0..r3) {a...d}
        vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
        vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
        vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
        vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
        vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
        vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
        vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
        vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

        # use r2 in place of t0
        # process bottom half (r4..r7) {e...h}
        vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
        vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
        vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
        vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
        vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
        vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
        vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
        vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

        vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
        vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
        vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
        vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
        vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
        vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
        vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
        vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0

.endm
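
# TRANSPOSE8 above converts the eight loaded rows from one register per
# input stream into one register per word index, so that each ymm holds
# the same message word across all 8 lanes and a single AVX2 instruction
# advances all 8 SHA1 computations in lockstep.  Per-dword reference of
# the 8x8 transpose (illustration only; in[j][i] is dword i of stream j):
#
#       for (i = 0; i < 8; i++)
#               for (j = 0; j < 8; j++)
#                       out[i][j] = in[j][i];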
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
        vpxor  \regD, \regC, \regF
        vpand  \regB, \regF, \regF
        vpxor  \regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
        vpxor  \regC, \regD, \regF
        vpxor  \regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
        vpor   \regC, \regB, \regF
        vpand  \regC, \regB, \regT
        vpand  \regD, \regF, \regF
        vpor   \regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
        MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
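
# MAGIC_F0 relies on the bitwise-select identity
#       (B & C) | (~B & D) == D ^ (B & (C ^ D))
# so the rounds 0-19 function of FIPS 180-1 needs no separate NOT.
# Per-lane C reference of the four round functions (illustration only):
#
#       f0 = (b & c) | (~b & d);                /* rounds  0-19 */
#       f1 = b ^ c ^ d;                         /* rounds 20-39 */
#       f2 = (b & c) | (b & d) | (c & d);       /* rounds 40-59 */
#       f3 = b ^ c ^ d;                         /* rounds 60-79 */
#
# The PROLD/PROLD_nd macros below synthesize a 32-bit rotate-left from
# two shifts and an OR, since AVX2 has no packed rotate instruction
# (vprold only arrived with AVX-512).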
# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
        vpsrld  $(32-\imm), \reg, \tmp
        vpslld  $\imm, \reg, \reg
        vpor    \tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
        vpsrld  $(32-\imm), \src, \tmp
        vpslld  $\imm, \src, \reg
        vpor    \tmp, \reg, \reg
.endm

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE
        vpaddd  \memW*32(%rsp), \regE, \regE
        PROLD_nd        \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC  \regF, \regB, \regC, \regD, \regT
        PROLD   \regB, 30, \regT
        vpaddd  \regF, \regE, \regE
.endm
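
# SHA1_STEP_16_79 performs the same round computation as above, but
# first extends the message schedule.  Per FIPS 180-1, for round i >= 16:
#
#       W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
#
# Only the most recent 16 schedule entries are live at any time, so they
# are kept in a 16-entry circular buffer on the stack (hence the "& 15"
# offset arithmetic), one 32-byte row (8 lanes x 4 bytes) per entry.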
.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE
        offset = ((\memW - 14) & 15) * 32
        vmovdqu offset(%rsp), W14
        vpxor   W14, W16, W16
        offset = ((\memW -  8) & 15) * 32
        vpxor   offset(%rsp), W16, W16
        offset = ((\memW -  3) & 15) * 32
        vpxor   offset(%rsp), W16, W16
        vpsrld  $(32-1), W16, \regF
        vpslld  $1, W16, W16
        vpor    W16, \regF, \regF

        ROTATE_W

        offset = ((\memW - 0) & 15) * 32
        vmovdqu \regF, offset(%rsp)
        vpaddd  \regF, \regE, \regE
        PROLD_nd        \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC  \regF,\regB,\regC,\regD,\regT   ## FUN = MAGIC_Fi(B,C,D)
        PROLD   \regB, 30, \regT
        vpaddd  \regF, \regE, \regE
.endm

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE

#define VMOVPS  vmovups

IDX = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# ymm0  A
# ymm1  B
# ymm2  C
# ymm3  D
# ymm4  E
# ymm5  F       AA
# ymm6  T0      BB
# ymm7  T1      CC
# ymm8  T2      DD
# ymm9  T3      EE
# ymm10 T4      TMP
# ymm11 T5      FUN
# ymm12 T6      K
# ymm13 T7      W14
# ymm14 T8      W15
# ymm15 T9      W16

A  = %ymm0
B  = %ymm1
C  = %ymm2
D  = %ymm3
E  = %ymm4
F  = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16  = W15
W15  = W14
W14  = TMP_
.endm

# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : pointer to array[8] of pointer to input data
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
ENTRY(sha1_x8_avx2)

        # save callee-saved clobbered registers to comply with C function ABI
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        # save rsp
        mov     %rsp, RSP_SAVE
        sub     $FRAMESZ, %rsp

        # align rsp to 32 Bytes
        and     $~0x1F, %rsp

        ## Initialize digests
        vmovdqu 0*32(arg1), A
        vmovdqu 1*32(arg1), B
        vmovdqu 2*32(arg1), C
        vmovdqu 3*32(arg1), D
        vmovdqu 4*32(arg1), E

        ## transpose input onto stack
        mov     _data_ptr+0*8(arg1),inp0
        mov     _data_ptr+1*8(arg1),inp1
        mov     _data_ptr+2*8(arg1),inp2
        mov     _data_ptr+3*8(arg1),inp3
        mov     _data_ptr+4*8(arg1),inp4
        mov     _data_ptr+5*8(arg1),inp5
        mov     _data_ptr+6*8(arg1),inp6
        mov     _data_ptr+7*8(arg1),inp7

        xor     IDX, IDX
lloop:
        vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
        I=0
.rep 2
        VMOVPS  (inp0, IDX), T0
        VMOVPS  (inp1, IDX), T1
        VMOVPS  (inp2, IDX), T2
        VMOVPS  (inp3, IDX), T3
        VMOVPS  (inp4, IDX), T4
        VMOVPS  (inp5, IDX), T5
        VMOVPS  (inp6, IDX), T6
        VMOVPS  (inp7, IDX), T7

        TRANSPOSE8      T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
        vpshufb F, T0, T0
        vmovdqu T0, (I*8)*32(%rsp)
        vpshufb F, T1, T1
        vmovdqu T1, (I*8+1)*32(%rsp)
        vpshufb F, T2, T2
        vmovdqu T2, (I*8+2)*32(%rsp)
        vpshufb F, T3, T3
        vmovdqu T3, (I*8+3)*32(%rsp)
        vpshufb F, T4, T4
        vmovdqu T4, (I*8+4)*32(%rsp)
        vpshufb F, T5, T5
        vmovdqu T5, (I*8+5)*32(%rsp)
        vpshufb F, T6, T6
        vmovdqu T6, (I*8+6)*32(%rsp)
        vpshufb F, T7, T7
        vmovdqu T7, (I*8+7)*32(%rsp)
        add     $32, IDX
        I = (I+1)
.endr
        # save old digests
        vmovdqu A,AA
        vmovdqu B,BB
        vmovdqu C,CC
        vmovdqu D,DD
        vmovdqu E,EE

##
## perform 0-79 steps
##
        vmovdqu K00_19(%rip), K
## do rounds 0...15
        I = 0
.rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 16...19
        vmovdqu ((16 - 16) & 15) * 32(%rsp), W16
        vmovdqu ((16 - 15) & 15) * 32(%rsp), W15
.rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 20...39
        vmovdqu K20_39(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 40...59
        vmovdqu K40_59(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
        I = (I+1)
.endr

## do rounds 60...79
        vmovdqu K60_79(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
        I = (I+1)
.endr

        vpaddd  AA,A,A
        vpaddd  BB,B,B
        vpaddd  CC,C,C
        vpaddd  DD,D,D
        vpaddd  EE,E,E

        sub     $1, arg2
        jne     lloop

        # write out digests
        vmovdqu A, 0*32(arg1)
        vmovdqu B, 1*32(arg1)
        vmovdqu C, 2*32(arg1)
        vmovdqu D, 3*32(arg1)
        vmovdqu E, 4*32(arg1)

        # update input pointers
        add     IDX, inp0
        add     IDX, inp1
        add     IDX, inp2
        add     IDX, inp3
        add     IDX, inp4
        add     IDX, inp5
        add     IDX, inp6
        add     IDX, inp7
        mov     inp0, _data_ptr  (arg1)
        mov     inp1, _data_ptr + 1*8(arg1)
        mov     inp2, _data_ptr + 2*8(arg1)
        mov     inp3, _data_ptr + 3*8(arg1)
        mov     inp4, _data_ptr + 4*8(arg1)
        mov     inp5, _data_ptr + 5*8(arg1)
        mov     inp6, _data_ptr + 6*8(arg1)
        mov     inp7, _data_ptr + 7*8(arg1)

        ################
        ## Postamble

        mov     RSP_SAVE, %rsp

        # restore callee-saved clobbered registers
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12

        ret
ENDPROC(sha1_x8_avx2)
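
# Round constants and shuffle mask: each K section below broadcasts the
# FIPS 180-1 constant for its round group to all 8 lanes (two .octa
# values = eight copies of the 32-bit constant), and
# PSHUFFLE_BYTE_FLIP_MASK is the vpshufb control that byte-swaps each
# 32-bit word, converting the little-endian input loads to SHA1's
# big-endian word order.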
.section .rodata.cst32.K00_19, "aM", @progbits, 32
.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999

.section .rodata.cst32.K20_39, "aM", @progbits, 32
.align 32
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1

.section .rodata.cst32.K40_59, "aM", @progbits, 32
.align 32
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC

.section .rodata.cst32.K60_79, "aM", @progbits, 32
.align 32
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203