/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 *  Copyright(c) 2014 Intel Corporation.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of version 2 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  Contact Information:
 *      James Guilford <james.guilford@intel.com>
 *      Tim Chen <tim.c.chen@linux.intel.com>
 *
 *  BSD LICENSE
 *
 *  Copyright(c) 2014 Intel Corporation.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute oct SHA1 using AVX2
## outer calling routine takes care of save and restore of XMM registers

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
## Linux preserves:                       rdi rbp r8
##
## clobbers ymm0-15


# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps	$0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	$0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	$0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	$0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	$0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	$0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	$0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	$0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps	$0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	$0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	$0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	$0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	$0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	$0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	$0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	$0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	vperm2f128	$0x13, \r1, \r5, \r6  # h6...a6
	vperm2f128	$0x02, \r1, \r5, \r2  # h2...a2
	vperm2f128	$0x13, \r3, \r7, \r5  # h5...a5
	vperm2f128	$0x02, \r3, \r7, \r1  # h1...a1
	vperm2f128	$0x13, \r0, \r4, \r7  # h7...a7
	vperm2f128	$0x02, \r0, \r4, \r3  # h3...a3
	vperm2f128	$0x13, \t0, \t1, \r4  # h4...a4
	vperm2f128	$0x02, \t0, \t1, \r0  # h0...a0

.endm
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
	vpxor	\regD, \regC, \regF
	vpand	\regB, \regF, \regF
	vpxor	\regD, \regF, \regF
.endm
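
# MAGIC_F0 is the FIPS 180-1 "Ch" choice function: D ^ (B & (C ^ D))
# is an algebraic rewrite of (B & C) | (~B & D) that takes only three
# vector ops and avoids a NOT (B set -> pick C, B clear -> pick D).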

# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
	vpxor	\regC, \regD, \regF
	vpxor	\regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
	vpor	\regC, \regB, \regF
	vpand	\regC, \regB, \regT
	vpand	\regD, \regF, \regF
	vpor	\regT, \regF, \regF
.endm
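
# MAGIC_F2 is the majority function: (B | C) & D expands to
# (B & D) | (C & D), and OR-ing in (B & C) yields
# (B & C) | (B & D) | (C & D) in four ops instead of five.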

# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
	MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm

# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
	vpsrld	$(32-\imm), \reg, \tmp
	vpslld	$\imm, \reg, \reg
	vpor	\tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
	vpsrld	$(32-\imm), \src, \tmp
	vpslld	$\imm, \src, \reg
	vpor	\tmp, \reg, \reg
.endm
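
# Both macros rotate each 32-bit dword left by \imm (left shift,
# right shift by 32-\imm, OR the halves).  PROLD rotates \reg in
# place; PROLD_nd is the non-destructive form reading \src into \reg.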

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	vpaddd	\memW*32(%rsp), \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF, \regB, \regC, \regD, \regT
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm

.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	offset = ((\memW - 14) & 15) * 32
	vmovdqu	offset(%rsp), W14
	vpxor	W14, W16, W16
	offset = ((\memW - 8) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	offset = ((\memW - 3) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	vpsrld	$(32-1), W16, \regF
	vpslld	$1, W16, W16
	vpor	W16, \regF, \regF

	ROTATE_W

	offset = ((\memW - 0) & 15) * 32
	vmovdqu	\regF, offset(%rsp)
	vpaddd	\regF, \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF, \regB, \regC, \regD, \regT	## FUN = MAGIC_Fi(B,C,D)
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm
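
# SHA1_STEP_16_79 extends the message schedule on the fly:
#	W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1
# The live schedule is a ring of 16 slots x 32 bytes (one dword per
# lane) on the stack; "(\memW - n) & 15" is the ring index.  W16 and
# W15 cache W[t-16] and W[t-15] in registers, W14 is reloaded each
# step, and ROTATE_W (defined below) renames them for the next round.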

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE
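
# FRAMESZ (32*16 = 512 bytes) is exactly the schedule ring: 16 slots
# of 8 lanes x 4 bytes.  YMM_SAVE is zero because, per the note above,
# the outer calling routine saves and restores the vector registers.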

#define VMOVPS	vmovups

IDX  = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# Two naming schemes map onto the same ymm registers: the first
# column is used while loading/transposing the input, the second
# during the 80 rounds.
# ymm0	A
# ymm1	B
# ymm2	C
# ymm3	D
# ymm4	E
# ymm5	F	AA
# ymm6	T0	BB
# ymm7	T1	CC
# ymm8	T2	DD
# ymm9	T3	EE
# ymm10	T4	TMP
# ymm11	T5	FUN
# ymm12	T6	K
# ymm13	T7	W14
# ymm14	T8	W15
# ymm15	T9	W16

A   = %ymm0
B   = %ymm1
C   = %ymm2
D   = %ymm3
E   = %ymm4
F   = %ymm5
T0  = %ymm6
T1  = %ymm7
T2  = %ymm8
T3  = %ymm9
T4  = %ymm10
T5  = %ymm11
T6  = %ymm12
T7  = %ymm13
T8  = %ymm14
T9  = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

.macro ROTATE_ARGS
TMP_ = E
E = D
D = C
C = B
B = A
A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16 = W15
W15 = W14
W14 = TMP_
.endm
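
# ROTATE_ARGS and ROTATE_W rebind assembler symbols at assembly time,
# so the fully unrolled rounds permute A..E (and the W cache) by
# renaming rather than by register-to-register moves.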

# 8 streams x 5 32-bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void *args, unsigned int size)
# arg 1 : pointer to the arguments structure (layout defined in
#         sha1_mb_mgr_datastruct.S): the 8 lane digests, transposed
#         into 5 rows of 8 x 32-bit words starting at offset 0,
#         followed by the array[8] of input data pointers at _data_ptr
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
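# Roughly, in C terms (an illustrative sketch only -- the real layout
# and offsets come from sha1_mb_mgr_datastruct.S; the struct and type
# names below are hypothetical, not the kernel's):
#
#	struct args_x8 {			/* hypothetical */
#		uint32_t digest[5][8];		/* row i = word i of all 8 lanes */
#		/* ... remaining fields; input pointers at _data_ptr: */
#		const uint8_t *data_ptr[8];
#	};
#
#	void sha1_x8_avx2(struct args_x8 *args, uint32_t blocks)
#	{
#		do {
#			/* hash one 64-byte block of each stream */
#		} while (--blocks);
#		/* data_ptr[i] ends up advanced past the consumed data */
#	}
#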
ENTRY(sha1_x8_avx2)

	# save callee-saved clobbered registers to comply with C function ABI
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# save rsp
	mov	%rsp, RSP_SAVE
	sub	$FRAMESZ, %rsp

	# align rsp to 32 bytes
	and	$~0x1F, %rsp

	## Initialize digests
	vmovdqu	0*32(arg1), A
	vmovdqu	1*32(arg1), B
	vmovdqu	2*32(arg1), C
	vmovdqu	3*32(arg1), D
	vmovdqu	4*32(arg1), E

	## transpose input onto stack
	mov	_data_ptr+0*8(arg1), inp0
	mov	_data_ptr+1*8(arg1), inp1
	mov	_data_ptr+2*8(arg1), inp2
	mov	_data_ptr+3*8(arg1), inp3
	mov	_data_ptr+4*8(arg1), inp4
	mov	_data_ptr+5*8(arg1), inp5
	mov	_data_ptr+6*8(arg1), inp6
	mov	_data_ptr+7*8(arg1), inp7

	xor	IDX, IDX
lloop:
	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), F
	I = 0
.rep 2
	VMOVPS	(inp0, IDX), T0
	VMOVPS	(inp1, IDX), T1
	VMOVPS	(inp2, IDX), T2
	VMOVPS	(inp3, IDX), T3
	VMOVPS	(inp4, IDX), T4
	VMOVPS	(inp5, IDX), T5
	VMOVPS	(inp6, IDX), T6
	VMOVPS	(inp7, IDX), T7

	TRANSPOSE8	T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
	vpshufb	F, T0, T0
	vmovdqu	T0, (I*8)*32(%rsp)
	vpshufb	F, T1, T1
	vmovdqu	T1, (I*8+1)*32(%rsp)
	vpshufb	F, T2, T2
	vmovdqu	T2, (I*8+2)*32(%rsp)
	vpshufb	F, T3, T3
	vmovdqu	T3, (I*8+3)*32(%rsp)
	vpshufb	F, T4, T4
	vmovdqu	T4, (I*8+4)*32(%rsp)
	vpshufb	F, T5, T5
	vmovdqu	T5, (I*8+5)*32(%rsp)
	vpshufb	F, T6, T6
	vmovdqu	T6, (I*8+6)*32(%rsp)
	vpshufb	F, T7, T7
	vmovdqu	T7, (I*8+7)*32(%rsp)
	add	$32, IDX
	I = (I+1)
.endr
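
	# After the two iterations above the stack holds the whole
	# 16-word block for all 8 lanes: slot t (32 bytes) contains
	# dword W[t] of every lane, byte-swapped to big-endian by the
	# vpshufb with PSHUFFLE_BYTE_FLIP_MASK.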
	# save old digests
	vmovdqu	A, AA
	vmovdqu	B, BB
	vmovdqu	C, CC
	vmovdqu	D, DD
	vmovdqu	E, EE

##
## perform 0-79 steps
##
	vmovdqu	K00_19(%rip), K
## do rounds 0...15
	I = 0
.rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 16...19
	vmovdqu	((16 - 16) & 15) * 32 (%rsp), W16
	vmovdqu	((16 - 15) & 15) * 32 (%rsp), W15
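	# W16/W15 now hold W[0] and W[1], i.e. W[t-16] and W[t-15]
	# for the first scheduled round t = 16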
.rep 4
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 20...39
	vmovdqu	K20_39(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 40...59
	vmovdqu	K40_59(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 60...79
	vmovdqu	K60_79(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
	I = (I+1)
.endr

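	# add the saved digests back in: per-block chaining of the
	# working variables into the running hash state (FIPS 180-1)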
	vpaddd	AA, A, A
	vpaddd	BB, B, B
	vpaddd	CC, C, C
	vpaddd	DD, D, D
	vpaddd	EE, E, E

	sub	$1, arg2
	jne	lloop

	# write out digests
	vmovdqu	A, 0*32(arg1)
	vmovdqu	B, 1*32(arg1)
	vmovdqu	C, 2*32(arg1)
	vmovdqu	D, 3*32(arg1)
	vmovdqu	E, 4*32(arg1)

	# update input pointers
	add	IDX, inp0
	add	IDX, inp1
	add	IDX, inp2
	add	IDX, inp3
	add	IDX, inp4
	add	IDX, inp5
	add	IDX, inp6
	add	IDX, inp7
	mov	inp0, _data_ptr + 0*8(arg1)
	mov	inp1, _data_ptr + 1*8(arg1)
	mov	inp2, _data_ptr + 2*8(arg1)
	mov	inp3, _data_ptr + 3*8(arg1)
	mov	inp4, _data_ptr + 4*8(arg1)
	mov	inp5, _data_ptr + 5*8(arg1)
	mov	inp6, _data_ptr + 6*8(arg1)
	mov	inp7, _data_ptr + 7*8(arg1)

	################
	## Postamble

	mov	RSP_SAVE, %rsp

	# restore callee-saved clobbered registers
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	ret
ENDPROC(sha1_x8_avx2)

.section	.rodata.cst32.K00_19, "aM", @progbits, 32
.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999

.section	.rodata.cst32.K20_39, "aM", @progbits, 32
.align 32
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1

.section	.rodata.cst32.K40_59, "aM", @progbits, 32
.align 32
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC

.section	.rodata.cst32.K60_79, "aM", @progbits, 32
.align 32
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203