# Implement fast SHA-256 with AVX1 instructions. (x86_64)
# - Redistributions of source code must retain the above
# - Redistributions in binary form must reproduce the above
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
# Add reg to mem using reg-mem add and store
        shld $(32-(\p1)), \p2, \p2
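        # (shld with the same register as source and destination shifts it left
        #  by 32-p1 while its own top bits re-enter at the bottom, i.e. a
        #  rotate left by 32-p1, which is the same as a rotate right by p1)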
SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
g = %r10d
_XFER_SIZE = 16
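# (assumed from the surrounding frame layout: the XFER stack slot holds the
#  four precomputed W[t]+K[t] dwords consumed by the next four rounds,
#  hence 16 bytes)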
h = g
g = f
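# (the a..h "rotation" between rounds is done by re-assigning these assembler
#  symbols to the next register, so no data is actually moved)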
        ## compute W[-16] + W[-7] 4 at a time
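        ## (X0..X3 hold the previous 16 schedule words, X0 = W[-16..-13] up to
        ##  X3 = W[-4..-1]; indices in the comments below are relative to the
        ##  word being produced. Each pass of this macro computes the next four
        ##  W values while four compression rounds on a..h are interleaved with
        ##  the vector work to hide latency.)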
        MY_ROR (25-11), y0 # y0 = e >> (25-11)
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        MY_ROR (22-13), y1 # y1 = a >> (22-13)
        xor e, y0 # y0 = e ^ (e >> (25-11))
        MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor a, y1 # y1 = a ^ (a >> (22-13))
        xor g, y2 # y2 = f^g
        vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and e, y2 # y2 = (f^g)&e
        MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
        xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor g, y2 # y2 = CH = ((f^g)&e)^g
        vpslld $(32-7), XTMP1, XTMP3
        vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7
        MY_ROR (25-11), y0 # y0 = e >> (25-11)
        xor e, y0 # y0 = e ^ (e >> (25-11))
        MY_ROR (22-13), y1 # y1 = a >> (22-13)
        xor a, y1 # y1 = a ^ (a >> (22-13))
        MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor g, y2 # y2 = f^g
        vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and e, y2 # y2 = (f^g)&e
        vpslld $(32-18), XTMP1, XTMP1
        xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor g, y2 # y2 = CH = ((f^g)&e)^g
        vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
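        # (AVX1 has no packed rotate, so each 32-bit rotate above is built from
        #  a vpslld/vpsrld pair; the following xor with XTMP4 = W[-15] >> 3,
        #  not among the lines shown here, completes s0 = sigma0(W[-15]))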
        vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
        vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
        MY_ROR (25-11), y0 # y0 = e >> (25-11)
        xor e, y0 # y0 = e ^ (e >> (25-11))
        MY_ROR (22-13), y1 # y1 = a >> (22-13)
        xor a, y1 # y1 = a ^ (a >> (22-13))
        MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
        xor g, y2 # y2 = f^g
        vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
        xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and e, y2 # y2 = (f^g)&e
        vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
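        # (with each W[-2] word duplicated into both dwords of a qword ({BBAA}),
        #  a 64-bit vpsrlq by 17 or 19 leaves the 32-bit rotate of that word in
        #  the low dword of each qword, which is what {xBxA} denotes)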
        MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor g, y2 # y2 = CH = ((f^g)&e)^g
        vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
        MY_ROR (25-11), y0 # y0 = e >> (25-11)
        MY_ROR (22-13), y1 # y1 = a >> (22-13)
        xor e, y0 # y0 = e ^ (e >> (25-11))
        MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
        xor a, y1 # y1 = a ^ (a >> (22-13))
        xor g, y2 # y2 = f^g
        vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
        xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and e, y2 # y2 = (f^g)&e
        MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
        xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor g, y2 # y2 = CH = ((f^g)&e)^g
        MY_ROR (25-11), y0 # y0 = e >> (25-11)
        xor e, y0 # y0 = e ^ (e >> (25-11))
        MY_ROR (22-13), y1 # y1 = a >> (22-13)
        xor a, y1 # y1 = a ^ (a >> (22-13))
        MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor g, y2 # y2 = f^g
        xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and e, y2 # y2 = (f^g)&e
        xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor g, y2 # y2 = CH = ((f^g)&e)^g
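#
# Per round, the scalar temporaries end up as y0 = SIGMA1(e) and y1 = SIGMA0(a)
# (each after one final rotate on lines not shown here) and y2 = Ch(e,f,g);
# the rest of the round adds y0 + y2 + (K[t]+W[t]) into h, adds h into d, and
# forms the new working value from h + y1 + Maj(a,b,c).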
        mov 4*6(CTX), g
        ## byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
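        ## (SHA-256 interprets the message as big-endian dwords, so the flip
        ##  mask reverses the bytes of each dword as the block is loaded)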
        ## schedule 48 input dwords, by doing 3 rounds of 16 each
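        ## (each pass adds the next 4*16 bytes of the round-constant table to
        ##  the current schedule words and runs 16 rounds; after three passes,
        ##  48 of the 64 rounds are done and no further scheduling is needed)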
.align 16
        vpaddd 1*16(TBL), X0, XFER
        vpaddd 2*16(TBL), X0, XFER
        vpaddd 3*16(TBL), X0, XFER
        add $4*16, TBL
        vpaddd 1*16(TBL), X1, XFER
        add $2*16, TBL
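        ## (final-16-round loop: the remaining W values already sit in X0..X3,
        ##  so each pass just adds two 16-byte groups of constants and runs
        ##  eight plain rounds; the loop body runs twice)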
        addm (4*6)(CTX),g
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
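# (these two pshufb masks move the valid sigma1 dwords out of the sparse
#  {xBxA}/{xDxC} layouts into the low (00BA) and high (DC00) halves so they
#  can be added to the partially-computed schedule words)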