Lines Matching +full:- +full:2 +full:g
2 # Implement fast SHA-512 with AVX2 instructions. (x86_64)
14 # General Public License (GPL) Version 2, available from the file
22 # - Redistributions of source code must retain the above
26 # - Redistributions in binary form must reproduce the above
42 # This code is described in an Intel White-Paper:
43 # "Fast SHA-512 Implementations on Intel Architecture Processors"
74 # 2nd arg
90 g = %r10
117 # Add reg to mem using reg-mem add and store
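The comment above describes the addm helper as a reg-to-mem read-modify-write. A minimal C model of that effect, assuming nothing beyond the comment (the function name simply mirrors the macro name, and the 64-bit signature is illustrative):

    #include <stdint.h>

    /* Model of the helper described above: load the memory word, add the
     * register value, and store the sum back to the same memory word
     * (the "reg-mem add and store" the comment refers to). */
    static inline void addm(uint64_t *mem, uint64_t reg)
    {
        *mem += reg;
    }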
145 h = g
146 g = f
165 # Extract w[t-7]
166 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
167 # Calculate w[t-16] + w[t-7]
168 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
169 # Extract w[t-15]
170 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
174 # Calculate w[t-15] ror 1
176 vpsllq $(64-1), YTMP1, YTMP3
177 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
178 # Calculate w[t-15] shr 7
179 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
184 add frame_XFER(%rsp),h # h = k + w + h # --
190 xor g, y2 # y2 = f^g # CH
193 and e, y2 # y2 = (f^g)&e # CH
196 add h, d # d = k + w + h + d # --
202 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
207 add y0, y2 # y2 = S1 + CH # --
209 add y1, h # h = k + w + h + S0 # --
211 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
213 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
214 add y3, h # h = t1 + S0 + MAJ # --
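The round annotations above (S1, CH, S0, MAJ, t1) trace the standard FIPS 180-4 SHA-512 compression round: t1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t] and t2 = S0(a) + Maj(a,b,c), where the frame_XFER slot already holds the pre-added K[t] + W[t] (hence the single "h = k + w + h" memory add). A scalar C sketch of one round follows; the helper names are illustrative, not taken from the kernel sources:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* Big sigma, choose and majority functions from FIPS 180-4. */
    static inline uint64_t S0(uint64_t a) { return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39); }
    static inline uint64_t S1(uint64_t e) { return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41); }
    static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
    static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* One SHA-512 round.  kw is the pre-added K[t] + W[t], matching the
     * frame_XFER slot the assembly reads with a single memory add. */
    static void sha512_round(uint64_t s[8], uint64_t kw)
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint64_t t1 = h + S1(e) + Ch(e, f, g) + kw;   /* "k + w + h + S1 + CH" */
        uint64_t t2 = S0(a) + Maj(a, b, c);           /* "S0 + MAJ"            */

        /* Rotate the working variables, as the h = g, g = f, ... symbol
         * reassignment does in the assembly. */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

Note that the assembly evaluates the choose function as ((f ^ g) & e) ^ g, which is equivalent to the (e & f) ^ (~e & g) form above but needs one operation less per round.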
220 # Calculate w[t-15] ror 8
222 vpsllq $(64-8), YTMP1, YTMP1
223 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
225 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
229 # Add three components, w[t-16], w[t-7] and sigma0
230 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
232 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
234 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
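AVX2 has no 64-bit vector rotate, so the vpsrlq/vpsllq/vpor triples above build each rotate from two shifts and an OR, and the three terms are folded into the small sigma0 function of FIPS 180-4, applied to W[t-15]. A scalar sketch of the same computation (names illustrative):

    #include <stdint.h>

    /* Rotate right from two shifts and an OR, mirroring the shift pairs
     * above, e.g. ror 1 = (x >> 1) | (x << (64 - 1)). */
    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* sigma0, applied to W[t-15] in the message schedule:
     * (W[-15] ror 1) ^ (W[-15] ror 8) ^ (W[-15] >> 7). */
    static inline uint64_t sigma0(uint64_t x)
    {
        return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
    }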
239 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
240 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
246 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
253 xor g, y2 # y2 = f^g # CH
259 and e, y2 # y2 = (f^g)&e # CH
260 add h, d # d = k + w + h + d # --
266 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
271 add y0, y2 # y2 = S1 + CH # --
274 add y1, h # h = k + w + h + S0 # --
276 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
277 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
278 add y3, h # h = t1 + S0 + MAJ # --
283 ################################### RND N + 2 #########################################
285 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
286 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
287 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
288 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
289 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
290 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
291 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
292 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
293 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
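The same shift-pair trick computes small sigma1 of W[t-2] for the two lower lanes ({BABA}); the {DC--} sequence further down repeats it for the two upper lanes. A matching scalar sketch (names illustrative):

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* sigma1, applied to W[t-2] in the message schedule:
     * (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6), exactly the
     * combination annotated in the comments above. */
    static inline uint64_t sigma1(uint64_t x)
    {
        return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
    }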
299 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
303 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
308 xor g, y2 # y2 = f^g # CH
312 and e, y2 # y2 = (f^g)&e # CH
315 add h, d # d = k + w + h + d # --
320 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
328 add y0, y2 # y2 = S1 + CH # --
331 add y1, h # h = k + w + h + S0 # --
332 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
333 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
335 add y3, h # h = t1 + S0 + MAJ # --
341 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
342 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
343 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
344 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
345 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
346 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
347 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
348 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
349 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
351 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
353 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
356 vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
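With sigma0 and sigma1 in place, the blend above completes four new message words per iteration, following the standard SHA-512 schedule recurrence W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]). A plain scalar version of the expansion, self-contained and with illustrative names:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    static inline uint64_t sigma0(uint64_t x) { return ror64(x, 1)  ^ ror64(x, 8)  ^ (x >> 7); }
    static inline uint64_t sigma1(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

    /* Expand the 16 input words of one 1024-bit block to the full 80-word
     * schedule.  The assembly keeps only a 16-word sliding window
     * (Y_0..Y_3) and produces four new words per scheduling iteration. */
    static void sha512_schedule(uint64_t W[80])
    {
        for (int t = 16; t < 80; t++)
            W[t] = W[t - 16] + sigma0(W[t - 15]) + W[t - 7] + sigma1(W[t - 2]);
    }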
361 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
368 xor g, y2 # y2 = f^g # CH
372 and e, y2 # y2 = (f^g)&e # CH
373 add h, d # d = k + w + h + d # --
377 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
380 add y0, y2 # y2 = S1 + CH # --
383 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
392 add y1, h # h = k + w + h + S0 # --
393 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
394 add y3, h # h = t1 + S0 + MAJ # --
408 xor g, y2 # y2 = f^g # CH
412 and e, y2 # y2 = (f^g)&e # CH
416 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
422 add frame_XFER(%rsp), h # h = k + w + h # --
429 add y0, y2 # y2 = S1 + CH # --
431 add h, d # d = k + w + h + d # --
433 add y1, h # h = k + w + h + S0 # --
435 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
441 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
445 xor g, y2 # y2 = f^g # CH
449 and e, y2 # y2 = (f^g)&e # CH
450 add y3, old_h # h = t1 + S0 + MAJ # --
454 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
460 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
467 add y0, y2 # y2 = S1 + CH # --
469 add h, d # d = k + w + h + d # --
471 add y1, h # h = k + w + h + S0 # --
473 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
477 ################################### RND N + 2 #########################################
479 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
483 xor g, y2 # y2 = f^g # CH
487 and e, y2 # y2 = (f^g)&e # CH
488 add y3, old_h # h = t1 + S0 + MAJ # --
492 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
498 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
505 add y0, y2 # y2 = S1 + CH # --
507 add h, d # d = k + w + h + d # --
509 add y1, h # h = k + w + h + S0 # --
511 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
517 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
521 xor g, y2 # y2 = f^g # CH
525 and e, y2 # y2 = (f^g)&e # CH
526 add y3, old_h # h = t1 + S0 + MAJ # --
530 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
536 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
543 add y0, y2 # y2 = S1 + CH # --
546 add h, d # d = k + w + h + d # --
548 add y1, h # h = k + w + h + S0 # --
550 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
552 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
554 add y3, h # h = t1 + S0 + MAJ # --
580 and $~(0x20 - 1), %rsp # align the stack pointer down to a 32-byte boundary
590 mov 8*2(CTX1), c # load the saved digest word into c
594 mov 8*6(CTX1), g # load the saved digest word into g
608 COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK # load 32 message bytes and byte-swap each qword
626 vpaddq 2*32(TBL), Y_0, XFER # XFER = four message words + their round constants
638 movq $2, frame_SRND(%rsp) # set the round-group loop counter to 2
645 add $(2*32), TBL # advance the round-constant pointer by two 32-byte groups
657 addm 8*2(CTX2), c # add c back into the digest word
661 addm 8*6(CTX2), g # add g back into the digest word
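After the rounds, addm folds the working variables back into the hash state, the usual feed-forward that makes each block's output the starting state for the next block. A minimal sketch (names illustrative):

    #include <stdint.h>

    /* Add the eight working variables a..h back into the eight-word hash
     * state, as the addm 8*i(CTX2), reg sequence does after each block. */
    static void sha512_feed_forward(uint64_t state[8], const uint64_t abcdefgh[8])
    {
        for (int i = 0; i < 8; i++)
            state[i] += abcdefgh[i];
    }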
689 # Mergeable 640-byte rodata section. This allows linker to merge the table
690 # with other, exactly the same 640-byte fragment of another rodata section
739 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
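SHA-512 message words are big-endian, so each qword of the input block is byte-swapped as it is loaded; the mask described above lets (v)pshufb perform that swap for a whole register at once. A scalar sketch of the per-qword swap (the kernel has its own byte-order helpers; this stand-alone version is only for illustration):

    #include <stdint.h>

    /* Byte-swap one 64-bit word: the scalar equivalent of what the
     * (v)pshufb byte-flip mask does to each qword of a loaded block. */
    static inline uint64_t bswap64(uint64_t x)
    {
        x = ((x & 0x00ff00ff00ff00ffULL) << 8)  | ((x >> 8)  & 0x00ff00ff00ff00ffULL);
        x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x >> 16) & 0x0000ffff0000ffffULL);
        return (x << 32) | (x >> 32);
    }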