Lines Matching +full:4 +full:- +full:ch

2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
60 # Add reg to mem using reg-mem add and store
87 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
116 _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
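
The "h = k + w + h" adds in the rounds below read from a stack transfer area of this size: the round constant K[t] is folded into the message word W[t] with vpaddd just before each group of four rounds, so the scalar round code needs only a single add from memory. A conceptual C sketch of that precomputation (illustrative names; the real layout packs the two blocks' words per 4-round group rather than back to back):

    #include <stdint.h>

    enum { XFER_SIZE = 2 * 64 * 4 };   /* 2 blocks, 64 rounds, 4 bytes/round */

    /* Conceptual layout only: precompute K[t] + W[t] so the scalar rounds
     * can do a single "addl offset(%rsp, SRND), h" per round. */
    static void fill_xfer(uint32_t xfer[2][64],
                          const uint32_t w[2][64],  /* expanded message schedules */
                          const uint32_t k[64])     /* SHA-256 round constants */
    {
            for (int blk = 0; blk < 2; blk++)
                    for (int t = 0; t < 64; t++)
                            xfer[blk][t] = k[t] + w[blk][t];
    }
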
161 addl \disp(%rsp, SRND), h # h = k + w + h # --
163 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
164 mov f, y2 # y2 = f # CH
168 xor g, y2 # y2 = f^g # CH
169 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
172 and e, y2 # y2 = (f^g)&e # CH
175 add h, d # d = k + w + h + d # --
178 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
182 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
188 add y0, y2 # y2 = S1 + CH # --
189 vpslld $(32-7), XTMP1, XTMP3
191 add y1, h # h = k + w + h + S0 # --
193 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
194 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
197 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
198 add y3, h # h = t1 + S0 + MAJ # --
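
Read top to bottom, the scalar instructions above trace one SHA-256 round. The same update in plain C (a reference sketch of the round function, not the interleaved register allocation used here; rotr32 and sha256_round are illustrative names):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* One round, following the comment trail: h += k + w, then S1, CH, S0,
     * MAJ, d += t1, new h = t1 + S0 + MAJ, with the variables rotated. */
    static void sha256_round(uint32_t s[8], uint32_t k_plus_w)
    {
            uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

            uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
            uint32_t CH  = ((f ^ g) & e) ^ g;            /* y2 in the asm */
            uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
            uint32_t MAJ = (a & b) ^ (a & c) ^ (b & c);
            uint32_t t1  = h + k_plus_w + S1 + CH;       /* t1 = k + w + h + S1 + CH */

            s[7] = g; s[6] = f; s[5] = e;                /* rotate h, g, f   */
            s[4] = d + t1;                               /* e = d + t1       */
            s[3] = c; s[2] = b; s[1] = a;                /* rotate d, c, b   */
            s[0] = t1 + S0 + MAJ;                        /* a = t1 + S0 + MAJ */
    }

((f^g)&e)^g is the textbook CH(e,f,g) = (e&f)^(~e&g), rewritten to reuse the f^g intermediate.
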
208 offset = \disp + 1*4
209 addl offset(%rsp, SRND), h # h = k + w + h # --
213 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
214 mov f, y2 # y2 = f # CH
217 xor g, y2 # y2 = f^g # CH
223 and e, y2 # y2 = (f^g)&e # CH
224 add h, d # d = k + w + h + d # --
226 vpslld $(32-18), XTMP1, XTMP1
232 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
234 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
238 add y0, y2 # y2 = S1 + CH # --
241 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
243 add y1, h # h = k + w + h + S0 # --
245 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
246 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
247 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
248 add y3, h # h = t1 + S0 + MAJ # --
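
Interleaved with those rounds, the vector lines above assemble s0(W[-15]) from the ror-7, ror-18 and >>3 pieces (one xor step is not shown in this listing) and fold it into W[-16] + W[-7]. The scalar equivalent (sketch; the AVX2 code computes four schedule words per block at once):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* Partial schedule word: W[-16] + W[-7] + s0(W[-15]); s1 is added later. */
    static uint32_t sched_partial(uint32_t w16, uint32_t w15, uint32_t w7)
    {
            uint32_t s0 = rotr32(w15, 7) ^ rotr32(w15, 18) ^ (w15 >> 3);
            return w16 + w7 + s0;
    }
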
250 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
259 offset = \disp + 2*4
260 addl offset(%rsp, SRND), h # h = k + w + h # --
262 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
265 mov f, y2 # y2 = f # CH
266 xor g, y2 # y2 = f^g # CH
270 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
271 and e, y2 # y2 = (f^g)&e # CH
275 add h, d # d = k + w + h + d # --
281 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
291 add y0, y2 # y2 = S1 + CH # --
292 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
295 add y1, h # h = k + w + h + S0 # --
296 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
297 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
299 add y3, h # h = t1 + S0 + MAJ # --
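
The $19/$17 vpsrlq lines shift W[-2] words that were duplicated into 64-bit lanes, so the 64-bit shift yields a 32-bit rotate of the low word; that is also why s1 comes out only two words at a time (xBxA here, xDxC further down). Scalar form of the finished schedule word (sketch; sched_finish is an illustrative name):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned r)
    {
            return (x >> r) | (x << (32 - r));
    }

    /* Finish a schedule word: add s1(W[-2]) to the W[-16] + W[-7] + s0 partial. */
    static uint32_t sched_finish(uint32_t partial, uint32_t w2)
    {
            uint32_t s1 = rotr32(w2, 17) ^ rotr32(w2, 19) ^ (w2 >> 10);
            return partial + s1;
    }

Together with the partial sum above, this is the usual recurrence W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16].
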
309 offset = \disp + 3*4
310 addl offset(%rsp, SRND), h # h = k + w + h # --
314 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
315 mov f, y2 # y2 = f # CH
318 xor g, y2 # y2 = f^g # CH
321 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
323 and e, y2 # y2 = (f^g)&e # CH
324 add h, d # d = k + w + h + d # --
327 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
329 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
333 add y0, y2 # y2 = S1 + CH # --
337 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
348 add y1, h # h = k + w + h + S0 # --
349 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
350 add y3, h # h = t1 + S0 + MAJ # --
359 mov f, y2 # y2 = f # CH
362 xor g, y2 # y2 = f^g # CH
366 and e, y2 # y2 = (f^g)&e # CH
370 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
376 addl \disp(%rsp, SRND), h # h = k + w + h # --
383 add y0, y2 # y2 = S1 + CH # --
386 add h, d # d = k + w + h + d # --
388 add y1, h # h = k + w + h + S0 # --
389 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
395 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
396 mov f, y2 # y2 = f # CH
399 xor g, y2 # y2 = f^g # CH
403 and e, y2 # y2 = (f^g)&e # CH
404 add y3, old_h # h = t1 + S0 + MAJ # --
408 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
414 offset = 4*1 + \disp
415 addl offset(%rsp, SRND), h # h = k + w + h # --
422 add y0, y2 # y2 = S1 + CH # --
425 add h, d # d = k + w + h + d # --
427 add y1, h # h = k + w + h + S0 # --
429 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
435 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
436 mov f, y2 # y2 = f # CH
439 xor g, y2 # y2 = f^g # CH
443 and e, y2 # y2 = (f^g)&e # CH
444 add y3, old_h # h = t1 + S0 + MAJ # --
448 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
454 offset = 4*2 + \disp
455 addl offset(%rsp, SRND), h # h = k + w + h # --
462 add y0, y2 # y2 = S1 + CH # --
465 add h, d # d = k + w + h + d # --
467 add y1, h # h = k + w + h + S0 # --
469 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
475 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
476 mov f, y2 # y2 = f # CH
479 xor g, y2 # y2 = f^g # CH
483 and e, y2 # y2 = (f^g)&e # CH
484 add y3, old_h # h = t1 + S0 + MAJ # --
488 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
494 offset = 4*3 + \disp
495 addl offset(%rsp, SRND), h # h = k + w + h # --
502 add y0, y2 # y2 = S1 + CH # --
505 add h, d # d = k + w + h + d # --
507 add y1, h # h = k + w + h + S0 # --
509 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
512 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
514 add y3, h # h = t1 + S0 + MAJ # --
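
The rounds above (the DO_4ROUNDS block) skip the message-schedule work because the schedule runs 16 words ahead: by the end of the 48 scheduled rounds every word up to W[63] already exists, so the last 16 rounds only need the round update. The "old_h" adds defer the final two additions of a round into the next one to shorten the dependency chain. A sketch of the overall ordering (hypothetical helper names standing in for the two macros):

    #include <stdint.h>

    static void round_and_schedule(uint32_t state[8], uint32_t w[64], int t)
    {
            (void)state; (void)w; (void)t;   /* stand-in for FOUR_ROUNDS_AND_SCHED */
    }

    static void round_only(uint32_t state[8], const uint32_t w[64], int t)
    {
            (void)state; (void)w; (void)t;   /* stand-in for DO_4ROUNDS */
    }

    /* Round ordering per block (the real code interleaves two blocks). */
    static void compress_order(uint32_t state[8], uint32_t w[64])
    {
            int t;

            for (t = 0; t < 48; t++)
                    round_and_schedule(state, w, t); /* also produces W[t+16] */
            for (t = 48; t < 64; t++)
                    round_only(state, w, t);
    }
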
538 and $-32, %rsp # align rsp to a 32-byte boundary
542 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
550 mov 4*1(CTX), b
551 mov 4*2(CTX), c
552 mov 4*3(CTX), d
553 mov 4*4(CTX), e
554 mov 4*5(CTX), f
555 mov 4*6(CTX), g
556 mov 4*7(CTX), h
612 add $4*32, SRND # advance 16 rounds' worth (4 rounds = 32 bytes: 2 blocks x 4 words)
613 cmp $3*4*32, SRND # 48 rounds (all rounds that also schedule) done?
632 cmp $4*4*32, SRND # all 64 rounds done?
638 addm (4*0)(CTX),a
639 addm (4*1)(CTX),b
640 addm (4*2)(CTX),c
641 addm (4*3)(CTX),d
642 addm (4*4)(CTX),e
643 addm (4*5)(CTX),f
644 addm (4*6)(CTX),g
645 addm (4*7)(CTX),h
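
The addm lines are the block feed-forward: each working variable is added back into the corresponding 32-bit word of the hash state that CTX points to, using the reg-to-mem add-and-store helper described near the top. In C terms (sketch):

    #include <stdint.h>

    /* Feed-forward after one block: state[i] += working variable i,
     * which is what the eight addm (4*i)(CTX) lines perform. */
    static void feed_forward(uint32_t state[8], const uint32_t wv[8])
    {
            for (int i = 0; i < 8; i++)
                    state[i] += wv[i];
    }
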
657 cmp $4*4*32, SRND # all 64 rounds done?
664 addm (4*0)(CTX),a
665 addm (4*1)(CTX),b
666 addm (4*2)(CTX),c
667 addm (4*3)(CTX),d
668 addm (4*4)(CTX),e
669 addm (4*5)(CTX),f
670 addm (4*6)(CTX),g
671 addm (4*7)(CTX),h
693 mov (4*0)(CTX),a
694 mov (4*1)(CTX),b
695 mov (4*2)(CTX),c
696 mov (4*3)(CTX),d
697 mov (4*4)(CTX),e
698 mov (4*5)(CTX),f
699 mov (4*6)(CTX),g
700 mov (4*7)(CTX),h
763 # shuffle xBxA -> 00BA
769 # shuffle xDxC -> DC00
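
These two constants are the byte-shuffle masks loaded into SHUF_00BA and SHUF_DC00 above. s1 is produced two schedule words at a time with junk in the odd dword lanes; one mask compacts the xBxA result into the low half (00BA), the other places the xDxC result in the high half (DC00), so the four s1 values end up in lane order, ready to be added to the W[-16] + W[-7] + s0 partial sums. A scalar picture (sketch; index 0 is the lowest lane and the names are illustrative):

    #include <stdint.h>

    /* Sketch of the 00BA / DC00 placement.  Each s1 pass leaves its two
     * results in lanes 0 and 2 with junk ("x") in lanes 1 and 3; the
     * shuffles compact each pair into its half of the vector. */
    static void place_s1_halves(uint32_t s1_out[4],
                                const uint32_t xBxA[4],   /* {A, x, B, x} */
                                const uint32_t xDxC[4])   /* {C, x, D, x} */
    {
            const uint32_t ba00[4] = { xBxA[0], xBxA[2], 0, 0 };   /* 00BA */
            const uint32_t dc00[4] = { 0, 0, xDxC[0], xDxC[2] };   /* DC00 */

            for (int i = 0; i < 4; i++)
                    s1_out[i] = ba00[i] | dc00[i];   /* zero lanes make OR act like add */
    }
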