Lines Matching full:w

61 /* we keep a window of 64 pre-calculated w[i]+K values in a circular buffer */
249 * RR does two rounds of SHA-1 back to back with W[] pre-calc
250 * t1 = F(b, c, d); e += w(i)
251 * e += t1; b <<= 30; d += w(i+1);
311 .set W, W0 define
319 .set W_minus_32, W
330 .set W_minus_04, W
331 .set W, W_minus_32 define
352 movdqa W_TMP1, W
363 * - calculating last 32 w[i] values in 8 XMM registers
364 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
367 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
374 movdqa W_minus_12, W
375 palignr $8, W_minus_16, W # w[i-14]
377 psrldq $4, W_TMP1 # w[i-3]
378 pxor W_minus_08, W
381 pxor W_TMP1, W
382 movdqa W, W_TMP2
383 movdqa W, W_TMP1
386 psrld $31, W
388 por W, W_TMP1
389 movdqa W_TMP2, W
391 pslld $2, W
393 pxor W, W_TMP1
395 movdqa W_TMP1, W
404 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
405 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
406 * this allows more efficient vectorization since the w[i]=>w[i-3] dependency is broken
411 pxor W_minus_28, W # W is W_minus_32 before xor
414 pxor W_minus_16, W
415 pxor W_TMP1, W
416 movdqa W, W_TMP1
418 psrld $30, W
420 por W, W_TMP1
422 movdqa W_TMP1, W
489 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
491 vpaddd (K_BASE), W, W_TMP1
500 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
501 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
502 vpxor W_minus_08, W, W
505 vpxor W_TMP1, W, W
506 vpslldq $12, W, W_TMP2
507 vpslld $1, W, W_TMP1
509 vpsrld $31, W, W
510 vpor W, W_TMP1, W_TMP1
511 vpslld $2, W_TMP2, W
514 vpxor W, W_TMP1, W_TMP1
515 vpxor W_TMP2, W_TMP1, W
516 vpaddd K_XMM(K_BASE), W, W_TMP1
525 vpxor W_minus_28, W, W # W is W_minus_32 before xor
528 vpxor W_TMP1, W, W
530 vpslld $2, W, W_TMP1
531 vpsrld $30, W, W
532 vpor W, W_TMP1, W
534 vpaddd K_XMM(K_BASE), W, W_TMP1