Lines Matching +full:- +full:2 +full:g
2 # Implement fast SHA-512 with AVX2 instructions. (x86_64)
14 # General Public License (GPL) Version 2, available from the file
22 # - Redistributions of source code must retain the above
26 # - Redistributions in binary form must reproduce the above
42 # This code is described in an Intel White-Paper:
43 # "Fast SHA-512 Implementations on Intel Architecture Processors"
74 # 2nd arg
90 g = %r10
117 # Add reg to mem using reg-mem add and store
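The comment above describes the addm helper as a reg-to-mem read-modify-write. A minimal C model of that effect, assuming nothing beyond the comment (the function name simply mirrors the macro name, and the 64-bit signature is illustrative):

    #include <stdint.h>

    /* Model of the helper described above: load the memory word, add the
     * register value, and store the sum back to the same memory word
     * (the "reg-mem add and store" the comment refers to). */
    static inline void addm(uint64_t *mem, uint64_t reg)
    {
        *mem += reg;
    }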
145 h = g
146 g = f
165 # Extract w[t-7]
166 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
167 # Calculate w[t-16] + w[t-7]
168 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
169 # Extract w[t-15]
170 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
174 # Calculate w[t-15] ror 1
176 vpsllq $(64-1), YTMP1, YTMP3
177 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
178 # Calculate w[t-15] shr 7
179 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
184 add frame_XFER(%rsp),h # h = k + w + h # --
190 xor g, y2 # y2 = f^g # CH
193 and e, y2 # y2 = (f^g)&e # CH
196 add h, d # d = k + w + h + d # --
202 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
207 add y0, y2 # y2 = S1 + CH # --
209 add y1, h # h = k + w + h + S0 # --
211 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
213 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
214 add y3, h # h = t1 + S0 + MAJ # --
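The round annotations above (S1, CH, S0, MAJ, t1) trace the standard FIPS 180-4 SHA-512 compression round: t1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t] and t2 = S0(a) + Maj(a,b,c), where the frame_XFER slot already holds the pre-added K[t] + W[t] (hence the single "h = k + w + h" memory add). A scalar C sketch of one round follows; the helper names are illustrative, not taken from the kernel sources:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* Big sigma, choose and majority functions from FIPS 180-4. */
    static inline uint64_t S0(uint64_t a) { return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39); }
    static inline uint64_t S1(uint64_t e) { return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41); }
    static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) ^ (~e & g); }
    static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* One SHA-512 round.  kw is the pre-added K[t] + W[t], matching the
     * frame_XFER slot the assembly reads with a single memory add. */
    static void sha512_round(uint64_t s[8], uint64_t kw)
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint64_t t1 = h + S1(e) + Ch(e, f, g) + kw;   /* "k + w + h + S1 + CH" */
        uint64_t t2 = S0(a) + Maj(a, b, c);           /* "S0 + MAJ"            */

        /* Rotate the working variables, as the h = g, g = f, ... symbol
         * reassignment does in the assembly. */
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

Note that the assembly evaluates the choose function as ((f ^ g) & e) ^ g, which is equivalent to the (e & f) ^ (~e & g) form above but needs one operation less per round.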
220 # Calculate w[t-15] ror 8
222 vpsllq $(64-8), YTMP1, YTMP1
223 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
225 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
229 # Add three components, w[t-16], w[t-7] and sigma0
230 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
232 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
234 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
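AVX2 has no 64-bit vector rotate, so the vpsrlq/vpsllq/vpor triples above build each rotate from two shifts and an OR, and the three terms are folded into the small sigma0 function of FIPS 180-4, applied to W[t-15]. A scalar sketch of the same computation (names illustrative):

    #include <stdint.h>

    /* Rotate right from two shifts and an OR, mirroring the shift pairs
     * above, e.g. ror 1 = (x >> 1) | (x << (64 - 1)). */
    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* sigma0, applied to W[t-15] in the message schedule:
     * (W[-15] ror 1) ^ (W[-15] ror 8) ^ (W[-15] >> 7). */
    static inline uint64_t sigma0(uint64_t x)
    {
        return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
    }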
239 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
240 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
246 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
253 xor g, y2 # y2 = f^g # CH
259 and e, y2 # y2 = (f^g)&e # CH
260 add h, d # d = k + w + h + d # --
266 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
271 add y0, y2 # y2 = S1 + CH # --
274 add y1, h # h = k + w + h + S0 # --
276 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
277 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
278 add y3, h # h = t1 + S0 + MAJ # --
283 ################################### RND N + 2 #########################################
285 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
286 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
287 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
288 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
289 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
290 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
291 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
292 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
293 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
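The same shift-pair trick computes small sigma1 of W[t-2] for the two lower lanes ({BABA}); the {DC--} sequence further down repeats it for the two upper lanes. A matching scalar sketch (names illustrative):

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* sigma1, applied to W[t-2] in the message schedule:
     * (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6), exactly the
     * combination annotated in the comments above. */
    static inline uint64_t sigma1(uint64_t x)
    {
        return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
    }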
299 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
303 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
308 xor g, y2 # y2 = f^g # CH
312 and e, y2 # y2 = (f^g)&e # CH
315 add h, d # d = k + w + h + d # --
320 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
328 add y0, y2 # y2 = S1 + CH # --
331 add y1, h # h = k + w + h + S0 # --
332 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
333 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
335 add y3, h # h = t1 + S0 + MAJ # --
341 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
342 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
343 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
344 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
345 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
346 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
347 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
348 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
349 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
351 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
353 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
356 vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
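With sigma0 and sigma1 in place, the blend above completes four new message words per iteration, following the standard SHA-512 schedule recurrence W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]). A plain scalar version of the expansion, self-contained and with illustrative names:

    #include <stdint.h>

    static inline uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    static inline uint64_t sigma0(uint64_t x) { return ror64(x, 1)  ^ ror64(x, 8)  ^ (x >> 7); }
    static inline uint64_t sigma1(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

    /* Expand the 16 input words of one 1024-bit block to the full 80-word
     * schedule.  The assembly keeps only a 16-word sliding window
     * (Y_0..Y_3) and produces four new words per scheduling iteration. */
    static void sha512_schedule(uint64_t W[80])
    {
        for (int t = 16; t < 80; t++)
            W[t] = W[t - 16] + sigma0(W[t - 15]) + W[t - 7] + sigma1(W[t - 2]);
    }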
361 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
368 xor g, y2 # y2 = f^g # CH
372 and e, y2 # y2 = (f^g)&e # CH
373 add h, d # d = k + w + h + d # --
377 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
380 add y0, y2 # y2 = S1 + CH # --
383 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
392 add y1, h # h = k + w + h + S0 # --
393 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
394 add y3, h # h = t1 + S0 + MAJ # --
408 xor g, y2 # y2 = f^g # CH
412 and e, y2 # y2 = (f^g)&e # CH
416 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
422 add frame_XFER(%rsp), h # h = k + w + h # --
429 add y0, y2 # y2 = S1 + CH # --
431 add h, d # d = k + w + h + d # --
433 add y1, h # h = k + w + h + S0 # --
435 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
441 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
445 xor g, y2 # y2 = f^g # CH
449 and e, y2 # y2 = (f^g)&e # CH
450 add y3, old_h # h = t1 + S0 + MAJ # --
454 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
460 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
467 add y0, y2 # y2 = S1 + CH # --
469 add h, d # d = k + w + h + d # --
471 add y1, h # h = k + w + h + S0 # --
473 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
477 ################################### RND N + 2 #########################################
479 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
483 xor g, y2 # y2 = f^g # CH
487 and e, y2 # y2 = (f^g)&e # CH
488 add y3, old_h # h = t1 + S0 + MAJ # --
492 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
498 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
505 add y0, y2 # y2 = S1 + CH # --
507 add h, d # d = k + w + h + d # --
509 add y1, h # h = k + w + h + S0 # --
511 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
517 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
521 xor g, y2 # y2 = f^g # CH
525 and e, y2 # y2 = (f^g)&e # CH
526 add y3, old_h # h = t1 + S0 + MAJ # --
530 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
536 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
543 add y0, y2 # y2 = S1 + CH # --
546 add h, d # d = k + w + h + d # --
548 add y1, h # h = k + w + h + S0 # --
550 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
552 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
554 add y3, h # h = t1 + S0 + MAJ # --
580 and $~(0x20 - 1), %rsp # align the stack pointer down to a 32-byte boundary
590 mov 8*2(CTX1), c # load the saved digest word into c
594 mov 8*6(CTX1), g # load the saved digest word into g
608 COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK # load 32 message bytes and byte-swap each qword
626 vpaddq 2*32(TBL), Y_0, XFER # XFER = four message words + their round constants
638 movq $2, frame_SRND(%rsp) # set the round-group loop counter to 2
645 add $(2*32), TBL # advance the round-constant pointer by two 32-byte groups
657 addm 8*2(CTX2), c # add c back into the digest word
661 addm 8*6(CTX2), g # add g back into the digest word
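After the rounds, addm folds the working variables back into the hash state, the usual feed-forward that makes each block's output the starting state for the next block. A minimal sketch (names illustrative):

    #include <stdint.h>

    /* Add the eight working variables a..h back into the eight-word hash
     * state, as the addm 8*i(CTX2), reg sequence does after each block. */
    static void sha512_feed_forward(uint64_t state[8], const uint64_t abcdefgh[8])
    {
        for (int i = 0; i < 8; i++)
            state[i] += abcdefgh[i];
    }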
689 # Mergeable 640-byte rodata section. This allows linker to merge the table
690 # with other, exactly the same 640-byte fragment of another rodata section
739 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
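SHA-512 message words are big-endian, so each qword of the input block is byte-swapped as it is loaded; the mask described above lets (v)pshufb perform that swap for a whole register at once. A scalar sketch of the per-qword swap (the kernel has its own byte-order helpers; this stand-alone version is only for illustration):

    #include <stdint.h>

    /* Byte-swap one 64-bit word: the scalar equivalent of what the
     * (v)pshufb byte-flip mask does to each qword of a loaded block. */
    static inline uint64_t bswap64(uint64_t x)
    {
        x = ((x & 0x00ff00ff00ff00ffULL) << 8)  | ((x >> 8)  & 0x00ff00ff00ff00ffULL);
        x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x >> 16) & 0x0000ffff0000ffffULL);
        return (x << 32) | (x >> 32);
    }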