/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

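# ROT8 and ROT16 are pshufb masks that rotate each 32-bit word of an SSE
# register left by 8 and 16 bits, respectively. CTRINC holds the 32-bit
# values 0, 1, 2, 3 and is added to the block counters so that four
# consecutive blocks can be processed in parallel.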
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 1 data block output, o
	# %rdx: 1 data block input, i

	# This function encrypts one ChaCha20 block by loading the state matrix
	# into four SSE registers. It performs matrix operations on four words
	# in parallel, but requires shuffling to rearrange the words after each
	# round. 8/16-bit word rotation is done with the slightly better
	# performing SSSE3 byte shuffling; 7/12-bit word rotation uses
	# traditional shift+OR.
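	#
	# For reference, a C-like sketch of one ChaCha20 quarter-round as
	# defined in RFC 7539; the code below applies it to whole SSE
	# registers, i.e. to four state words at a time:
	#
	#	a += b;  d ^= a;  d = rol32(d, 16);
	#	c += d;  b ^= c;  b = rol32(b, 12);
	#	a += b;  d ^= a;  d = rol32(d, 8);
	#	c += d;  b ^= c;  b = rol32(b, 7);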

	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

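	# ten double rounds, i.e. the 20 rounds specified by RFC 7539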
	mov	$10,%ecx

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

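	# Rotate rows x1..x3 so that the next four steps operate on the
	# diagonals of the state matrix.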
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

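	# Rotate the rows back to restore the original column order.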
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	dec		%ecx
	jnz		.Ldoubleround

	# o0 = i0 ^ (x0 + s0)
	movdqu		0x00(%rdx),%xmm4
	paddd		%xmm8,%xmm0
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu		0x10(%rdx),%xmm5
	paddd		%xmm9,%xmm1
	pxor		%xmm5,%xmm1
	movdqu		%xmm1,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu		0x20(%rdx),%xmm6
	paddd		%xmm10,%xmm2
	pxor		%xmm6,%xmm2
	movdqu		%xmm2,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu		0x30(%rdx),%xmm7
	paddd		%xmm11,%xmm3
	pxor		%xmm7,%xmm3
	movdqu		%xmm3,0x30(%rsi)

	ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: 4 data blocks output, o
	# %rdx: 4 data blocks input, i

	# This function encrypts four consecutive ChaCha20 blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32-bit and then
	# 64-bit words, which allows us to do the XOR in SSE registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# SSSE3 byte shuffling; 7/12-bit word rotation uses traditional
	# shift+OR.
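	#
	# Data layout: register/stack slot n holds state word n of all four
	# blocks, one block per 32-bit lane; x0..x3 live on the stack and
	# x4..x15 in %xmm4..%xmm15, while %xmm0..%xmm3 serve as scratch and
	# constant registers. Each SSE instruction therefore advances all
	# four blocks by one step.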

	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
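	# %r10 now holds the incoming %rsp + 8 and is used to restore the
	# stack pointer before ret; %rsp now points to a 64-byte-aligned
	# scratch area for the four state rows kept on the stack, so they
	# can be accessed with aligned movdqa.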

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

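	# ten double rounds, as in the single block function above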
	mov		$10,%ecx

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

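	# Second half of the double round: mix the diagonals. With one state
	# word per register there is nothing to shuffle; only the register
	# operands change.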
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	dec		%ecx
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

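	# Transpose each group of four rows (x0..3, x4..7, x8..11, x12..15):
	# after interleaving 32-bit and then 64-bit words, every row holds 16
	# contiguous bytes of a single output block instead of one word of
	# each block, so the keystream can be XORed with the input using
	# whole SSE registers.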
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
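	# The transpose leaves each group's rows in block order 0, 2, 1, 3,
	# and output blocks are 0x40 bytes apart, hence the interleaved
	# offsets below.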
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)
	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)
	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm4
	movdqu		%xmm4,0x10(%rsi)
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm5
	movdqu		%xmm5,0x90(%rsi)
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm6
	movdqu		%xmm6,0x50(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm7
	movdqu		%xmm7,0xd0(%rsi)
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm8
	movdqu		%xmm8,0x20(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm9
	movdqu		%xmm9,0xa0(%rsi)
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm10
	movdqu		%xmm10,0x60(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm11
	movdqu		%xmm11,0xe0(%rsi)
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm12
	movdqu		%xmm12,0x30(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm13
	movdqu		%xmm13,0xb0(%rsi)
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm14
	movdqu		%xmm14,0x70(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm15
	movdqu		%xmm15,0xf0(%rsi)

	lea		-8(%r10),%rsp
	ret
ENDPROC(chacha20_4block_xor_ssse3)